xref: /sqlite-3.40.0/src/pager.c (revision 85b623f2)
1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This is the implementation of the page cache subsystem or "pager".
13 **
14 ** The pager is used to access a database disk file.  It implements
15 ** atomic commit and rollback through the use of a journal file that
16 ** is separate from the database file.  The pager also implements file
17 ** locking to prevent two processes from writing the same database
18 ** file simultaneously, or one process from reading the database while
19 ** another is writing.
20 **
21 ** @(#) $Id: pager.c,v 1.400 2007/12/13 21:54:11 drh Exp $
22 */
23 #ifndef SQLITE_OMIT_DISKIO
24 #include "sqliteInt.h"
25 #include <assert.h>
26 #include <string.h>
27 
28 /*
29 ** Macros for troubleshooting.  Normally turned off
30 */
#if 0
/* Tracing enabled: sqlite3DebugPrintf is mapped onto printf and each
** PAGERTRACEn() macro (n = number of arguments, format string included)
** forwards its arguments to it.  Change the "#if 0" above to turn this on. */
#define sqlite3DebugPrintf printf
#define PAGERTRACE1(X)       sqlite3DebugPrintf(X)
#define PAGERTRACE2(X,Y)     sqlite3DebugPrintf(X,Y)
#define PAGERTRACE3(X,Y,Z)   sqlite3DebugPrintf(X,Y,Z)
#define PAGERTRACE4(X,Y,Z,W) sqlite3DebugPrintf(X,Y,Z,W)
#define PAGERTRACE5(X,Y,Z,W,V) sqlite3DebugPrintf(X,Y,Z,W,V)
#else
/* Tracing disabled (the default): every PAGERTRACEn() expands to nothing,
** so its arguments are never evaluated. */
#define PAGERTRACE1(X)
#define PAGERTRACE2(X,Y)
#define PAGERTRACE3(X,Y,Z)
#define PAGERTRACE4(X,Y,Z,W)
#define PAGERTRACE5(X,Y,Z,W,V)
#endif
45 
46 /*
47 ** The following two macros are used within the PAGERTRACEX() macros above
48 ** to print out file-descriptors.
49 **
50 ** PAGERID() takes a pointer to a Pager struct as its argument. The
51 ** associated file-descriptor is returned. FILEHANDLEID() takes an sqlite3_file
52 ** struct as its argument.
53 */
/* The macro arguments are parenthesized so that an argument expression
** such as &pager or (cond ? a : b) expands with the intended precedence.
** Both results are pointer values cast to int for use in trace output. */
#define PAGERID(p) ((int)((p)->fd))
#define FILEHANDLEID(fd) ((int)(fd))
56 
57 /*
58 ** The page cache as a whole is always in one of the following
59 ** states:
60 **
61 **   PAGER_UNLOCK        The page cache is not currently reading or
62 **                       writing the database file.  There is no
63 **                       data held in memory.  This is the initial
64 **                       state.
65 **
66 **   PAGER_SHARED        The page cache is reading the database.
67 **                       Writing is not permitted.  There can be
68 **                       multiple readers accessing the same database
69 **                       file at the same time.
70 **
71 **   PAGER_RESERVED      This process has reserved the database for writing
72 **                       but has not yet made any changes.  Only one process
73 **                       at a time can reserve the database.  The original
74 **                       database file has not been modified so other
75 **                       processes may still be reading the on-disk
76 **                       database file.
77 **
78 **   PAGER_EXCLUSIVE     The page cache is writing the database.
79 **                       Access is exclusive.  No other processes or
80 **                       threads can be reading or writing while one
81 **                       process is writing.
82 **
83 **   PAGER_SYNCED        The pager moves to this state from PAGER_EXCLUSIVE
84 **                       after all dirty pages have been written to the
85 **                       database file and the file has been synced to
86 **                       disk. All that remains to do is to remove or
87 **                       truncate the journal file and the transaction
88 **                       will be committed.
89 **
90 ** The page cache comes up in PAGER_UNLOCK.  The first time a
91 ** sqlite3PagerGet() occurs, the state transitions to PAGER_SHARED.
92 ** After all pages have been released using sqlite_page_unref(),
93 ** the state transitions back to PAGER_UNLOCK.  The first time
94 ** that sqlite3PagerWrite() is called, the state transitions to
95 ** PAGER_RESERVED.  (Note that sqlite3PagerWrite() can only be
96 ** called on an outstanding page which means that the pager must
97 ** be in PAGER_SHARED before it transitions to PAGER_RESERVED.)
98 ** PAGER_RESERVED means that there is an open rollback journal.
99 ** The transition to PAGER_EXCLUSIVE occurs before any changes
100 ** are made to the database file, though writes to the rollback
101 ** journal occurs with just PAGER_RESERVED.  After an sqlite3PagerRollback()
102 ** or sqlite3PagerCommitPhaseTwo(), the state can go back to PAGER_SHARED,
103 ** or it can stay at PAGER_EXCLUSIVE if we are in exclusive access mode.
104 */
#define PAGER_UNLOCK      0   /* initial state; no data held in memory */
#define PAGER_SHARED      1   /* same as SHARED_LOCK */
#define PAGER_RESERVED    2   /* same as RESERVED_LOCK */
#define PAGER_EXCLUSIVE   4   /* same as EXCLUSIVE_LOCK */
#define PAGER_SYNCED      5   /* db file written and synced; journal remains */
110 
111 /*
112 ** If the SQLITE_BUSY_RESERVED_LOCK macro is set to true at compile-time,
113 ** then failed attempts to get a reserved lock will invoke the busy callback.
114 ** This is off by default.  To see why, consider the following scenario:
115 **
116 ** Suppose thread A already has a shared lock and wants a reserved lock.
117 ** Thread B already has a reserved lock and wants an exclusive lock.  If
118 ** both threads are using their busy callbacks, it might be a long time
** before one of the threads gives up and allows the other to proceed.
120 ** But if the thread trying to get the reserved lock gives up quickly
121 ** (if it never invokes its busy callback) then the contention will be
122 ** resolved quickly.
123 */
/* Default to off unless the build has already defined the macro. */
#ifndef SQLITE_BUSY_RESERVED_LOCK
# define SQLITE_BUSY_RESERVED_LOCK 0
#endif
127 
128 /*
129 ** This macro rounds values up so that if the value is an address it
130 ** is guaranteed to be an address that is aligned to an 8-byte boundary.
131 */
/* Add 7, then clear the three low-order bits: rounds X up to the next
** multiple of 8 (values that are already multiples of 8 are unchanged). */
#define FORCE_ALIGNMENT(X)   ((7 + (X)) & ~7)
133 
/* Forward declaration: the LRU list structures below hold PgHdr pointers
** before struct PgHdr itself is defined. */
typedef struct PgHdr PgHdr;
135 
136 /*
137 ** Each pager stores all currently unreferenced pages in a list sorted
138 ** in least-recently-used (LRU) order (i.e. the first item on the list has
139 ** not been referenced in a long time, the last item has been recently
140 ** used). An instance of this structure is included as part of each
141 ** pager structure for this purpose (variable Pager.lru).
142 **
143 ** Additionally, if memory-management is enabled, all unreferenced pages
144 ** are stored in a global LRU list (global variable sqlite3LruPageList).
145 **
146 ** In both cases, the PagerLruList.pFirstSynced variable points to
147 ** the first page in the corresponding list that does not require an
148 ** fsync() operation before its memory can be reclaimed. If no such
149 ** page exists, PagerLruList.pFirstSynced is set to NULL.
150 */
151 typedef struct PagerLruList PagerLruList;
152 struct PagerLruList {
153   PgHdr *pFirst;         /* First page in LRU list */
154   PgHdr *pLast;          /* Last page in LRU list (the most recently used) */
155   PgHdr *pFirstSynced;   /* First page in list with PgHdr.needSync==0 */
156 };
157 
158 /*
159 ** The following structure contains the next and previous pointers used
160 ** to link a PgHdr structure into a PagerLruList linked list.
161 */
162 typedef struct PagerLruLink PagerLruLink;
163 struct PagerLruLink {
164   PgHdr *pNext;
165   PgHdr *pPrev;
166 };
167 
168 /*
169 ** Each in-memory image of a page begins with the following header.
170 ** This header is only visible to this pager module.  The client
171 ** code that calls pager sees only the data that follows the header.
172 **
173 ** Client code should call sqlite3PagerWrite() on a page prior to making
174 ** any modifications to that page.  The first time sqlite3PagerWrite()
175 ** is called, the original page contents are written into the rollback
176 ** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
177 ** the journal page has made it onto the disk surface, PgHdr.needSync
178 ** is cleared.  The modified page cannot be written back into the original
** database file until the journal pages have been synced to disk and the
180 ** PgHdr.needSync has been cleared.
181 **
182 ** The PgHdr.dirty flag is set when sqlite3PagerWrite() is called and
183 ** is cleared again when the page content is written back to the original
184 ** database file.
185 **
186 ** Details of important structure elements:
187 **
188 ** needSync
189 **
190 **     If this is true, this means that it is not safe to write the page
191 **     content to the database because the original content needed
**     for rollback has not been synced to the main rollback journal.
193 **     The original content may have been written to the rollback journal
194 **     but it has not yet been synced.  So we cannot write to the database
195 **     file because power failure might cause the page in the journal file
196 **     to never reach the disk.  It is as if the write to the journal file
197 **     does not occur until the journal file is synced.
198 **
199 **     This flag is false if the page content exactly matches what
200 **     currently exists in the database file.  The needSync flag is also
201 **     false if the original content has been written to the main rollback
202 **     journal and synced.  If the page represents a new page that has
203 **     been added onto the end of the database during the current
204 **     transaction, the needSync flag is true until the original database
205 **     size in the journal header has been synced to disk.
206 **
207 ** inJournal
208 **
209 **     This is true if the original page has been written into the main
210 **     rollback journal.  This is always false for new pages added to
211 **     the end of the database file during the current transaction.
212 **     And this flag says nothing about whether or not the journal
213 **     has been synced to disk.  For pages that are in the original
214 **     database file, the following expression should always be true:
215 **
**       inJournal = (pPager->aInJournal[(pgno-1)/8] & (1<<((pgno-1)%8)))!=0
217 **
218 **     The pPager->aInJournal[] array is only valid for the original
219 **     pages of the database, not new pages that are added to the end
220 **     of the database, so obviously the above expression cannot be
221 **     valid for new pages.  For new pages inJournal is always 0.
222 **
223 ** dirty
224 **
225 **     When true, this means that the content of the page has been
226 **     modified and needs to be written back to the database file.
227 **     If false, it means that either the content of the page is
228 **     unchanged or else the content is unimportant and we do not
229 **     care whether or not it is preserved.
230 **
231 ** alwaysRollback
232 **
233 **     This means that the sqlite3PagerDontRollback() API should be
234 **     ignored for this page.  The DontRollback() API attempts to say
235 **     that the content of the page on disk is unimportant (it is an
236 **     unused page on the freelist) so that it is unnecessary to
237 **     rollback changes to this page because the content of the page
238 **     can change without changing the meaning of the database.  This
239 **     flag overrides any DontRollback() attempt.  This flag is set
240 **     when a page that originally contained valid data is added to
241 **     the freelist.  Later in the same transaction, this page might
242 **     be pulled from the freelist and reused for something different
243 **     and at that point the DontRollback() API will be called because
244 **     pages taken from the freelist do not need to be protected by
245 **     the rollback journal.  But this flag says that the page was
246 **     not originally part of the freelist so that it still needs to
247 **     be rolled back in spite of any subsequent DontRollback() calls.
248 **
249 ** needRead
250 **
251 **     This flag means (when true) that the content of the page has
252 **     not yet been loaded from disk.  The in-memory content is just
253 **     garbage.  (Actually, we zero the content, but you should not
254 **     make any assumptions about the content nevertheless.)  If the
255 **     content is needed in the future, it should be read from the
256 **     original database file.
257 */
/* Header placed in front of every in-memory page.  The flag fields are
** described in detail in the comment preceding this structure. */
struct PgHdr {
  Pager *pPager;                 /* The pager to which this page belongs */
  Pgno pgno;                     /* Page number; pages with pgno==0 are
                                 ** not entered in the pager hash table */
  PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
  PagerLruLink free;             /* Links for the pager's own LRU list
                                 ** (Pager.lru) */
  PgHdr *pNextAll;               /* Next page on the Pager.pAll list */
  u8 inJournal;                  /* TRUE if has been written to journal */
  u8 dirty;                      /* TRUE if we need to write back changes */
  u8 needSync;                   /* Sync journal before writing this page */
  u8 alwaysRollback;             /* Disable DontRollback() for this page */
  u8 needRead;                   /* Read content if PagerWrite() is called */
  short int nRef;                /* Number of users of this page */
  PgHdr *pDirty, *pPrevDirty;    /* Next/previous page on the dirty list */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  PagerLruLink gfree;            /* Links for the global list of nRef==0
                                 ** pages (sqlite3LruPageList) */
#endif
#ifdef SQLITE_CHECK_PAGES
  u32 pageHash;                  /* NOTE(review): presumably a hash of the
                                 ** page content used for checking - confirm */
#endif
  void *pData;                   /* Page data */
  /* Pager.nExtra bytes of local data appended to this header */
};
280 
281 /*
282 ** For an in-memory only database, some extra information is recorded about
283 ** each page so that changes can be rolled back.  (Journal files are not
284 ** used for in-memory databases.)  The following information is added to
285 ** the end of every EXTRA block for in-memory databases.
286 **
287 ** This information could have been added directly to the PgHdr structure.
288 ** But then it would take up an extra 8 bytes of storage on every PgHdr
289 ** even for disk-based databases.  Splitting it out saves 8 bytes.  This
290 ** is only a savings of 0.8% but those percentages add up.
291 */
292 typedef struct PgHistory PgHistory;
293 struct PgHistory {
294   u8 *pOrig;     /* Original page text.  Restore to this on a full rollback */
295   u8 *pStmt;     /* Text as it was at the beginning of the current statement */
296   PgHdr *pNextStmt, *pPrevStmt;  /* List of pages in the statement journal */
297   u8 inStmt;                     /* TRUE if in the statement subjournal */
298 };
299 
300 /*
301 ** A macro used for invoking the codec if there is one
302 */
/*
** CODEC1(P,D,N,X) runs the codec of pager P, if one is installed, on data D
** for page N with mode X and discards the result.  CODEC2() is the
** expression form: it yields the codec's return value, or D itself when no
** codec is installed, cast to char*.
**
** Every macro argument is parenthesized so that argument expressions such
** as "&pager" or "buf+4" expand with the intended precedence.  In
** particular CODEC2() casts the whole of D, not just its first operand.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC1(P,D,N,X) \
    if( (P)->xCodec!=0 ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
# define CODEC2(P,D,N,X) \
    ((char*)((P)->xCodec!=0?(P)->xCodec((P)->pCodecArg,(D),(N),(X)):(D)))
#else
# define CODEC1(P,D,N,X) /* NO-OP */
# define CODEC2(P,D,N,X) ((char*)(D))
#endif
310 
311 /*
312 ** Convert a pointer to a PgHdr into a pointer to its data
313 ** and back again.
314 */
/* DATA is reached through the header's pData pointer.  EXTRA is the memory
** that starts immediately after the header.  HIST lies (PGR)->nExtra bytes
** past the start of EXTRA. */
#define PGHDR_TO_DATA(P)    ((P)->pData)
#define PGHDR_TO_EXTRA(G,P) ((void*)((G)+1))
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)((char*)((P)+1) + (PGR)->nExtra))
319 
320 /*
** An open page cache is an instance of the following structure.
322 **
** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT, or
** SQLITE_FULL. Once an error other than SQLITE_FULL occurs, it persists
325 ** and is returned as the result of every major pager API call.  The
326 ** SQLITE_FULL return code is slightly different. It persists only until the
327 ** next successful rollback is performed on the pager cache. Also,
328 ** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup()
329 ** APIs, they may still be used successfully.
330 */
struct Pager {
  sqlite3_vfs *pVfs;          /* OS functions to use for IO */
  u8 journalOpen;             /* True if the journal file descriptor is valid */
  u8 journalStarted;          /* True if header of journal is synced */
  u8 useJournal;              /* Use a rollback journal on this file */
  u8 noReadlock;              /* Do not bother to obtain readlocks */
  u8 stmtOpen;                /* True if the statement subjournal is open */
  u8 stmtInUse;               /* True if we are in a statement subtransaction */
  u8 stmtAutoopen;            /* Open stmt journal when main journal is opened*/
  u8 noSync;                  /* Do not sync the journal if true */
  u8 fullSync;                /* Do extra syncs of the journal for robustness */
  u8 sync_flags;              /* One of SYNC_NORMAL or SYNC_FULL */
  u8 state;                   /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
  u8 tempFile;                /* zFilename is a temporary file */
  u8 readOnly;                /* True for a read-only database */
  u8 needSync;                /* True if an fsync() is needed on the journal */
  u8 dirtyCache;              /* True if cached pages have changed */
  u8 alwaysRollback;          /* Disable DontRollback() for all pages */
  u8 memDb;                   /* True to inhibit all file I/O */
  u8 setMaster;               /* True if a m-j name has been written to jrnl */
  u8 doNotSync;               /* Boolean. While true, do not spill the cache */
  u8 exclusiveMode;           /* Boolean. True if locking_mode==EXCLUSIVE */
  u8 changeCountDone;         /* Set after incrementing the change-counter */
  u32 vfsFlags;               /* Flags for sqlite3_vfs.xOpen() */
  int errCode;                /* One of several kinds of errors */
  int dbSize;                 /* Number of pages in the file */
  int origDbSize;             /* dbSize before the current change */
  int stmtSize;               /* Size of database (in pages) at stmt_begin() */
  int nRec;                   /* Number of pages written to the journal */
  u32 cksumInit;              /* Quasi-random value added to every checksum */
  int stmtNRec;               /* Number of records in stmt subjournal */
  int nExtra;                 /* Add this many bytes to each in-memory page */
  int pageSize;               /* Number of bytes in a page */
  int nPage;                  /* Total number of in-memory pages */
  int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
  int mxPage;                 /* Maximum number of pages to hold in cache */
  Pgno mxPgno;                /* Maximum allowed size of the database */
  u8 *aInJournal;             /* One bit for each page in the database file */
  u8 *aInStmt;                /* One bit for each page in the database */
  char *zFilename;            /* Name of the database file */
  char *zJournal;             /* Name of the journal file */
  char *zDirectory;           /* Directory holding database and journal files */
  char *zStmtJrnl;            /* Name of the statement journal file */
  sqlite3_file *fd, *jfd;     /* File descriptors for database and journal */
  sqlite3_file *stfd;         /* File descriptor for the statement subjournal*/
  BusyHandler *pBusyHandler;  /* Pointer to sqlite.busyHandler */
  PagerLruList lru;           /* LRU list of free pages */
  PgHdr *pAll;                /* List of all pages */
  PgHdr *pStmt;               /* List of pages in the statement subjournal */
  PgHdr *pDirty;              /* List of all dirty pages */
  i64 journalOff;             /* Current byte offset in the journal file */
  i64 journalHdr;             /* Byte offset to previous journal header */
  i64 stmtHdrOff;             /* First journal header written this statement */
  i64 stmtCksum;              /* cksumInit when statement was started */
  i64 stmtJSize;              /* Size of journal at stmt_begin() */
  int sectorSize;             /* Assumed sector size during rollback */
#ifdef SQLITE_TEST
  int nHit, nMiss;            /* Cache hits and misses */
  int nRead, nWrite;          /* Database pages read/written */
#endif
  void (*xDestructor)(DbPage*,int); /* Call this routine when freeing pages */
  void (*xReiniter)(DbPage*,int);   /* Call this routine when reloading pages */
#ifdef SQLITE_HAS_CODEC
  void *(*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
  void *pCodecArg;            /* First argument to xCodec() */
#endif
  int nHash;                  /* Size of the pager hash table */
  PgHdr **aHash;              /* Hash table to map page number to PgHdr */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  Pager *pNext;               /* Doubly linked list of pagers on which */
  Pager *pPrev;               /* sqlite3_release_memory() will work */
  int iInUseMM;               /* While non-zero, the outermost pagerEnter()
                              ** waits on SQLITE_MUTEX_STATIC_MEM2.
                              ** NOTE(review): presumably set while
                              ** sqlite3_release_memory() works on this
                              ** pager - confirm */
  int iInUseDB;               /* Incremented by pagerEnter(), decremented by
                              ** pagerLeave(); non-zero between the two */
#endif
  char *pTmpSpace;            /* Pager.pageSize bytes of space for tmp use */
  char dbFileVers[16];        /* Changes whenever database file changes */
};
408 
409 /*
410 ** The following global variables hold counters used for
411 ** testing purposes only.  These variables do not exist in
412 ** a non-testing build.  These variables are not thread-safe.
413 */
#ifdef SQLITE_TEST
int sqlite3_pager_readdb_count = 0;    /* Number of full pages read from DB */
int sqlite3_pager_writedb_count = 0;   /* Number of full pages written to DB */
int sqlite3_pager_writej_count = 0;    /* Number of pages written to journal */
int sqlite3_pager_pgfree_count = 0;    /* Number of cache pages freed */
/* Increment counter v.  The argument is parenthesized so that an lvalue
** expression such as *p increments the counter rather than the pointer. */
# define PAGER_INCR(v)  ((v)++)
#else
/* Non-testing build: the counters do not exist and PAGER_INCR() is a no-op. */
# define PAGER_INCR(v)
#endif
423 
424 /*
425 ** The following variable points to the head of a double-linked list
426 ** of all pagers that are eligible for page stealing by the
427 ** sqlite3_release_memory() interface.  Access to this list is
428 ** protected by the SQLITE_MUTEX_STATIC_MEM2 mutex.
429 */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
static Pager *sqlite3PagerList = 0;    /* Head of the list of eligible pagers */
/* Global LRU list of unreferenced pages.  lruListAdd(), lruListRemove() and
** lruListSetFirstSynced() touch it only while holding
** SQLITE_MUTEX_STATIC_LRU. */
static PagerLruList sqlite3LruPageList = {0, 0, 0};
#endif
434 
435 
436 /*
437 ** Journal files begin with the following magic string.  The data
438 ** was obtained from /dev/random.  It is used only as a sanity check.
439 **
440 ** Since version 2.8.0, the journal format contains additional sanity
** checking information.  If the power fails while the journal is being
442 ** written, semi-random garbage data might appear in the journal
443 ** file after power is restored.  If an attempt is then made
444 ** to roll the journal back, the database could be corrupted.  The additional
445 ** sanity checking data is an attempt to discover the garbage in the
446 ** journal and ignore it.
447 **
448 ** The sanity checking information for the new journal format consists
449 ** of a 32-bit checksum on each page of data.  The checksum covers both
450 ** the page number and the pPager->pageSize bytes of data for the page.
451 ** This cksum is initialized to a 32-bit random value that appears in the
452 ** journal file right after the header.  The random initializer is important,
453 ** because garbage data that appears at the end of a journal is likely
454 ** data that was once in other files that have now been deleted.  If the
455 ** garbage data came from an obsolete journal file, the checksums might
456 ** be correct.  But by initializing the checksum to random value which
457 ** is different for every journal, we minimize that risk.
458 */
/* Eight-byte signature found at the start of a journal file. */
static const unsigned char aJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd7
};
462 
463 /*
464 ** The size of the header and of each page in the journal is determined
465 ** by the following macros.
466 */
/* Bytes taken by one page record in the journal: the page itself plus 8
** bytes of overhead.  The argument is parenthesized so that expressions
** such as &pager expand correctly. */
#define JOURNAL_PG_SZ(pPager)  (((pPager)->pageSize) + 8)
468 
469 /*
470 ** The journal header size for this pager. In the future, this could be
471 ** set to some value read from the disk controller. The important
472 ** characteristic is that it is the same size as a disk sector.
473 */
/* Currently just the pager's sectorSize.  The argument is parenthesized so
** that expressions such as &pager expand correctly. */
#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)
475 
476 /*
477 ** The macro MEMDB is true if we are dealing with an in-memory database.
478 ** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,
479 ** the value of MEMDB will be a constant and the compiler will optimize
480 ** out code that would never execute.
481 */
#ifdef SQLITE_OMIT_MEMORYDB
# define MEMDB 0                /* In-memory databases are compiled out */
#else
# define MEMDB pPager->memDb    /* Needs a Pager* named pPager in scope */
#endif
487 
488 /*
489 ** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
490 ** reserved for working around a windows/posix incompatibility). It is
491 ** used in the journal to signify that the remainder of the journal file
492 ** is devoted to storing a master journal name - there are no more pages to
493 ** roll back. See comments for function writeMasterJournal() for details.
494 */
/* #define PAGER_MJ_PGNO(x) (PENDING_BYTE/((x)->pageSize)) */
/* Active definition: the quotient above plus one.  NOTE(review): with
** page numbers starting at 1 this is presumably the number of the page
** that contains byte offset PENDING_BYTE - confirm. */
#define PAGER_MJ_PGNO(x) ((PENDING_BYTE/((x)->pageSize))+1)
497 
498 /*
499 ** The maximum legal page number is (2^31 - 1).
500 */
#define PAGER_MAX_PGNO 2147483647   /* 0x7fffffff */
502 
503 /*
504 ** The pagerEnter() and pagerLeave() routines acquire and release
505 ** a mutex on each pager.  The mutex is recursive.
506 **
507 ** This is a special-purpose mutex.  It only provides mutual exclusion
508 ** between the Btree and the Memory Management sqlite3_release_memory()
509 ** function.  It does not prevent, for example, two Btrees from accessing
510 ** the same pager at the same time.  Other general-purpose mutexes in
511 ** the btree layer handle that chore.
512 */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  /* Mark pager p as in use.  Calls nest: each one increments p->iInUseDB
  ** and is undone by pagerLeave(). */
  static void pagerEnter(Pager *p){
    p->iInUseDB++;
    /* Only the outermost call (the count has just become 1) synchronizes,
    ** and only when p->iInUseMM is set. */
    if( p->iInUseMM && p->iInUseDB==1 ){
      sqlite3_mutex *mutex;
      mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
      /* Withdraw the in-use mark while acquiring the mutex and restore it
      ** once the mutex is held.  NOTE(review): presumably the mutex holder
      ** is sqlite3_release_memory(), which clears iInUseMM before letting
      ** go of the mutex - confirm against that routine. */
      p->iInUseDB = 0;
      sqlite3_mutex_enter(mutex);
      p->iInUseDB = 1;
      sqlite3_mutex_leave(mutex);
    }
    /* On return, iInUseMM must be clear. */
    assert( p->iInUseMM==0 );
  }
  /* Undo one pagerEnter() call. */
  static void pagerLeave(Pager *p){
    p->iInUseDB--;
    assert( p->iInUseDB>=0 );
  }
#else
  /* Without memory management both routines expand to nothing. */
# define pagerEnter(X)
# define pagerLeave(X)
#endif
534 
535 /*
536 ** Enable reference count tracking (for debugging) here:
537 */
#ifdef SQLITE_DEBUG
  int pager3_refinfo_enable = 0;
  /* When pager3_refinfo_enable is non-zero, print the page number, data
  ** address and reference counts for page p.  Otherwise do nothing. */
  static void pager_refinfo(PgHdr *p){
    static int nReport = 0;
    if( pager3_refinfo_enable ){
      sqlite3DebugPrintf(
         "REFCNT: %4d addr=%p nRef=%-3d total=%d\n",
         p->pgno, PGHDR_TO_DATA(p), p->nRef, p->pPager->nRef
      );
      nReport++;   /* Something to set a breakpoint on */
    }
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
553 
554 /*
555 ** Add page pPg to the end of the linked list managed by structure
556 ** pList (pPg becomes the last entry in the list - the most recently
557 ** used). Argument pLink should point to either pPg->free or pPg->gfree,
558 ** depending on whether pPg is being added to the pager-specific or
559 ** global LRU list.
560 */
561 static void listAdd(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
562   pLink->pNext = 0;
563   pLink->pPrev = pList->pLast;
564 
565 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
566   assert(pLink==&pPg->free || pLink==&pPg->gfree);
567   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
568 #endif
569 
570   if( pList->pLast ){
571     int iOff = (char *)pLink - (char *)pPg;
572     PagerLruLink *pLastLink = (PagerLruLink *)(&((u8 *)pList->pLast)[iOff]);
573     pLastLink->pNext = pPg;
574   }else{
575     assert(!pList->pFirst);
576     pList->pFirst = pPg;
577   }
578 
579   pList->pLast = pPg;
580   if( !pList->pFirstSynced && pPg->needSync==0 ){
581     pList->pFirstSynced = pPg;
582   }
583 }
584 
585 /*
586 ** Remove pPg from the list managed by the structure pointed to by pList.
587 **
588 ** Argument pLink should point to either pPg->free or pPg->gfree, depending
589 ** on whether pPg is being added to the pager-specific or global LRU list.
590 */
591 static void listRemove(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
592   int iOff = (char *)pLink - (char *)pPg;
593 
594 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
595   assert(pLink==&pPg->free || pLink==&pPg->gfree);
596   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
597 #endif
598 
599   if( pPg==pList->pFirst ){
600     pList->pFirst = pLink->pNext;
601   }
602   if( pPg==pList->pLast ){
603     pList->pLast = pLink->pPrev;
604   }
605   if( pLink->pPrev ){
606     PagerLruLink *pPrevLink = (PagerLruLink *)(&((u8 *)pLink->pPrev)[iOff]);
607     pPrevLink->pNext = pLink->pNext;
608   }
609   if( pLink->pNext ){
610     PagerLruLink *pNextLink = (PagerLruLink *)(&((u8 *)pLink->pNext)[iOff]);
611     pNextLink->pPrev = pLink->pPrev;
612   }
613   if( pPg==pList->pFirstSynced ){
614     PgHdr *p = pLink->pNext;
615     while( p && p->needSync ){
616       PagerLruLink *pL = (PagerLruLink *)(&((u8 *)p)[iOff]);
617       p = pL->pNext;
618     }
619     pList->pFirstSynced = p;
620   }
621 
622   pLink->pNext = pLink->pPrev = 0;
623 }
624 
625 /*
626 ** Add page pPg to the list of free pages for the pager. If
627 ** memory-management is enabled, also add the page to the global
628 ** list of free pages.
629 */
630 static void lruListAdd(PgHdr *pPg){
631   listAdd(&pPg->pPager->lru, &pPg->free, pPg);
632 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
633   if( !pPg->pPager->memDb ){
634     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
635     listAdd(&sqlite3LruPageList, &pPg->gfree, pPg);
636     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
637   }
638 #endif
639 }
640 
641 /*
642 ** Remove page pPg from the list of free pages for the associated pager.
643 ** If memory-management is enabled, also remove pPg from the global list
644 ** of free pages.
645 */
646 static void lruListRemove(PgHdr *pPg){
647   listRemove(&pPg->pPager->lru, &pPg->free, pPg);
648 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
649   if( !pPg->pPager->memDb ){
650     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
651     listRemove(&sqlite3LruPageList, &pPg->gfree, pPg);
652     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
653   }
654 #endif
655 }
656 
657 /*
658 ** This function is called just after the needSync flag has been cleared
659 ** from all pages managed by pPager (usually because the journal file
660 ** has just been synced). It updates the pPager->lru.pFirstSynced variable
661 ** and, if memory-management is enabled, the sqlite3LruPageList.pFirstSynced
662 ** variable also.
663 */
664 static void lruListSetFirstSynced(Pager *pPager){
665   pPager->lru.pFirstSynced = pPager->lru.pFirst;
666 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
667   if( !pPager->memDb ){
668     PgHdr *p;
669     sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
670     for(p=sqlite3LruPageList.pFirst; p && p->needSync; p=p->gfree.pNext);
671     assert(p==pPager->lru.pFirstSynced || p==sqlite3LruPageList.pFirstSynced);
672     sqlite3LruPageList.pFirstSynced = p;
673     sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
674   }
675 #endif
676 }
677 
678 /*
679 ** Return true if page *pPg has already been written to the statement
680 ** journal (or statement snapshot has been created, if *pPg is part
681 ** of an in-memory database).
682 */
683 static int pageInStatement(PgHdr *pPg){
684   Pager *pPager = pPg->pPager;
685   if( MEMDB ){
686     return PGHDR_TO_HIST(pPg, pPager)->inStmt;
687   }else{
688     Pgno pgno = pPg->pgno;
689     u8 *a = pPager->aInStmt;
690     return (a && (int)pgno<=pPager->stmtSize && (a[pgno/8] & (1<<(pgno&7))));
691   }
692 }
693 
694 /*
695 ** Change the size of the pager hash table to N.  N must be a power
696 ** of two.
697 */
698 static void pager_resize_hash_table(Pager *pPager, int N){
699   PgHdr **aHash, *pPg;
700   assert( N>0 && (N&(N-1))==0 );
701   pagerLeave(pPager);
702   sqlite3MallocBenignFailure((int)pPager->aHash);
703   aHash = sqlite3MallocZero( sizeof(aHash[0])*N );
704   pagerEnter(pPager);
705   if( aHash==0 ){
706     /* Failure to rehash is not an error.  It is only a performance hit. */
707     return;
708   }
709   sqlite3_free(pPager->aHash);
710   pPager->nHash = N;
711   pPager->aHash = aHash;
712   for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
713     int h;
714     if( pPg->pgno==0 ){
715       assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
716       continue;
717     }
718     h = pPg->pgno & (N-1);
719     pPg->pNextHash = aHash[h];
720     if( aHash[h] ){
721       aHash[h]->pPrevHash = pPg;
722     }
723     aHash[h] = pPg;
724     pPg->pPrevHash = 0;
725   }
726 }
727 
728 /*
729 ** Read a 32-bit integer from the given file descriptor.  Store the integer
730 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
731 ** error code is something goes wrong.
732 **
733 ** All values are stored on disk as big-endian.
734 */
735 static int read32bits(sqlite3_file *fd, i64 offset, u32 *pRes){
736   unsigned char ac[4];
737   int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);
738   if( rc==SQLITE_OK ){
739     *pRes = sqlite3Get4byte(ac);
740   }
741   return rc;
742 }
743 
/*
** Write a 32-bit integer into a string buffer in big-endian byte order.
**
** Both arguments are parenthesized so that an expression argument such
** as "p+4" is cast as a whole rather than having the cast bind to "p"
** alone.
*/
#define put32bits(A,B)  sqlite3Put4byte((u8*)(A),(B))
748 
749 /*
750 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
751 ** on success or an error code is something goes wrong.
752 */
753 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){
754   char ac[4];
755   put32bits(ac, val);
756   return sqlite3OsWrite(fd, ac, 4, offset);
757 }
758 
759 /*
760 ** If file pFd is open, call sqlite3OsUnlock() on it.
761 */
762 static int osUnlock(sqlite3_file *pFd, int eLock){
763   if( !pFd->pMethods ){
764     return SQLITE_OK;
765   }
766   return sqlite3OsUnlock(pFd, eLock);
767 }
768 
/*
** Decide whether the atomic-write optimization can be used with this
** pager.  For an open database file it can be used if:
**
**  (a) the value returned by OsDeviceCharacteristics() has either the
**      SQLITE_IOCAP_ATOMIC bit or the bit matching the page size
**      (pageSize>>8) set, and
**  (b) the value returned by OsSectorSize() is less than or equal
**      to the page size.
**
** If the database file is not open (no pMethods) the checks are skipped
** and the optimization is treated as usable.
**
** Returns 0 if the optimization cannot be used.  Otherwise returns the
** size of a journal file holding rollback data for exactly one page:
** JOURNAL_HDR_SZ + JOURNAL_PG_SZ.
*/
#ifdef SQLITE_ENABLE_ATOMIC_WRITE
static int jrnlBufferSize(Pager *pPager){
  sqlite3_file *pFile = pPager->fd;

  /* The (nPage>>8) trick below relies on this encoding of the flags. */
  assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
  assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));

  if( pFile->pMethods ){
    int dc = sqlite3OsDeviceCharacteristics(pFile);  /* Device characteristics */
    int nSector = sqlite3OsSectorSize(pFile);        /* Sector size */
    int nPage = pPager->pageSize;                    /* Page size */
    int isAtomic = (dc & (SQLITE_IOCAP_ATOMIC|(nPage>>8)))!=0;

    if( !isAtomic || nSector>nPage ){
      return 0;
    }
  }
  return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
}
#endif
804 
805 /*
806 ** This function should be called when an error occurs within the pager
807 ** code. The first argument is a pointer to the pager structure, the
808 ** second the error-code about to be returned by a pager API function.
809 ** The value returned is a copy of the second argument to this function.
810 **
811 ** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL
812 ** the error becomes persistent. Until the persisten error is cleared,
813 ** subsequent API calls on this Pager will immediately return the same
814 ** error code.
815 **
816 ** A persistent error indicates that the contents of the pager-cache
817 ** cannot be trusted. This state can be cleared by completely discarding
818 ** the contents of the pager-cache. If a transaction was active when
819 ** the persistent error occured, then the rollback journal may need
820 ** to be replayed.
821 */
822 static void pager_unlock(Pager *pPager);
823 static int pager_error(Pager *pPager, int rc){
824   int rc2 = rc & 0xff;
825   assert(
826        pPager->errCode==SQLITE_FULL ||
827        pPager->errCode==SQLITE_OK ||
828        (pPager->errCode & 0xff)==SQLITE_IOERR
829   );
830   if(
831     rc2==SQLITE_FULL ||
832     rc2==SQLITE_IOERR ||
833     rc2==SQLITE_CORRUPT
834   ){
835     pPager->errCode = rc;
836     if( pPager->state==PAGER_UNLOCK && pPager->nRef==0 ){
837       /* If the pager is already unlocked, call pager_unlock() now to
838       ** clear the error state and ensure that the pager-cache is
839       ** completely empty.
840       */
841       pager_unlock(pPager);
842     }
843   }
844   return rc;
845 }
846 
847 /*
848 ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking
849 ** on the cache using a hash function.  This is used for testing
850 ** and debugging only.
851 */
852 #ifdef SQLITE_CHECK_PAGES
/*
** Return a 32-bit hash of the nByte bytes starting at pData.  The hash
** starts at zero and, for each byte in order, is multiplied by 1039 and
** has the byte value added (arithmetic wraps modulo 2^32).  A
** non-positive nByte yields 0.
*/
static u32 pager_datahash(int nByte, unsigned char *pData){
  u32 h = 0;
  const unsigned char *z = pData;
  int nLeft = nByte;

  while( nLeft>0 ){
    h = (h*1039) + *z;
    z++;
    nLeft--;
  }
  return h;
}
864 static u32 pager_pagehash(PgHdr *pPage){
865   return pager_datahash(pPage->pPager->pageSize,
866                         (unsigned char *)PGHDR_TO_DATA(pPage));
867 }
868 
869 /*
870 ** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
871 ** is defined, and NDEBUG is not defined, an assert() statement checks
872 ** that the page is either dirty or still matches the calculated page-hash.
873 */
874 #define CHECK_PAGE(x) checkPage(x)
875 static void checkPage(PgHdr *pPg){
876   Pager *pPager = pPg->pPager;
877   assert( !pPg->pageHash || pPager->errCode || MEMDB || pPg->dirty ||
878       pPg->pageHash==pager_pagehash(pPg) );
879 }
880 
881 #else
882 #define pager_datahash(X,Y)  0
883 #define pager_pagehash(X)  0
884 #define CHECK_PAGE(x)
885 #endif
886 
887 /*
888 ** When this is called the journal file for pager pPager must be open.
889 ** The master journal file name is read from the end of the file and
890 ** written into memory supplied by the caller.
891 **
892 ** zMaster must point to a buffer of at least nMaster bytes allocated by
893 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is
894 ** enough space to write the master journal name). If the master journal
895 ** name in the journal is longer than nMaster bytes (including a
896 ** nul-terminator), then this is handled as if no master journal name
897 ** were present in the journal.
898 **
899 ** If no master journal file name is present zMaster[0] is set to 0 and
900 ** SQLITE_OK returned.
901 */
902 static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, int nMaster){
903   int rc;
904   u32 len;
905   i64 szJ;
906   u32 cksum;
907   int i;
908   unsigned char aMagic[8]; /* A buffer to hold the magic header */
909 
910   zMaster[0] = '\0';
911 
912   rc = sqlite3OsFileSize(pJrnl, &szJ);
913   if( rc!=SQLITE_OK || szJ<16 ) return rc;
914 
915   rc = read32bits(pJrnl, szJ-16, &len);
916   if( rc!=SQLITE_OK ) return rc;
917 
918   if( len>=nMaster ){
919     return SQLITE_OK;
920   }
921 
922   rc = read32bits(pJrnl, szJ-12, &cksum);
923   if( rc!=SQLITE_OK ) return rc;
924 
925   rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8);
926   if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc;
927 
928   rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len);
929   if( rc!=SQLITE_OK ){
930     return rc;
931   }
932   zMaster[len] = '\0';
933 
934   /* See if the checksum matches the master journal name */
935   for(i=0; i<len; i++){
936     cksum -= zMaster[i];
937    }
938   if( cksum ){
939     /* If the checksum doesn't add up, then one or more of the disk sectors
940     ** containing the master journal filename is corrupted. This means
941     ** definitely roll back, so just return SQLITE_OK and report a (nul)
942     ** master-journal filename.
943     */
944     zMaster[0] = '\0';
945   }
946 
947   return SQLITE_OK;
948 }
949 
950 /*
951 ** Seek the journal file descriptor to the next sector boundary where a
952 ** journal header may be read or written. Pager.journalOff is updated with
953 ** the new seek offset.
954 **
955 ** i.e for a sector size of 512:
956 **
957 ** Input Offset              Output Offset
958 ** ---------------------------------------
959 ** 0                         0
960 ** 512                       512
961 ** 100                       512
962 ** 2000                      2048
963 **
964 */
965 static void seekJournalHdr(Pager *pPager){
966   i64 offset = 0;
967   i64 c = pPager->journalOff;
968   if( c ){
969     offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);
970   }
971   assert( offset%JOURNAL_HDR_SZ(pPager)==0 );
972   assert( offset>=c );
973   assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );
974   pPager->journalOff = offset;
975 }
976 
977 /*
978 ** The journal file must be open when this routine is called. A journal
979 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
980 ** current location.
981 **
982 ** The format for the journal header is as follows:
983 ** - 8 bytes: Magic identifying journal format.
984 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.
985 ** - 4 bytes: Random number used for page hash.
986 ** - 4 bytes: Initial database page count.
987 ** - 4 bytes: Sector size used by the process that wrote this journal.
988 **
989 ** Followed by (JOURNAL_HDR_SZ - 24) bytes of unused space.
990 */
991 static int writeJournalHdr(Pager *pPager){
992   char zHeader[sizeof(aJournalMagic)+16];
993   int rc;
994 
995   if( pPager->stmtHdrOff==0 ){
996     pPager->stmtHdrOff = pPager->journalOff;
997   }
998 
999   seekJournalHdr(pPager);
1000   pPager->journalHdr = pPager->journalOff;
1001 
1002   memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
1003 
1004   /*
1005   ** Write the nRec Field - the number of page records that follow this
1006   ** journal header. Normally, zero is written to this value at this time.
1007   ** After the records are added to the journal (and the journal synced,
1008   ** if in full-sync mode), the zero is overwritten with the true number
1009   ** of records (see syncJournal()).
1010   **
1011   ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When
1012   ** reading the journal this value tells SQLite to assume that the
1013   ** rest of the journal file contains valid page records. This assumption
1014   ** is dangerous, as if a failure occured whilst writing to the journal
1015   ** file it may contain some garbage data. There are two scenarios
1016   ** where this risk can be ignored:
1017   **
1018   **   * When the pager is in no-sync mode. Corruption can follow a
1019   **     power failure in this case anyway.
1020   **
1021   **   * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees
1022   **     that garbage data is never appended to the journal file.
1023   */
1024   assert(pPager->fd->pMethods||pPager->noSync);
1025   if( (pPager->noSync)
1026    || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)
1027   ){
1028     put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);
1029   }else{
1030     put32bits(&zHeader[sizeof(aJournalMagic)], 0);
1031   }
1032 
1033   /* The random check-hash initialiser */
1034   sqlite3Randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
1035   put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
1036   /* The initial database size */
1037   put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbSize);
1038   /* The assumed sector size for this process */
1039   put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
1040   IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, sizeof(zHeader)))
1041   rc = sqlite3OsWrite(pPager->jfd, zHeader, sizeof(zHeader),pPager->journalOff);
1042   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
1043 
1044   /* The journal header has been written successfully. Seek the journal
1045   ** file descriptor to the end of the journal header sector.
1046   */
1047   if( rc==SQLITE_OK ){
1048     IOTRACE(("JTAIL %p %lld\n", pPager, pPager->journalOff-1))
1049     rc = sqlite3OsWrite(pPager->jfd, "\000", 1, pPager->journalOff-1);
1050   }
1051   return rc;
1052 }
1053 
1054 /*
1055 ** The journal file must be open when this is called. A journal header file
1056 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
1057 ** file. See comments above function writeJournalHdr() for a description of
1058 ** the journal header format.
1059 **
1060 ** If the header is read successfully, *nRec is set to the number of
1061 ** page records following this header and *dbSize is set to the size of the
1062 ** database before the transaction began, in pages. Also, pPager->cksumInit
1063 ** is set to the value read from the journal header. SQLITE_OK is returned
1064 ** in this case.
1065 **
1066 ** If the journal header file appears to be corrupted, SQLITE_DONE is
1067 ** returned and *nRec and *dbSize are not set.  If JOURNAL_HDR_SZ bytes
1068 ** cannot be read from the journal file an error code is returned.
1069 */
1070 static int readJournalHdr(
1071   Pager *pPager,
1072   i64 journalSize,
1073   u32 *pNRec,
1074   u32 *pDbSize
1075 ){
1076   int rc;
1077   unsigned char aMagic[8]; /* A buffer to hold the magic header */
1078   i64 jrnlOff;
1079 
1080   seekJournalHdr(pPager);
1081   if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
1082     return SQLITE_DONE;
1083   }
1084   jrnlOff = pPager->journalOff;
1085 
1086   rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), jrnlOff);
1087   if( rc ) return rc;
1088   jrnlOff += sizeof(aMagic);
1089 
1090   if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
1091     return SQLITE_DONE;
1092   }
1093 
1094   rc = read32bits(pPager->jfd, jrnlOff, pNRec);
1095   if( rc ) return rc;
1096 
1097   rc = read32bits(pPager->jfd, jrnlOff+4, &pPager->cksumInit);
1098   if( rc ) return rc;
1099 
1100   rc = read32bits(pPager->jfd, jrnlOff+8, pDbSize);
1101   if( rc ) return rc;
1102 
1103   /* Update the assumed sector-size to match the value used by
1104   ** the process that created this journal. If this journal was
1105   ** created by a process other than this one, then this routine
1106   ** is being called from within pager_playback(). The local value
1107   ** of Pager.sectorSize is restored at the end of that routine.
1108   */
1109   rc = read32bits(pPager->jfd, jrnlOff+12, (u32 *)&pPager->sectorSize);
1110   if( rc ) return rc;
1111 
1112   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
1113   return SQLITE_OK;
1114 }
1115 
1116 
1117 /*
1118 ** Write the supplied master journal name into the journal file for pager
1119 ** pPager at the current location. The master journal name must be the last
1120 ** thing written to a journal file. If the pager is in full-sync mode, the
1121 ** journal file descriptor is advanced to the next sector boundary before
1122 ** anything is written. The format is:
1123 **
1124 ** + 4 bytes: PAGER_MJ_PGNO.
1125 ** + N bytes: length of master journal name.
1126 ** + 4 bytes: N
1127 ** + 4 bytes: Master journal name checksum.
1128 ** + 8 bytes: aJournalMagic[].
1129 **
1130 ** The master journal page checksum is the sum of the bytes in the master
1131 ** journal name.
1132 **
1133 ** If zMaster is a NULL pointer (occurs for a single database transaction),
1134 ** this call is a no-op.
1135 */
1136 static int writeMasterJournal(Pager *pPager, const char *zMaster){
1137   int rc;
1138   int len;
1139   int i;
1140   i64 jrnlOff;
1141   u32 cksum = 0;
1142   char zBuf[sizeof(aJournalMagic)+2*4];
1143 
1144   if( !zMaster || pPager->setMaster) return SQLITE_OK;
1145   pPager->setMaster = 1;
1146 
1147   len = strlen(zMaster);
1148   for(i=0; i<len; i++){
1149     cksum += zMaster[i];
1150   }
1151 
1152   /* If in full-sync mode, advance to the next disk sector before writing
1153   ** the master journal name. This is in case the previous page written to
1154   ** the journal has already been synced.
1155   */
1156   if( pPager->fullSync ){
1157     seekJournalHdr(pPager);
1158   }
1159   jrnlOff = pPager->journalOff;
1160   pPager->journalOff += (len+20);
1161 
1162   rc = write32bits(pPager->jfd, jrnlOff, PAGER_MJ_PGNO(pPager));
1163   if( rc!=SQLITE_OK ) return rc;
1164   jrnlOff += 4;
1165 
1166   rc = sqlite3OsWrite(pPager->jfd, zMaster, len, jrnlOff);
1167   if( rc!=SQLITE_OK ) return rc;
1168   jrnlOff += len;
1169 
1170   put32bits(zBuf, len);
1171   put32bits(&zBuf[4], cksum);
1172   memcpy(&zBuf[8], aJournalMagic, sizeof(aJournalMagic));
1173   rc = sqlite3OsWrite(pPager->jfd, zBuf, 8+sizeof(aJournalMagic), jrnlOff);
1174   pPager->needSync = !pPager->noSync;
1175   return rc;
1176 }
1177 
1178 /*
1179 ** Add or remove a page from the list of all pages that are in the
1180 ** statement journal.
1181 **
1182 ** The Pager keeps a separate list of pages that are currently in
1183 ** the statement journal.  This helps the sqlite3PagerStmtCommit()
1184 ** routine run MUCH faster for the common case where there are many
1185 ** pages in memory but only a few are in the statement journal.
1186 */
1187 static void page_add_to_stmt_list(PgHdr *pPg){
1188   Pager *pPager = pPg->pPager;
1189   PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
1190   assert( MEMDB );
1191   if( !pHist->inStmt ){
1192     assert( pHist->pPrevStmt==0 && pHist->pNextStmt==0 );
1193     if( pPager->pStmt ){
1194       PGHDR_TO_HIST(pPager->pStmt, pPager)->pPrevStmt = pPg;
1195     }
1196     pHist->pNextStmt = pPager->pStmt;
1197     pPager->pStmt = pPg;
1198     pHist->inStmt = 1;
1199   }
1200 }
1201 
1202 /*
1203 ** Find a page in the hash table given its page number.  Return
1204 ** a pointer to the page or NULL if not found.
1205 */
1206 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
1207   PgHdr *p;
1208   if( pPager->aHash==0 ) return 0;
1209   p = pPager->aHash[pgno & (pPager->nHash-1)];
1210   while( p && p->pgno!=pgno ){
1211     p = p->pNextHash;
1212   }
1213   return p;
1214 }
1215 
1216 /*
1217 ** Clear the in-memory cache.  This routine
1218 ** sets the state of the pager back to what it was when it was first
1219 ** opened.  Any outstanding pages are invalidated and subsequent attempts
1220 ** to access those pages will likely result in a coredump.
1221 */
1222 static void pager_reset(Pager *pPager){
1223   PgHdr *pPg, *pNext;
1224   if( pPager->errCode ) return;
1225   for(pPg=pPager->pAll; pPg; pPg=pNext){
1226     IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
1227     PAGER_INCR(sqlite3_pager_pgfree_count);
1228     pNext = pPg->pNextAll;
1229     lruListRemove(pPg);
1230     sqlite3_free(pPg);
1231   }
1232   assert(pPager->lru.pFirst==0);
1233   assert(pPager->lru.pFirstSynced==0);
1234   assert(pPager->lru.pLast==0);
1235   pPager->pStmt = 0;
1236   pPager->pAll = 0;
1237   pPager->pDirty = 0;
1238   pPager->nHash = 0;
1239   sqlite3_free(pPager->aHash);
1240   pPager->nPage = 0;
1241   pPager->aHash = 0;
1242   pPager->nRef = 0;
1243 }
1244 
1245 /*
1246 ** Unlock the database file.
1247 **
1248 ** If the pager is currently in error state, discard the contents of
1249 ** the cache and reset the Pager structure internal state. If there is
1250 ** an open journal-file, then the next time a shared-lock is obtained
1251 ** on the pager file (by this or any other process), it will be
1252 ** treated as a hot-journal and rolled back.
1253 */
1254 static void pager_unlock(Pager *pPager){
1255   if( !pPager->exclusiveMode ){
1256     if( !MEMDB ){
1257       if( pPager->fd->pMethods ){
1258         osUnlock(pPager->fd, NO_LOCK);
1259       }
1260       pPager->dbSize = -1;
1261       IOTRACE(("UNLOCK %p\n", pPager))
1262 
1263       /* If Pager.errCode is set, the contents of the pager cache cannot be
1264       ** trusted. Now that the pager file is unlocked, the contents of the
1265       ** cache can be discarded and the error code safely cleared.
1266       */
1267       if( pPager->errCode ){
1268         pPager->errCode = SQLITE_OK;
1269         pager_reset(pPager);
1270         if( pPager->stmtOpen ){
1271           sqlite3OsClose(pPager->stfd);
1272           sqlite3_free(pPager->aInStmt);
1273           pPager->aInStmt = 0;
1274         }
1275         if( pPager->journalOpen ){
1276           sqlite3OsClose(pPager->jfd);
1277           pPager->journalOpen = 0;
1278           sqlite3_free(pPager->aInJournal);
1279           pPager->aInJournal = 0;
1280         }
1281         pPager->stmtOpen = 0;
1282         pPager->stmtInUse = 0;
1283         pPager->journalOff = 0;
1284         pPager->journalStarted = 0;
1285         pPager->stmtAutoopen = 0;
1286         pPager->origDbSize = 0;
1287       }
1288     }
1289 
1290     if( !MEMDB || pPager->errCode==SQLITE_OK ){
1291       pPager->state = PAGER_UNLOCK;
1292       pPager->changeCountDone = 0;
1293     }
1294   }
1295 }
1296 
1297 /*
1298 ** Execute a rollback if a transaction is active and unlock the
1299 ** database file. If the pager has already entered the error state,
1300 ** do not attempt the rollback.
1301 */
1302 static void pagerUnlockAndRollback(Pager *p){
1303   assert( p->state>=PAGER_RESERVED || p->journalOpen==0 );
1304   if( p->errCode==SQLITE_OK && p->state>=PAGER_RESERVED ){
1305     sqlite3PagerRollback(p);
1306   }
1307   pager_unlock(p);
1308   assert( p->errCode || !p->journalOpen || (p->exclusiveMode&&!p->journalOff) );
1309   assert( p->errCode || !p->stmtOpen || p->exclusiveMode );
1310 }
1311 
1312 /*
1313 ** This routine ends a transaction.  A transaction is ended by either
1314 ** a COMMIT or a ROLLBACK.
1315 **
1316 ** When this routine is called, the pager has the journal file open and
1317 ** a RESERVED or EXCLUSIVE lock on the database.  This routine will release
1318 ** the database lock and acquires a SHARED lock in its place if that is
1319 ** the appropriate thing to do.  Release locks usually is appropriate,
1320 ** unless we are in exclusive access mode or unless this is a
1321 ** COMMIT AND BEGIN or ROLLBACK AND BEGIN operation.
1322 **
1323 ** The journal file is either deleted or truncated.
1324 **
1325 ** TODO: Consider keeping the journal file open for temporary databases.
1326 ** This might give a performance improvement on windows where opening
1327 ** a file is an expensive operation.
1328 */
1329 static int pager_end_transaction(Pager *pPager){
1330   PgHdr *pPg;
1331   int rc = SQLITE_OK;
1332   int rc2 = SQLITE_OK;
1333   assert( !MEMDB );
1334   if( pPager->state<PAGER_RESERVED ){
1335     return SQLITE_OK;
1336   }
1337   sqlite3PagerStmtCommit(pPager);
1338   if( pPager->stmtOpen && !pPager->exclusiveMode ){
1339     sqlite3OsClose(pPager->stfd);
1340     pPager->stmtOpen = 0;
1341   }
1342   if( pPager->journalOpen ){
1343     if( pPager->exclusiveMode
1344           && (rc = sqlite3OsTruncate(pPager->jfd, 0))==SQLITE_OK ){;
1345       pPager->journalOff = 0;
1346       pPager->journalStarted = 0;
1347     }else{
1348       sqlite3OsClose(pPager->jfd);
1349       pPager->journalOpen = 0;
1350       if( rc==SQLITE_OK ){
1351         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
1352       }
1353     }
1354     sqlite3_free( pPager->aInJournal );
1355     pPager->aInJournal = 0;
1356     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1357       pPg->inJournal = 0;
1358       pPg->dirty = 0;
1359       pPg->needSync = 0;
1360       pPg->alwaysRollback = 0;
1361 #ifdef SQLITE_CHECK_PAGES
1362       pPg->pageHash = pager_pagehash(pPg);
1363 #endif
1364     }
1365     pPager->pDirty = 0;
1366     pPager->dirtyCache = 0;
1367     pPager->nRec = 0;
1368   }else{
1369     assert( pPager->aInJournal==0 );
1370     assert( pPager->dirtyCache==0 || pPager->useJournal==0 );
1371   }
1372 
1373   if( !pPager->exclusiveMode ){
1374     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
1375     pPager->state = PAGER_SHARED;
1376   }else if( pPager->state==PAGER_SYNCED ){
1377     pPager->state = PAGER_EXCLUSIVE;
1378   }
1379   pPager->origDbSize = 0;
1380   pPager->setMaster = 0;
1381   pPager->needSync = 0;
1382   lruListSetFirstSynced(pPager);
1383   pPager->dbSize = -1;
1384 
1385   return (rc==SQLITE_OK?rc2:rc);
1386 }
1387 
1388 /*
1389 ** Compute and return a checksum for the page of data.
1390 **
1391 ** This is not a real checksum.  It is really just the sum of the
1392 ** random initial value and the page number.  We experimented with
1393 ** a checksum of the entire data, but that was found to be too slow.
1394 **
1395 ** Note that the page number is stored at the beginning of data and
1396 ** the checksum is stored at the end.  This is important.  If journal
1397 ** corruption occurs due to a power failure, the most likely scenario
1398 ** is that one end or the other of the record will be changed.  It is
1399 ** much less likely that the two ends of the journal record will be
1400 ** correct and the middle be corrupt.  Thus, this "checksum" scheme,
1401 ** though fast and simple, catches the mostly likely kind of corruption.
1402 **
1403 ** FIX ME:  Consider adding every 200th (or so) byte of the data to the
1404 ** checksum.  That way if a single page spans 3 or more disk sectors and
1405 ** only the middle sector is corrupt, we will still have a reasonable
1406 ** chance of failing the checksum and thus detecting the problem.
1407 */
1408 static u32 pager_cksum(Pager *pPager, const u8 *aData){
1409   u32 cksum = pPager->cksumInit;
1410   int i = pPager->pageSize-200;
1411   while( i>0 ){
1412     cksum += aData[i];
1413     i -= 200;
1414   }
1415   return cksum;
1416 }
1417 
1418 /* Forward declaration */
1419 static void makeClean(PgHdr*);
1420 
1421 /*
1422 ** Read a single page from the journal file opened on file descriptor
1423 ** jfd.  Playback this one page.
1424 **
1425 ** If useCksum==0 it means this journal does not use checksums.  Checksums
1426 ** are not used in statement journals because statement journals do not
1427 ** need to survive power failures.
1428 */
1429 static int pager_playback_one_page(
1430   Pager *pPager,
1431   sqlite3_file *jfd,
1432   i64 offset,
1433   int useCksum
1434 ){
1435   int rc;
1436   PgHdr *pPg;                   /* An existing page in the cache */
1437   Pgno pgno;                    /* The page number of a page in journal */
1438   u32 cksum;                    /* Checksum used for sanity checking */
1439   u8 *aData = (u8 *)pPager->pTmpSpace;   /* Temp storage for a page */
1440 
1441   /* useCksum should be true for the main journal and false for
1442   ** statement journals.  Verify that this is always the case
1443   */
1444   assert( jfd == (useCksum ? pPager->jfd : pPager->stfd) );
1445   assert( aData );
1446 
1447   rc = read32bits(jfd, offset, &pgno);
1448   if( rc!=SQLITE_OK ) return rc;
1449   rc = sqlite3OsRead(jfd, aData, pPager->pageSize, offset+4);
1450   if( rc!=SQLITE_OK ) return rc;
1451   pPager->journalOff += pPager->pageSize + 4;
1452 
1453   /* Sanity checking on the page.  This is more important that I originally
1454   ** thought.  If a power failure occurs while the journal is being written,
1455   ** it could cause invalid data to be written into the journal.  We need to
1456   ** detect this invalid data (with high probability) and ignore it.
1457   */
1458   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
1459     return SQLITE_DONE;
1460   }
1461   if( pgno>(unsigned)pPager->dbSize ){
1462     return SQLITE_OK;
1463   }
1464   if( useCksum ){
1465     rc = read32bits(jfd, offset+pPager->pageSize+4, &cksum);
1466     if( rc ) return rc;
1467     pPager->journalOff += 4;
1468     if( pager_cksum(pPager, aData)!=cksum ){
1469       return SQLITE_DONE;
1470     }
1471   }
1472 
1473   assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
1474 
1475   /* If the pager is in RESERVED state, then there must be a copy of this
1476   ** page in the pager cache. In this case just update the pager cache,
1477   ** not the database file. The page is left marked dirty in this case.
1478   **
1479   ** An exception to the above rule: If the database is in no-sync mode
1480   ** and a page is moved during an incremental vacuum then the page may
1481   ** not be in the pager cache. Later: if a malloc() or IO error occurs
1482   ** during a Movepage() call, then the page may not be in the cache
1483   ** either. So the condition described in the above paragraph is not
1484   ** assert()able.
1485   **
1486   ** If in EXCLUSIVE state, then we update the pager cache if it exists
1487   ** and the main file. The page is then marked not dirty.
1488   **
1489   ** Ticket #1171:  The statement journal might contain page content that is
1490   ** different from the page content at the start of the transaction.
1491   ** This occurs when a page is changed prior to the start of a statement
1492   ** then changed again within the statement.  When rolling back such a
1493   ** statement we must not write to the original database unless we know
1494   ** for certain that original page contents are synced into the main rollback
1495   ** journal.  Otherwise, a power loss might leave modified data in the
1496   ** database file without an entry in the rollback journal that can
1497   ** restore the database to its original form.  Two conditions must be
1498   ** met before writing to the database files. (1) the database must be
1499   ** locked.  (2) we know that the original page content is fully synced
1500   ** in the main journal either because the page is not in cache or else
1501   ** the page is marked as needSync==0.
1502   */
1503   pPg = pager_lookup(pPager, pgno);
1504   PAGERTRACE4("PLAYBACK %d page %d hash(%08x)\n",
1505                PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, aData));
1506   if( pPager->state>=PAGER_EXCLUSIVE && (pPg==0 || pPg->needSync==0) ){
1507     i64 offset = (pgno-1)*(i64)pPager->pageSize;
1508     rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, offset);
1509     if( pPg ){
1510       makeClean(pPg);
1511     }
1512   }
1513   if( pPg ){
1514     /* No page should ever be explicitly rolled back that is in use, except
1515     ** for page 1 which is held in use in order to keep the lock on the
1516     ** database active. However such a page may be rolled back as a result
1517     ** of an internal error resulting in an automatic call to
1518     ** sqlite3PagerRollback().
1519     */
1520     void *pData;
1521     /* assert( pPg->nRef==0 || pPg->pgno==1 ); */
1522     pData = PGHDR_TO_DATA(pPg);
1523     memcpy(pData, aData, pPager->pageSize);
1524     if( pPager->xReiniter ){
1525       pPager->xReiniter(pPg, pPager->pageSize);
1526     }
1527 #ifdef SQLITE_CHECK_PAGES
1528     pPg->pageHash = pager_pagehash(pPg);
1529 #endif
1530     /* If this was page 1, then restore the value of Pager.dbFileVers.
1531     ** Do this before any decoding. */
1532     if( pgno==1 ){
1533       memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));
1534     }
1535 
1536     /* Decode the page just read from disk */
1537     CODEC1(pPager, pData, pPg->pgno, 3);
1538   }
1539   return rc;
1540 }
1541 
1542 /*
1543 ** Parameter zMaster is the name of a master journal file. A single journal
1544 ** file that referred to the master journal file has just been rolled back.
1545 ** This routine checks if it is possible to delete the master journal file,
1546 ** and does so if it is.
1547 **
1548 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not
1549 ** available for use within this function.
1550 **
1551 **
1552 ** The master journal file contains the names of all child journals.
1553 ** To tell if a master journal can be deleted, check to each of the
1554 ** children.  If all children are either missing or do not refer to
1555 ** a different master journal, then this master journal can be deleted.
1556 */
1557 static int pager_delmaster(Pager *pPager, const char *zMaster){
1558   sqlite3_vfs *pVfs = pPager->pVfs;
1559   int rc;
1560   int master_open = 0;
1561   sqlite3_file *pMaster;
1562   sqlite3_file *pJournal;
1563   char *zMasterJournal = 0; /* Contents of master journal file */
1564   i64 nMasterJournal;       /* Size of master journal file */
1565 
1566   /* Open the master journal file exclusively in case some other process
1567   ** is running this routine also. Not that it makes too much difference.
1568   */
1569   pMaster = (sqlite3_file *)sqlite3_malloc(pVfs->szOsFile * 2);
1570   pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile);
1571   if( !pMaster ){
1572     rc = SQLITE_NOMEM;
1573   }else{
1574     int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL);
1575     rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);
1576   }
1577   if( rc!=SQLITE_OK ) goto delmaster_out;
1578   master_open = 1;
1579 
1580   rc = sqlite3OsFileSize(pMaster, &nMasterJournal);
1581   if( rc!=SQLITE_OK ) goto delmaster_out;
1582 
1583   if( nMasterJournal>0 ){
1584     char *zJournal;
1585     char *zMasterPtr = 0;
1586     int nMasterPtr = pPager->pVfs->mxPathname+1;
1587 
1588     /* Load the entire master journal file into space obtained from
1589     ** sqlite3_malloc() and pointed to by zMasterJournal.
1590     */
1591     zMasterJournal = (char *)sqlite3_malloc(nMasterJournal + nMasterPtr);
1592     if( !zMasterJournal ){
1593       rc = SQLITE_NOMEM;
1594       goto delmaster_out;
1595     }
1596     zMasterPtr = &zMasterJournal[nMasterJournal];
1597     rc = sqlite3OsRead(pMaster, zMasterJournal, nMasterJournal, 0);
1598     if( rc!=SQLITE_OK ) goto delmaster_out;
1599 
1600     zJournal = zMasterJournal;
1601     while( (zJournal-zMasterJournal)<nMasterJournal ){
1602       if( sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS) ){
1603         /* One of the journals pointed to by the master journal exists.
1604         ** Open it and check if it points at the master journal. If
1605         ** so, return without deleting the master journal file.
1606         */
1607         int c;
1608         int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL);
1609         rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);
1610         if( rc!=SQLITE_OK ){
1611           goto delmaster_out;
1612         }
1613 
1614         rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);
1615         sqlite3OsClose(pJournal);
1616         if( rc!=SQLITE_OK ){
1617           goto delmaster_out;
1618         }
1619 
1620         c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;
1621         if( c ){
1622           /* We have a match. Do not delete the master journal file. */
1623           goto delmaster_out;
1624         }
1625       }
1626       zJournal += (strlen(zJournal)+1);
1627     }
1628   }
1629 
1630   rc = sqlite3OsDelete(pVfs, zMaster, 0);
1631 
1632 delmaster_out:
1633   if( zMasterJournal ){
1634     sqlite3_free(zMasterJournal);
1635   }
1636   if( master_open ){
1637     sqlite3OsClose(pMaster);
1638   }
1639   sqlite3_free(pMaster);
1640   return rc;
1641 }
1642 
1643 
1644 static void pager_truncate_cache(Pager *pPager);
1645 
1646 /*
1647 ** Truncate the main file of the given pager to the number of pages
1648 ** indicated. Also truncate the cached representation of the file.
1649 **
1650 ** Might might be the case that the file on disk is smaller than nPage.
1651 ** This can happen, for example, if we are in the middle of a transaction
1652 ** which has extended the file size and the new pages are still all held
1653 ** in cache, then an INSERT or UPDATE does a statement rollback.  Some
1654 ** operating system implementations can get confused if you try to
1655 ** truncate a file to some size that is larger than it currently is,
1656 ** so detect this case and do not do the truncation.
1657 */
1658 static int pager_truncate(Pager *pPager, int nPage){
1659   int rc = SQLITE_OK;
1660   if( pPager->state>=PAGER_EXCLUSIVE && pPager->fd->pMethods ){
1661     i64 currentSize, newSize;
1662     rc = sqlite3OsFileSize(pPager->fd, &currentSize);
1663     newSize = pPager->pageSize*(i64)nPage;
1664     if( rc==SQLITE_OK && currentSize>newSize ){
1665       rc = sqlite3OsTruncate(pPager->fd, newSize);
1666     }
1667   }
1668   if( rc==SQLITE_OK ){
1669     pPager->dbSize = nPage;
1670     pager_truncate_cache(pPager);
1671   }
1672   return rc;
1673 }
1674 
1675 /*
1676 ** Set the sectorSize for the given pager.
1677 **
1678 ** The sector size is the larger of the sector size reported
1679 ** by sqlite3OsSectorSize() and the pageSize.
1680 */
1681 static void setSectorSize(Pager *pPager){
1682   assert(pPager->fd->pMethods||pPager->tempFile);
1683   if( !pPager->tempFile ){
1684     /* Sector size doesn't matter for temporary files. Also, the file
1685     ** may not have been opened yet, in whcih case the OsSectorSize()
1686     ** call will segfault.
1687     */
1688     pPager->sectorSize = sqlite3OsSectorSize(pPager->fd);
1689   }
1690   if( pPager->sectorSize<pPager->pageSize ){
1691     pPager->sectorSize = pPager->pageSize;
1692   }
1693 }
1694 
1695 /*
1696 ** Playback the journal and thus restore the database file to
1697 ** the state it was in before we started making changes.
1698 **
1699 ** The journal file format is as follows:
1700 **
1701 **  (1)  8 byte prefix.  A copy of aJournalMagic[].
1702 **  (2)  4 byte big-endian integer which is the number of valid page records
1703 **       in the journal.  If this value is 0xffffffff, then compute the
1704 **       number of page records from the journal size.
1705 **  (3)  4 byte big-endian integer which is the initial value for the
1706 **       sanity checksum.
1707 **  (4)  4 byte integer which is the number of pages to truncate the
1708 **       database to during a rollback.
1709 **  (5)  4 byte integer which is the number of bytes in the master journal
1710 **       name.  The value may be zero (indicate that there is no master
1711 **       journal.)
1712 **  (6)  N bytes of the master journal name.  The name will be nul-terminated
1713 **       and might be shorter than the value read from (5).  If the first byte
1714 **       of the name is \000 then there is no master journal.  The master
1715 **       journal name is stored in UTF-8.
1716 **  (7)  Zero or more pages instances, each as follows:
1717 **        +  4 byte page number.
1718 **        +  pPager->pageSize bytes of data.
1719 **        +  4 byte checksum
1720 **
1721 ** When we speak of the journal header, we mean the first 6 items above.
1722 ** Each entry in the journal is an instance of the 7th item.
1723 **
1724 ** Call the value from the second bullet "nRec".  nRec is the number of
1725 ** valid page entries in the journal.  In most cases, you can compute the
1726 ** value of nRec from the size of the journal file.  But if a power
1727 ** failure occurred while the journal was being written, it could be the
1728 ** case that the size of the journal file had already been increased but
1729 ** the extra entries had not yet made it safely to disk.  In such a case,
1730 ** the value of nRec computed from the file size would be too large.  For
1731 ** that reason, we always use the nRec value in the header.
1732 **
1733 ** If the nRec value is 0xffffffff it means that nRec should be computed
1734 ** from the file size.  This value is used when the user selects the
1735 ** no-sync option for the journal.  A power failure could lead to corruption
1736 ** in this case.  But for things like temporary table (which will be
1737 ** deleted when the power is restored) we don't care.
1738 **
1739 ** If the file opened as the journal file is not a well-formed
1740 ** journal file then all pages up to the first corrupted page are rolled
1741 ** back (or no pages if the journal header is corrupted). The journal file
1742 ** is then deleted and SQLITE_OK returned, just as if no corruption had
1743 ** been encountered.
1744 **
1745 ** If an I/O or malloc() error occurs, the journal-file is not deleted
1746 ** and an error code is returned.
1747 */
1748 static int pager_playback(Pager *pPager, int isHot){
1749   sqlite3_vfs *pVfs = pPager->pVfs;
1750   i64 szJ;                 /* Size of the journal file in bytes */
1751   u32 nRec;                /* Number of Records in the journal */
1752   int i;                   /* Loop counter */
1753   Pgno mxPg = 0;           /* Size of the original file in pages */
1754   int rc;                  /* Result code of a subroutine */
1755   char *zMaster = 0;       /* Name of master journal file if any */
1756 
1757   /* Figure out how many records are in the journal.  Abort early if
1758   ** the journal is empty.
1759   */
1760   assert( pPager->journalOpen );
1761   rc = sqlite3OsFileSize(pPager->jfd, &szJ);
1762   if( rc!=SQLITE_OK || szJ==0 ){
1763     goto end_playback;
1764   }
1765 
1766   /* Read the master journal name from the journal, if it is present.
1767   ** If a master journal file name is specified, but the file is not
1768   ** present on disk, then the journal is not hot and does not need to be
1769   ** played back.
1770   */
1771   zMaster = pPager->pTmpSpace;
1772   rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1773   assert( rc!=SQLITE_DONE );
1774   if( rc!=SQLITE_OK
1775    || (zMaster[0] && !sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS))
1776   ){
1777     zMaster = 0;
1778     if( rc==SQLITE_DONE ) rc = SQLITE_OK;
1779     goto end_playback;
1780   }
1781   pPager->journalOff = 0;
1782   zMaster = 0;
1783 
1784   /* This loop terminates either when the readJournalHdr() call returns
1785   ** SQLITE_DONE or an IO error occurs. */
1786   while( 1 ){
1787 
1788     /* Read the next journal header from the journal file.  If there are
1789     ** not enough bytes left in the journal file for a complete header, or
1790     ** it is corrupted, then a process must of failed while writing it.
1791     ** This indicates nothing more needs to be rolled back.
1792     */
1793     rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
1794     if( rc!=SQLITE_OK ){
1795       if( rc==SQLITE_DONE ){
1796         rc = SQLITE_OK;
1797       }
1798       goto end_playback;
1799     }
1800 
1801     /* If nRec is 0xffffffff, then this journal was created by a process
1802     ** working in no-sync mode. This means that the rest of the journal
1803     ** file consists of pages, there are no more journal headers. Compute
1804     ** the value of nRec based on this assumption.
1805     */
1806     if( nRec==0xffffffff ){
1807       assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
1808       nRec = (szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager);
1809     }
1810 
1811     /* If nRec is 0 and this rollback is of a transaction created by this
1812     ** process and if this is the final header in the journal, then it means
1813     ** that this part of the journal was being filled but has not yet been
1814     ** synced to disk.  Compute the number of pages based on the remaining
1815     ** size of the file.
1816     **
1817     ** The third term of the test was added to fix ticket #2565.
1818     */
1819     if( nRec==0 && !isHot &&
1820         pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
1821       nRec = (szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager);
1822     }
1823 
1824     /* If this is the first header read from the journal, truncate the
1825     ** database file back to its original size.
1826     */
1827     if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
1828       rc = pager_truncate(pPager, mxPg);
1829       if( rc!=SQLITE_OK ){
1830         goto end_playback;
1831       }
1832     }
1833 
1834     /* Copy original pages out of the journal and back into the database file.
1835     */
1836     for(i=0; i<nRec; i++){
1837       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1838       if( rc!=SQLITE_OK ){
1839         if( rc==SQLITE_DONE ){
1840           rc = SQLITE_OK;
1841           pPager->journalOff = szJ;
1842           break;
1843         }else{
1844           goto end_playback;
1845         }
1846       }
1847     }
1848   }
1849   /*NOTREACHED*/
1850   assert( 0 );
1851 
1852 end_playback:
1853   if( rc==SQLITE_OK ){
1854     zMaster = pPager->pTmpSpace;
1855     rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
1856   }
1857   if( rc==SQLITE_OK ){
1858     rc = pager_end_transaction(pPager);
1859   }
1860   if( rc==SQLITE_OK && zMaster[0] ){
1861     /* If there was a master journal and this routine will return success,
1862     ** see if it is possible to delete the master journal.
1863     */
1864     rc = pager_delmaster(pPager, zMaster);
1865   }
1866 
1867   /* The Pager.sectorSize variable may have been updated while rolling
1868   ** back a journal created by a process with a different sector size
1869   ** value. Reset it to the correct value for this process.
1870   */
1871   setSectorSize(pPager);
1872   return rc;
1873 }
1874 
1875 /*
1876 ** Playback the statement journal.
1877 **
1878 ** This is similar to playing back the transaction journal but with
1879 ** a few extra twists.
1880 **
1881 **    (1)  The number of pages in the database file at the start of
1882 **         the statement is stored in pPager->stmtSize, not in the
1883 **         journal file itself.
1884 **
1885 **    (2)  In addition to playing back the statement journal, also
1886 **         playback all pages of the transaction journal beginning
1887 **         at offset pPager->stmtJSize.
1888 */
1889 static int pager_stmt_playback(Pager *pPager){
1890   i64 szJ;                 /* Size of the full journal */
1891   i64 hdrOff;
1892   int nRec;                /* Number of Records */
1893   int i;                   /* Loop counter */
1894   int rc;
1895 
1896   szJ = pPager->journalOff;
1897 #ifndef NDEBUG
1898   {
1899     i64 os_szJ;
1900     rc = sqlite3OsFileSize(pPager->jfd, &os_szJ);
1901     if( rc!=SQLITE_OK ) return rc;
1902     assert( szJ==os_szJ );
1903   }
1904 #endif
1905 
1906   /* Set hdrOff to be the offset just after the end of the last journal
1907   ** page written before the first journal-header for this statement
1908   ** transaction was written, or the end of the file if no journal
1909   ** header was written.
1910   */
1911   hdrOff = pPager->stmtHdrOff;
1912   assert( pPager->fullSync || !hdrOff );
1913   if( !hdrOff ){
1914     hdrOff = szJ;
1915   }
1916 
1917   /* Truncate the database back to its original size.
1918   */
1919   rc = pager_truncate(pPager, pPager->stmtSize);
1920   assert( pPager->state>=PAGER_SHARED );
1921 
1922   /* Figure out how many records are in the statement journal.
1923   */
1924   assert( pPager->stmtInUse && pPager->journalOpen );
1925   nRec = pPager->stmtNRec;
1926 
1927   /* Copy original pages out of the statement journal and back into the
1928   ** database file.  Note that the statement journal omits checksums from
1929   ** each record since power-failure recovery is not important to statement
1930   ** journals.
1931   */
1932   for(i=0; i<nRec; i++){
1933     i64 offset = i*(4+pPager->pageSize);
1934     rc = pager_playback_one_page(pPager, pPager->stfd, offset, 0);
1935     assert( rc!=SQLITE_DONE );
1936     if( rc!=SQLITE_OK ) goto end_stmt_playback;
1937   }
1938 
1939   /* Now roll some pages back from the transaction journal. Pager.stmtJSize
1940   ** was the size of the journal file when this statement was started, so
1941   ** everything after that needs to be rolled back, either into the
1942   ** database, the memory cache, or both.
1943   **
1944   ** If it is not zero, then Pager.stmtHdrOff is the offset to the start
1945   ** of the first journal header written during this statement transaction.
1946   */
1947   pPager->journalOff = pPager->stmtJSize;
1948   pPager->cksumInit = pPager->stmtCksum;
1949   while( pPager->journalOff < hdrOff ){
1950     rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1951     assert( rc!=SQLITE_DONE );
1952     if( rc!=SQLITE_OK ) goto end_stmt_playback;
1953   }
1954 
1955   while( pPager->journalOff < szJ ){
1956     u32 nJRec;         /* Number of Journal Records */
1957     u32 dummy;
1958     rc = readJournalHdr(pPager, szJ, &nJRec, &dummy);
1959     if( rc!=SQLITE_OK ){
1960       assert( rc!=SQLITE_DONE );
1961       goto end_stmt_playback;
1962     }
1963     if( nJRec==0 ){
1964       nJRec = (szJ - pPager->journalOff) / (pPager->pageSize+8);
1965     }
1966     for(i=nJRec-1; i>=0 && pPager->journalOff < szJ; i--){
1967       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
1968       assert( rc!=SQLITE_DONE );
1969       if( rc!=SQLITE_OK ) goto end_stmt_playback;
1970     }
1971   }
1972 
1973   pPager->journalOff = szJ;
1974 
1975 end_stmt_playback:
1976   if( rc==SQLITE_OK) {
1977     pPager->journalOff = szJ;
1978     /* pager_reload_cache(pPager); */
1979   }
1980   return rc;
1981 }
1982 
1983 /*
1984 ** Change the maximum number of in-memory pages that are allowed.
1985 */
1986 void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){
1987   if( mxPage>10 ){
1988     pPager->mxPage = mxPage;
1989   }else{
1990     pPager->mxPage = 10;
1991   }
1992 }
1993 
1994 /*
1995 ** Adjust the robustness of the database to damage due to OS crashes
1996 ** or power failures by changing the number of syncs()s when writing
1997 ** the rollback journal.  There are three levels:
1998 **
1999 **    OFF       sqlite3OsSync() is never called.  This is the default
2000 **              for temporary and transient files.
2001 **
2002 **    NORMAL    The journal is synced once before writes begin on the
2003 **              database.  This is normally adequate protection, but
2004 **              it is theoretically possible, though very unlikely,
2005 **              that an inopertune power failure could leave the journal
2006 **              in a state which would cause damage to the database
2007 **              when it is rolled back.
2008 **
2009 **    FULL      The journal is synced twice before writes begin on the
2010 **              database (with some additional information - the nRec field
2011 **              of the journal header - being written in between the two
2012 **              syncs).  If we assume that writing a
2013 **              single disk sector is atomic, then this mode provides
2014 **              assurance that the journal will not be corrupted to the
2015 **              point of causing damage to the database during rollback.
2016 **
2017 ** Numeric values associated with these states are OFF==1, NORMAL=2,
2018 ** and FULL=3.
2019 */
2020 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2021 void sqlite3PagerSetSafetyLevel(Pager *pPager, int level, int full_fsync){
2022   pPager->noSync =  level==1 || pPager->tempFile;
2023   pPager->fullSync = level==3 && !pPager->tempFile;
2024   pPager->sync_flags = (full_fsync?SQLITE_SYNC_FULL:SQLITE_SYNC_NORMAL);
2025   if( pPager->noSync ) pPager->needSync = 0;
2026 }
2027 #endif
2028 
/*
** The following global variable is incremented whenever the library
** attempts to open a temporary file (see sqlite3PagerOpentemp()).  It
** exists only in SQLITE_TEST builds and is used for testing and analysis
** only.
*/
#ifdef SQLITE_TEST
int sqlite3_opentemp_count = 0;
#endif
2037 
2038 /*
2039 ** Open a temporary file.
2040 **
2041 ** Write the file descriptor into *fd.  Return SQLITE_OK on success or some
2042 ** other error code if we fail. The OS will automatically delete the temporary
2043 ** file when it is closed.
2044 */
2045 static int sqlite3PagerOpentemp(
2046   sqlite3_vfs *pVfs,    /* The virtual file system layer */
2047   sqlite3_file *pFile,  /* Write the file descriptor here */
2048   char *zFilename,      /* Name of the file.  Might be NULL */
2049   int vfsFlags          /* Flags passed through to the VFS */
2050 ){
2051   int rc;
2052   assert( zFilename!=0 );
2053 
2054 #ifdef SQLITE_TEST
2055   sqlite3_opentemp_count++;  /* Used for testing and analysis only */
2056 #endif
2057 
2058   vfsFlags |=  SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
2059             SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE;
2060   rc = sqlite3OsOpen(pVfs, zFilename, pFile, vfsFlags, 0);
2061   assert( rc!=SQLITE_OK || pFile->pMethods );
2062   return rc;
2063 }
2064 
2065 /*
2066 ** Create a new page cache and put a pointer to the page cache in *ppPager.
2067 ** The file to be cached need not exist.  The file is not locked until
2068 ** the first call to sqlite3PagerGet() and is only held open until the
2069 ** last page is released using sqlite3PagerUnref().
2070 **
2071 ** If zFilename is NULL then a randomly-named temporary file is created
2072 ** and used as the file to be cached.  The file will be deleted
2073 ** automatically when it is closed.
2074 **
2075 ** If zFilename is ":memory:" then all information is held in cache.
2076 ** It is never written to disk.  This can be used to implement an
2077 ** in-memory database.
2078 */
2079 int sqlite3PagerOpen(
2080   sqlite3_vfs *pVfs,       /* The virtual file system to use */
2081   Pager **ppPager,         /* Return the Pager structure here */
2082   const char *zFilename,   /* Name of the database file to open */
2083   int nExtra,              /* Extra bytes append to each in-memory page */
2084   int flags,               /* flags controlling this file */
2085   int vfsFlags             /* flags passed through to sqlite3_vfs.xOpen() */
2086 ){
2087   u8 *pPtr;
2088   Pager *pPager = 0;
2089   int rc = SQLITE_OK;
2090   int i;
2091   int tempFile = 0;
2092   int memDb = 0;
2093   int readOnly = 0;
2094   int useJournal = (flags & PAGER_OMIT_JOURNAL)==0;
2095   int noReadlock = (flags & PAGER_NO_READLOCK)!=0;
2096   int journalFileSize = sqlite3JournalSize(pVfs);
2097   int nDefaultPage = SQLITE_DEFAULT_PAGE_SIZE;
2098   char *zPathname;
2099   int nPathname;
2100 
2101   /* The default return is a NULL pointer */
2102   *ppPager = 0;
2103 
2104   /* Compute the full pathname */
2105   nPathname = pVfs->mxPathname+1;
2106   zPathname = sqlite3_malloc(nPathname);
2107   if( zPathname==0 ){
2108     return SQLITE_NOMEM;
2109   }
2110   if( zFilename && zFilename[0] ){
2111 #ifndef SQLITE_OMIT_MEMORYDB
2112     if( strcmp(zFilename,":memory:")==0 ){
2113       memDb = 1;
2114       zPathname[0] = 0;
2115     }else
2116 #endif
2117     {
2118       rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);
2119     }
2120   }else{
2121     rc = sqlite3OsGetTempname(pVfs, nPathname, zPathname);
2122   }
2123   if( rc!=SQLITE_OK ){
2124     sqlite3_free(zPathname);
2125     return rc;
2126   }
2127   nPathname = strlen(zPathname);
2128 
2129   /* Allocate memory for the pager structure */
2130   pPager = sqlite3MallocZero(
2131     sizeof(*pPager) +           /* Pager structure */
2132     journalFileSize +           /* The journal file structure */
2133     pVfs->szOsFile * 2 +        /* The db and stmt journal files */
2134     4*nPathname + 40            /* zFilename, zDirectory, zJournal, zStmtJrnl */
2135   );
2136   if( !pPager ){
2137     sqlite3_free(zPathname);
2138     return SQLITE_NOMEM;
2139   }
2140   pPtr = (u8 *)&pPager[1];
2141   pPager->vfsFlags = vfsFlags;
2142   pPager->fd = (sqlite3_file*)&pPtr[pVfs->szOsFile*0];
2143   pPager->stfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*1];
2144   pPager->jfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*2];
2145   pPager->zFilename = (char*)&pPtr[pVfs->szOsFile*2+journalFileSize];
2146   pPager->zDirectory = &pPager->zFilename[nPathname+1];
2147   pPager->zJournal = &pPager->zDirectory[nPathname+1];
2148   pPager->zStmtJrnl = &pPager->zJournal[nPathname+10];
2149   pPager->pVfs = pVfs;
2150   memcpy(pPager->zFilename, zPathname, nPathname+1);
2151   sqlite3_free(zPathname);
2152 
2153   /* Open the pager file.
2154   */
2155   if( zFilename && zFilename[0] && !memDb ){
2156     if( nPathname>(pVfs->mxPathname - sizeof("-journal")) ){
2157       rc = SQLITE_CANTOPEN;
2158     }else{
2159       int fout = 0;
2160       rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd,
2161                          pPager->vfsFlags, &fout);
2162       readOnly = (fout&SQLITE_OPEN_READONLY);
2163 
2164       /* If the file was successfully opened for read/write access,
2165       ** choose a default page size in case we have to create the
2166       ** database file. The default page size is the maximum of:
2167       **
2168       **    + SQLITE_DEFAULT_PAGE_SIZE,
2169       **    + The value returned by sqlite3OsSectorSize()
2170       **    + The largest page size that can be written atomically.
2171       */
2172       if( rc==SQLITE_OK && !readOnly ){
2173         int iSectorSize = sqlite3OsSectorSize(pPager->fd);
2174         if( nDefaultPage<iSectorSize ){
2175           nDefaultPage = iSectorSize;
2176         }
2177 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
2178         {
2179           int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2180           int ii;
2181           assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
2182           assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
2183           assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);
2184           for(ii=nDefaultPage; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){
2185             if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ) nDefaultPage = ii;
2186           }
2187         }
2188 #endif
2189         if( nDefaultPage>SQLITE_MAX_DEFAULT_PAGE_SIZE ){
2190           nDefaultPage = SQLITE_MAX_DEFAULT_PAGE_SIZE;
2191         }
2192       }
2193     }
2194   }else if( !memDb ){
2195     /* If a temporary file is requested, it is not opened immediately.
2196     ** In this case we accept the default page size and delay actually
2197     ** opening the file until the first call to OsWrite().
2198     */
2199     tempFile = 1;
2200     pPager->state = PAGER_EXCLUSIVE;
2201   }
2202 
2203   if( pPager && rc==SQLITE_OK ){
2204     pPager->pTmpSpace = (char *)sqlite3_malloc(nDefaultPage);
2205   }
2206 
2207   /* If an error occured in either of the blocks above.
2208   ** Free the Pager structure and close the file.
2209   ** Since the pager is not allocated there is no need to set
2210   ** any Pager.errMask variables.
2211   */
2212   if( !pPager || !pPager->pTmpSpace ){
2213     sqlite3OsClose(pPager->fd);
2214     sqlite3_free(pPager);
2215     return ((rc==SQLITE_OK)?SQLITE_NOMEM:rc);
2216   }
2217 
2218   PAGERTRACE3("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename);
2219   IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))
2220 
2221   /* Fill in Pager.zDirectory[] */
2222   memcpy(pPager->zDirectory, pPager->zFilename, nPathname+1);
2223   for(i=strlen(pPager->zDirectory); i>0 && pPager->zDirectory[i-1]!='/'; i--){}
2224   if( i>0 ) pPager->zDirectory[i-1] = 0;
2225 
2226   /* Fill in Pager.zJournal[] and Pager.zStmtJrnl[] */
2227   memcpy(pPager->zJournal, pPager->zFilename, nPathname);
2228   memcpy(&pPager->zJournal[nPathname], "-journal", 9);
2229   memcpy(pPager->zStmtJrnl, pPager->zFilename, nPathname);
2230   memcpy(&pPager->zStmtJrnl[nPathname], "-stmtjrnl", 10);
2231 
2232   /* pPager->journalOpen = 0; */
2233   pPager->useJournal = useJournal && !memDb;
2234   pPager->noReadlock = noReadlock && readOnly;
2235   /* pPager->stmtOpen = 0; */
2236   /* pPager->stmtInUse = 0; */
2237   /* pPager->nRef = 0; */
2238   pPager->dbSize = memDb-1;
2239   pPager->pageSize = nDefaultPage;
2240   /* pPager->stmtSize = 0; */
2241   /* pPager->stmtJSize = 0; */
2242   /* pPager->nPage = 0; */
2243   pPager->mxPage = 100;
2244   pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;
2245   /* pPager->state = PAGER_UNLOCK; */
2246   assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) );
2247   /* pPager->errMask = 0; */
2248   pPager->tempFile = tempFile;
2249   assert( tempFile==PAGER_LOCKINGMODE_NORMAL
2250           || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );
2251   assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );
2252   pPager->exclusiveMode = tempFile;
2253   pPager->memDb = memDb;
2254   pPager->readOnly = readOnly;
2255   /* pPager->needSync = 0; */
2256   pPager->noSync = pPager->tempFile || !useJournal;
2257   pPager->fullSync = (pPager->noSync?0:1);
2258   pPager->sync_flags = SQLITE_SYNC_NORMAL;
2259   /* pPager->pFirst = 0; */
2260   /* pPager->pFirstSynced = 0; */
2261   /* pPager->pLast = 0; */
2262   pPager->nExtra = FORCE_ALIGNMENT(nExtra);
2263   assert(pPager->fd->pMethods||memDb||tempFile);
2264   if( !memDb ){
2265     setSectorSize(pPager);
2266   }
2267   /* pPager->pBusyHandler = 0; */
2268   /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */
2269   *ppPager = pPager;
2270 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
2271   pPager->iInUseMM = 0;
2272   pPager->iInUseDB = 0;
2273   if( !memDb ){
2274     sqlite3_mutex *mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
2275     sqlite3_mutex_enter(mutex);
2276     pPager->pNext = sqlite3PagerList;
2277     if( sqlite3PagerList ){
2278       assert( sqlite3PagerList->pPrev==0 );
2279       sqlite3PagerList->pPrev = pPager;
2280     }
2281     pPager->pPrev = 0;
2282     sqlite3PagerList = pPager;
2283     sqlite3_mutex_leave(mutex);
2284   }
2285 #endif
2286   return SQLITE_OK;
2287 }
2288 
/*
** Set the busy handler function.
**
** The pointer pBusyHandler is stored in Pager.pBusyHandler as-is; no copy
** of the BusyHandler object is made by this routine.
*/
void sqlite3PagerSetBusyhandler(Pager *pPager, BusyHandler *pBusyHandler){
  pPager->pBusyHandler = pBusyHandler;
}
2295 
/*
** Set the destructor for this pager.  If not NULL, the destructor is called
** when the reference count on each page reaches zero.  The destructor can
** be used to clean up information in the extra segment appended to each page.
**
** The destructor is not called as a result of sqlite3PagerClose().
** Destructors are only called by sqlite3PagerUnref().
**
** This routine only records xDesc in Pager.xDestructor, replacing any
** destructor set previously.
*/
void sqlite3PagerSetDestructor(Pager *pPager, void (*xDesc)(DbPage*,int)){
  pPager->xDestructor = xDesc;
}
2307 
/*
** Set the reinitializer for this pager.  If not NULL, the reinitializer
** is called when the content of a page in cache is restored to its original
** value as a result of a rollback.  The callback gives higher-level code
** an opportunity to restore the EXTRA section to agree with the restored
** page data.
**
** This routine only records xReinit in Pager.xReiniter, replacing any
** reinitializer set previously.
*/
void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*,int)){
  pPager->xReiniter = xReinit;
}
2318 
2319 /*
2320 ** Set the page size to *pPageSize. If the suggest new page size is
2321 ** inappropriate, then an alternative page size is set to that
2322 ** value before returning.
2323 */
2324 int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){
2325   int rc = SQLITE_OK;
2326   u16 pageSize = *pPageSize;
2327   assert( pageSize==0 || (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );
2328   if( pageSize && pageSize!=pPager->pageSize
2329    && !pPager->memDb && pPager->nRef==0
2330   ){
2331     char *pNew = (char *)sqlite3_malloc(pageSize);
2332     if( !pNew ){
2333       rc = SQLITE_NOMEM;
2334     }else{
2335       pagerEnter(pPager);
2336       pager_reset(pPager);
2337       pPager->pageSize = pageSize;
2338       setSectorSize(pPager);
2339       sqlite3_free(pPager->pTmpSpace);
2340       pPager->pTmpSpace = pNew;
2341       pagerLeave(pPager);
2342     }
2343   }
2344   *pPageSize = pPager->pageSize;
2345   return rc;
2346 }
2347 
2348 /*
2349 ** Return a pointer to the "temporary page" buffer held internally
2350 ** by the pager.  This is a buffer that is big enough to hold the
2351 ** entire content of a database page.  This buffer is used internally
2352 ** during rollback and will be overwritten whenever a rollback
2353 ** occurs.  But other modules are free to use it too, as long as
2354 ** no rollbacks are happening.
2355 */
2356 void *sqlite3PagerTempSpace(Pager *pPager){
2357   return pPager->pTmpSpace;
2358 }
2359 
2360 /*
2361 ** Attempt to set the maximum database page count if mxPage is positive.
2362 ** Make no changes if mxPage is zero or negative.  And never reduce the
2363 ** maximum page count below the current size of the database.
2364 **
2365 ** Regardless of mxPage, return the current maximum page count.
2366 */
2367 int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){
2368   if( mxPage>0 ){
2369     pPager->mxPgno = mxPage;
2370   }
2371   sqlite3PagerPagecount(pPager);
2372   return pPager->mxPgno;
2373 }
2374 
/*
** Routines that temporarily switch off the simulated I/O error mechanism.
** They bracket sections of code in which simulated errors are of no
** interest.
**
** Without -DSQLITE_TEST=1 both names are empty macros and generate no
** code.  With it, disable_simulated_io_errors() remembers the pending
** error counter and sets it to -1, and enable_simulated_io_errors()
** puts the remembered value back.
*/
#ifndef SQLITE_TEST
# define disable_simulated_io_errors()
# define enable_simulated_io_errors()
#else
extern int sqlite3_io_error_pending;
extern int sqlite3_io_error_hit;
static int saved_cnt;     /* Counter value saved by the last disable call */
void disable_simulated_io_errors(void){
  saved_cnt = sqlite3_io_error_pending;
  sqlite3_io_error_pending = -1;
}
void enable_simulated_io_errors(void){
  sqlite3_io_error_pending = saved_cnt;
}
#endif
2398 
2399 /*
2400 ** Read the first N bytes from the beginning of the file into memory
2401 ** that pDest points to.
2402 **
2403 ** No error checking is done. The rational for this is that this function
2404 ** may be called even if the file does not exist or contain a header. In
2405 ** these cases sqlite3OsRead() will return an error, to which the correct
2406 ** response is to zero the memory at pDest and continue.  A real IO error
2407 ** will presumably recur and be picked up later (Todo: Think about this).
2408 */
2409 int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
2410   int rc = SQLITE_OK;
2411   memset(pDest, 0, N);
2412   assert(MEMDB||pPager->fd->pMethods||pPager->tempFile);
2413   if( pPager->fd->pMethods ){
2414     IOTRACE(("DBHDR %p 0 %d\n", pPager, N))
2415     rc = sqlite3OsRead(pPager->fd, pDest, N, 0);
2416     if( rc==SQLITE_IOERR_SHORT_READ ){
2417       rc = SQLITE_OK;
2418     }
2419   }
2420   return rc;
2421 }
2422 
2423 /*
2424 ** Return the total number of pages in the disk file associated with
2425 ** pPager.
2426 **
2427 ** If the PENDING_BYTE lies on the page directly after the end of the
2428 ** file, then consider this page part of the file too. For example, if
2429 ** PENDING_BYTE is byte 4096 (the first byte of page 5) and the size of the
2430 ** file is 4096 bytes, 5 is returned instead of 4.
2431 */
2432 int sqlite3PagerPagecount(Pager *pPager){
2433   i64 n = 0;
2434   int rc;
2435   assert( pPager!=0 );
2436   if( pPager->errCode ){
2437     return 0;
2438   }
2439   if( pPager->dbSize>=0 ){
2440     n = pPager->dbSize;
2441   } else {
2442     assert(pPager->fd->pMethods||pPager->tempFile);
2443     if( (pPager->fd->pMethods)
2444      && (rc = sqlite3OsFileSize(pPager->fd, &n))!=SQLITE_OK ){
2445       pPager->nRef++;
2446       pager_error(pPager, rc);
2447       pPager->nRef--;
2448       return 0;
2449     }
2450     if( n>0 && n<pPager->pageSize ){
2451       n = 1;
2452     }else{
2453       n /= pPager->pageSize;
2454     }
2455     if( pPager->state!=PAGER_UNLOCK ){
2456       pPager->dbSize = n;
2457     }
2458   }
2459   if( n==(PENDING_BYTE/pPager->pageSize) ){
2460     n++;
2461   }
2462   if( n>pPager->mxPgno ){
2463     pPager->mxPgno = n;
2464   }
2465   return n;
2466 }
2467 
2468 
2469 #ifndef SQLITE_OMIT_MEMORYDB
2470 /*
2471 ** Clear a PgHistory block
2472 */
2473 static void clearHistory(PgHistory *pHist){
2474   sqlite3_free(pHist->pOrig);
2475   sqlite3_free(pHist->pStmt);
2476   pHist->pOrig = 0;
2477   pHist->pStmt = 0;
2478 }
2479 #else
2480 #define clearHistory(x)
2481 #endif
2482 
2483 /*
2484 ** Forward declaration
2485 */
2486 static int syncJournal(Pager*);
2487 
2488 /*
2489 ** Unlink pPg from its hash chain. Also set the page number to 0 to indicate
2490 ** that the page is not part of any hash chain. This is required because the
2491 ** sqlite3PagerMovepage() routine can leave a page in the
2492 ** pNextFree/pPrevFree list that is not a part of any hash-chain.
2493 */
2494 static void unlinkHashChain(Pager *pPager, PgHdr *pPg){
2495   if( pPg->pgno==0 ){
2496     assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
2497     return;
2498   }
2499   if( pPg->pNextHash ){
2500     pPg->pNextHash->pPrevHash = pPg->pPrevHash;
2501   }
2502   if( pPg->pPrevHash ){
2503     assert( pPager->aHash[pPg->pgno & (pPager->nHash-1)]!=pPg );
2504     pPg->pPrevHash->pNextHash = pPg->pNextHash;
2505   }else{
2506     int h = pPg->pgno & (pPager->nHash-1);
2507     pPager->aHash[h] = pPg->pNextHash;
2508   }
2509   if( MEMDB ){
2510     clearHistory(PGHDR_TO_HIST(pPg, pPager));
2511   }
2512   pPg->pgno = 0;
2513   pPg->pNextHash = pPg->pPrevHash = 0;
2514 }
2515 
2516 /*
2517 ** Unlink a page from the free list (the list of all pages where nRef==0)
2518 ** and from its hash collision chain.
2519 */
2520 static void unlinkPage(PgHdr *pPg){
2521   Pager *pPager = pPg->pPager;
2522 
2523   /* Unlink from free page list */
2524   lruListRemove(pPg);
2525 
2526   /* Unlink from the pgno hash table */
2527   unlinkHashChain(pPager, pPg);
2528 }
2529 
2530 /*
2531 ** This routine is used to truncate the cache when a database
2532 ** is truncated.  Drop from the cache all pages whose pgno is
2533 ** larger than pPager->dbSize and is unreferenced.
2534 **
2535 ** Referenced pages larger than pPager->dbSize are zeroed.
2536 **
2537 ** Actually, at the point this routine is called, it would be
2538 ** an error to have a referenced page.  But rather than delete
2539 ** that page and guarantee a subsequent segfault, it seems better
2540 ** to zero it and hope that we error out sanely.
2541 */
2542 static void pager_truncate_cache(Pager *pPager){
2543   PgHdr *pPg;
2544   PgHdr **ppPg;
2545   int dbSize = pPager->dbSize;
2546 
2547   ppPg = &pPager->pAll;
2548   while( (pPg = *ppPg)!=0 ){
2549     if( pPg->pgno<=dbSize ){
2550       ppPg = &pPg->pNextAll;
2551     }else if( pPg->nRef>0 ){
2552       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
2553       ppPg = &pPg->pNextAll;
2554     }else{
2555       *ppPg = pPg->pNextAll;
2556       IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
2557       PAGER_INCR(sqlite3_pager_pgfree_count);
2558       unlinkPage(pPg);
2559       makeClean(pPg);
2560       sqlite3_free(pPg);
2561       pPager->nPage--;
2562     }
2563   }
2564 }
2565 
2566 /*
2567 ** Try to obtain a lock on a file.  Invoke the busy callback if the lock
2568 ** is currently not available.  Repeat until the busy callback returns
2569 ** false or until the lock succeeds.
2570 **
2571 ** Return SQLITE_OK on success and an error code if we cannot obtain
2572 ** the lock.
2573 */
2574 static int pager_wait_on_lock(Pager *pPager, int locktype){
2575   int rc;
2576 
2577   /* The OS lock values must be the same as the Pager lock values */
2578   assert( PAGER_SHARED==SHARED_LOCK );
2579   assert( PAGER_RESERVED==RESERVED_LOCK );
2580   assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
2581 
2582   /* If the file is currently unlocked then the size must be unknown */
2583   assert( pPager->state>=PAGER_SHARED || pPager->dbSize<0 || MEMDB );
2584 
2585   if( pPager->state>=locktype ){
2586     rc = SQLITE_OK;
2587   }else{
2588     do {
2589       rc = sqlite3OsLock(pPager->fd, locktype);
2590     }while( rc==SQLITE_BUSY && sqlite3InvokeBusyHandler(pPager->pBusyHandler) );
2591     if( rc==SQLITE_OK ){
2592       pPager->state = locktype;
2593       IOTRACE(("LOCK %p %d\n", pPager, locktype))
2594     }
2595   }
2596   return rc;
2597 }
2598 
2599 /*
2600 ** Truncate the file to the number of pages specified.
2601 */
2602 int sqlite3PagerTruncate(Pager *pPager, Pgno nPage){
2603   int rc;
2604   assert( pPager->state>=PAGER_SHARED || MEMDB );
2605   sqlite3PagerPagecount(pPager);
2606   if( pPager->errCode ){
2607     rc = pPager->errCode;
2608     return rc;
2609   }
2610   if( nPage>=(unsigned)pPager->dbSize ){
2611     return SQLITE_OK;
2612   }
2613   if( MEMDB ){
2614     pPager->dbSize = nPage;
2615     pager_truncate_cache(pPager);
2616     return SQLITE_OK;
2617   }
2618   pagerEnter(pPager);
2619   rc = syncJournal(pPager);
2620   pagerLeave(pPager);
2621   if( rc!=SQLITE_OK ){
2622     return rc;
2623   }
2624 
2625   /* Get an exclusive lock on the database before truncating. */
2626   pagerEnter(pPager);
2627   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2628   pagerLeave(pPager);
2629   if( rc!=SQLITE_OK ){
2630     return rc;
2631   }
2632 
2633   rc = pager_truncate(pPager, nPage);
2634   return rc;
2635 }
2636 
2637 /*
2638 ** Shutdown the page cache.  Free all memory and close all files.
2639 **
2640 ** If a transaction was in progress when this routine is called, that
2641 ** transaction is rolled back.  All outstanding pages are invalidated
2642 ** and their memory is freed.  Any attempt to use a page associated
2643 ** with this page cache after this function returns will likely
2644 ** result in a coredump.
2645 **
2646 ** This function always succeeds. If a transaction is active an attempt
2647 ** is made to roll it back. If an error occurs during the rollback
2648 ** a hot journal may be left in the filesystem but no error is returned
2649 ** to the caller.
2650 */
2651 int sqlite3PagerClose(Pager *pPager){
2652 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
2653   if( !MEMDB ){
2654     sqlite3_mutex *mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
2655     sqlite3_mutex_enter(mutex);
2656     if( pPager->pPrev ){
2657       pPager->pPrev->pNext = pPager->pNext;
2658     }else{
2659       sqlite3PagerList = pPager->pNext;
2660     }
2661     if( pPager->pNext ){
2662       pPager->pNext->pPrev = pPager->pPrev;
2663     }
2664     sqlite3_mutex_leave(mutex);
2665   }
2666 #endif
2667 
2668   disable_simulated_io_errors();
2669   pPager->errCode = 0;
2670   pPager->exclusiveMode = 0;
2671   pager_reset(pPager);
2672   pagerUnlockAndRollback(pPager);
2673   enable_simulated_io_errors();
2674   PAGERTRACE2("CLOSE %d\n", PAGERID(pPager));
2675   IOTRACE(("CLOSE %p\n", pPager))
2676   assert( pPager->errCode || (pPager->journalOpen==0 && pPager->stmtOpen==0) );
2677   if( pPager->journalOpen ){
2678     sqlite3OsClose(pPager->jfd);
2679   }
2680   sqlite3_free(pPager->aInJournal);
2681   if( pPager->stmtOpen ){
2682     sqlite3OsClose(pPager->stfd);
2683   }
2684   sqlite3OsClose(pPager->fd);
2685   /* Temp files are automatically deleted by the OS
2686   ** if( pPager->tempFile ){
2687   **   sqlite3OsDelete(pPager->zFilename);
2688   ** }
2689   */
2690 
2691   sqlite3_free(pPager->aHash);
2692   sqlite3_free(pPager->pTmpSpace);
2693   sqlite3_free(pPager);
2694   return SQLITE_OK;
2695 }
2696 
2697 #if !defined(NDEBUG) || defined(SQLITE_TEST)
2698 /*
2699 ** Return the page number for the given page data.
2700 */
2701 Pgno sqlite3PagerPagenumber(DbPage *p){
2702   return p->pgno;
2703 }
2704 #endif
2705 
2706 /*
2707 ** The page_ref() function increments the reference count for a page.
2708 ** If the page is currently on the freelist (the reference count is zero) then
2709 ** remove it from the freelist.
2710 **
2711 ** For non-test systems, page_ref() is a macro that calls _page_ref()
2712 ** online of the reference count is zero.  For test systems, page_ref()
2713 ** is a real function so that we can set breakpoints and trace it.
2714 */
2715 static void _page_ref(PgHdr *pPg){
2716   if( pPg->nRef==0 ){
2717     /* The page is currently on the freelist.  Remove it. */
2718     lruListRemove(pPg);
2719     pPg->pPager->nRef++;
2720   }
2721   pPg->nRef++;
2722   REFINFO(pPg);
2723 }
2724 #ifdef SQLITE_DEBUG
2725   static void page_ref(PgHdr *pPg){
2726     if( pPg->nRef==0 ){
2727       _page_ref(pPg);
2728     }else{
2729       pPg->nRef++;
2730       REFINFO(pPg);
2731     }
2732   }
2733 #else
2734 # define page_ref(P)   ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
2735 #endif
2736 
2737 /*
2738 ** Increment the reference count for a page.  The input pointer is
2739 ** a reference to the page data.
2740 */
2741 int sqlite3PagerRef(DbPage *pPg){
2742   pagerEnter(pPg->pPager);
2743   page_ref(pPg);
2744   pagerLeave(pPg->pPager);
2745   return SQLITE_OK;
2746 }
2747 
2748 /*
2749 ** Sync the journal.  In other words, make sure all the pages that have
2750 ** been written to the journal have actually reached the surface of the
2751 ** disk.  It is not safe to modify the original database file until after
2752 ** the journal has been synced.  If the original database is modified before
2753 ** the journal is synced and a power failure occurs, the unsynced journal
2754 ** data would be lost and we would be unable to completely rollback the
2755 ** database changes.  Database corruption would occur.
2756 **
2757 ** This routine also updates the nRec field in the header of the journal.
2758 ** (See comments on the pager_playback() routine for additional information.)
2759 ** If the sync mode is FULL, two syncs will occur.  First the whole journal
2760 ** is synced, then the nRec field is updated, then a second sync occurs.
2761 **
2762 ** For temporary databases, we do not care if we are able to rollback
2763 ** after a power failure, so no sync occurs.
2764 **
2765 ** If the IOCAP_SEQUENTIAL flag is set for the persistent media on which
2766 ** the database is stored, then OsSync() is never called on the journal
2767 ** file. In this case all that is required is to update the nRec field in
2768 ** the journal header.
2769 **
2770 ** This routine clears the needSync field of every page current held in
2771 ** memory.
2772 */
2773 static int syncJournal(Pager *pPager){
2774   PgHdr *pPg;
2775   int rc = SQLITE_OK;
2776 
2777 
2778   /* Sync the journal before modifying the main database
2779   ** (assuming there is a journal and it needs to be synced.)
2780   */
2781   if( pPager->needSync ){
2782     if( !pPager->tempFile ){
2783       int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
2784       assert( pPager->journalOpen );
2785 
2786       /* assert( !pPager->noSync ); // noSync might be set if synchronous
2787       ** was turned off after the transaction was started.  Ticket #615 */
2788 #ifndef NDEBUG
2789       {
2790         /* Make sure the pPager->nRec counter we are keeping agrees
2791         ** with the nRec computed from the size of the journal file.
2792         */
2793         i64 jSz;
2794         rc = sqlite3OsFileSize(pPager->jfd, &jSz);
2795         if( rc!=0 ) return rc;
2796         assert( pPager->journalOff==jSz );
2797       }
2798 #endif
2799       if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
2800         /* Write the nRec value into the journal file header. If in
2801         ** full-synchronous mode, sync the journal first. This ensures that
2802         ** all data has really hit the disk before nRec is updated to mark
2803         ** it as a candidate for rollback.
2804         **
2805         ** This is not required if the persistent media supports the
2806         ** SAFE_APPEND property. Because in this case it is not possible
2807         ** for garbage data to be appended to the file, the nRec field
2808         ** is populated with 0xFFFFFFFF when the journal header is written
2809         ** and never needs to be updated.
2810         */
2811         i64 jrnlOff;
2812         if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2813           PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
2814           IOTRACE(("JSYNC %p\n", pPager))
2815           rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags);
2816           if( rc!=0 ) return rc;
2817         }
2818 
2819         jrnlOff = pPager->journalHdr + sizeof(aJournalMagic);
2820         IOTRACE(("JHDR %p %lld %d\n", pPager, jrnlOff, 4));
2821         rc = write32bits(pPager->jfd, jrnlOff, pPager->nRec);
2822         if( rc ) return rc;
2823       }
2824       if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
2825         PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
2826         IOTRACE(("JSYNC %p\n", pPager))
2827         rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags|
2828           (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)
2829         );
2830         if( rc!=0 ) return rc;
2831       }
2832       pPager->journalStarted = 1;
2833     }
2834     pPager->needSync = 0;
2835 
2836     /* Erase the needSync flag from every page.
2837     */
2838     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2839       pPg->needSync = 0;
2840     }
2841     lruListSetFirstSynced(pPager);
2842   }
2843 
2844 #ifndef NDEBUG
2845   /* If the Pager.needSync flag is clear then the PgHdr.needSync
2846   ** flag must also be clear for all pages.  Verify that this
2847   ** invariant is true.
2848   */
2849   else{
2850     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2851       assert( pPg->needSync==0 );
2852     }
2853     assert( pPager->lru.pFirstSynced==pPager->lru.pFirst );
2854   }
2855 #endif
2856 
2857   return rc;
2858 }
2859 
2860 /*
2861 ** Merge two lists of pages connected by pDirty and in pgno order.
2862 ** Do not both fixing the pPrevDirty pointers.
2863 */
2864 static PgHdr *merge_pagelist(PgHdr *pA, PgHdr *pB){
2865   PgHdr result, *pTail;
2866   pTail = &result;
2867   while( pA && pB ){
2868     if( pA->pgno<pB->pgno ){
2869       pTail->pDirty = pA;
2870       pTail = pA;
2871       pA = pA->pDirty;
2872     }else{
2873       pTail->pDirty = pB;
2874       pTail = pB;
2875       pB = pB->pDirty;
2876     }
2877   }
2878   if( pA ){
2879     pTail->pDirty = pA;
2880   }else if( pB ){
2881     pTail->pDirty = pB;
2882   }else{
2883     pTail->pDirty = 0;
2884   }
2885   return result.pDirty;
2886 }
2887 
2888 /*
2889 ** Sort the list of pages in accending order by pgno.  Pages are
2890 ** connected by pDirty pointers.  The pPrevDirty pointers are
2891 ** corrupted by this sort.
2892 */
2893 #define N_SORT_BUCKET_ALLOC 25
2894 #define N_SORT_BUCKET       25
2895 #ifdef SQLITE_TEST
2896   int sqlite3_pager_n_sort_bucket = 0;
2897   #undef N_SORT_BUCKET
2898   #define N_SORT_BUCKET \
2899    (sqlite3_pager_n_sort_bucket?sqlite3_pager_n_sort_bucket:N_SORT_BUCKET_ALLOC)
2900 #endif
2901 static PgHdr *sort_pagelist(PgHdr *pIn){
2902   PgHdr *a[N_SORT_BUCKET_ALLOC], *p;
2903   int i;
2904   memset(a, 0, sizeof(a));
2905   while( pIn ){
2906     p = pIn;
2907     pIn = p->pDirty;
2908     p->pDirty = 0;
2909     for(i=0; i<N_SORT_BUCKET-1; i++){
2910       if( a[i]==0 ){
2911         a[i] = p;
2912         break;
2913       }else{
2914         p = merge_pagelist(a[i], p);
2915         a[i] = 0;
2916       }
2917     }
2918     if( i==N_SORT_BUCKET-1 ){
2919       /* Coverage: To get here, there need to be 2^(N_SORT_BUCKET)
2920       ** elements in the input list. This is possible, but impractical.
2921       ** Testing this line is the point of global variable
2922       ** sqlite3_pager_n_sort_bucket.
2923       */
2924       a[i] = merge_pagelist(a[i], p);
2925     }
2926   }
2927   p = a[0];
2928   for(i=1; i<N_SORT_BUCKET; i++){
2929     p = merge_pagelist(p, a[i]);
2930   }
2931   return p;
2932 }
2933 
2934 /*
2935 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
2936 ** every one of those pages out to the database file and mark them all
2937 ** as clean.
2938 */
2939 static int pager_write_pagelist(PgHdr *pList){
2940   Pager *pPager;
2941   PgHdr *p;
2942   int rc;
2943 
2944   if( pList==0 ) return SQLITE_OK;
2945   pPager = pList->pPager;
2946 
2947   /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
2948   ** database file. If there is already an EXCLUSIVE lock, the following
2949   ** calls to sqlite3OsLock() are no-ops.
2950   **
2951   ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
2952   ** through an intermediate state PENDING.   A PENDING lock prevents new
2953   ** readers from attaching to the database but is unsufficient for us to
2954   ** write.  The idea of a PENDING lock is to prevent new readers from
2955   ** coming in while we wait for existing readers to clear.
2956   **
2957   ** While the pager is in the RESERVED state, the original database file
2958   ** is unchanged and we can rollback without having to playback the
2959   ** journal into the original database file.  Once we transition to
2960   ** EXCLUSIVE, it means the database file has been changed and any rollback
2961   ** will require a journal playback.
2962   */
2963   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
2964   if( rc!=SQLITE_OK ){
2965     return rc;
2966   }
2967 
2968   pList = sort_pagelist(pList);
2969   for(p=pList; p; p=p->pDirty){
2970     assert( p->dirty );
2971     p->dirty = 0;
2972   }
2973   while( pList ){
2974 
2975     /* If the file has not yet been opened, open it now. */
2976     if( !pPager->fd->pMethods ){
2977       assert(pPager->tempFile);
2978       rc = sqlite3PagerOpentemp(pPager->pVfs, pPager->fd, pPager->zFilename,
2979                                 pPager->vfsFlags);
2980       if( rc ) return rc;
2981     }
2982 
2983     /* If there are dirty pages in the page cache with page numbers greater
2984     ** than Pager.dbSize, this means sqlite3PagerTruncate() was called to
2985     ** make the file smaller (presumably by auto-vacuum code). Do not write
2986     ** any such pages to the file.
2987     */
2988     if( pList->pgno<=pPager->dbSize ){
2989       i64 offset = (pList->pgno-1)*(i64)pPager->pageSize;
2990       char *pData = CODEC2(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
2991       PAGERTRACE4("STORE %d page %d hash(%08x)\n",
2992                    PAGERID(pPager), pList->pgno, pager_pagehash(pList));
2993       IOTRACE(("PGOUT %p %d\n", pPager, pList->pgno));
2994       rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);
2995       PAGER_INCR(sqlite3_pager_writedb_count);
2996       PAGER_INCR(pPager->nWrite);
2997       if( pList->pgno==1 ){
2998         memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
2999       }
3000     }
3001 #ifndef NDEBUG
3002     else{
3003       PAGERTRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
3004     }
3005 #endif
3006     if( rc ) return rc;
3007 #ifdef SQLITE_CHECK_PAGES
3008     pList->pageHash = pager_pagehash(pList);
3009 #endif
3010     pList = pList->pDirty;
3011   }
3012   return SQLITE_OK;
3013 }
3014 
3015 /*
3016 ** Collect every dirty page into a dirty list and
3017 ** return a pointer to the head of that list.  All pages are
3018 ** collected even if they are still in use.
3019 */
3020 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
3021   return pPager->pDirty;
3022 }
3023 
3024 /*
3025 ** Return TRUE if there is a hot journal on the given pager.
3026 ** A hot journal is one that needs to be played back.
3027 **
3028 ** If the current size of the database file is 0 but a journal file
3029 ** exists, that is probably an old journal left over from a prior
3030 ** database with the same name.  Just delete the journal.
3031 */
3032 static int hasHotJournal(Pager *pPager){
3033   sqlite3_vfs *pVfs = pPager->pVfs;
3034   if( !pPager->useJournal ) return 0;
3035   if( !pPager->fd->pMethods ) return 0;
3036   if( !sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS) ){
3037     return 0;
3038   }
3039   if( sqlite3OsCheckReservedLock(pPager->fd) ){
3040     return 0;
3041   }
3042   if( sqlite3PagerPagecount(pPager)==0 ){
3043     sqlite3OsDelete(pVfs, pPager->zJournal, 0);
3044     return 0;
3045   }else{
3046     return 1;
3047   }
3048 }
3049 
3050 /*
3051 ** Try to find a page in the cache that can be recycled.
3052 **
3053 ** This routine may return SQLITE_IOERR, SQLITE_FULL or SQLITE_OK. It
3054 ** does not set the pPager->errCode variable.
3055 */
3056 static int pager_recycle(Pager *pPager, PgHdr **ppPg){
3057   PgHdr *pPg;
3058   *ppPg = 0;
3059 
3060   /* It is illegal to call this function unless the pager object
3061   ** pointed to by pPager has at least one free page (page with nRef==0).
3062   */
3063   assert(!MEMDB);
3064   assert(pPager->lru.pFirst);
3065 
3066   /* Find a page to recycle.  Try to locate a page that does not
3067   ** require us to do an fsync() on the journal.
3068   */
3069   pPg = pPager->lru.pFirstSynced;
3070 
3071   /* If we could not find a page that does not require an fsync()
3072   ** on the journal file then fsync the journal file.  This is a
3073   ** very slow operation, so we work hard to avoid it.  But sometimes
3074   ** it can't be helped.
3075   */
3076   if( pPg==0 && pPager->lru.pFirst){
3077     int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
3078     int rc = syncJournal(pPager);
3079     if( rc!=0 ){
3080       return rc;
3081     }
3082     if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
3083       /* If in full-sync mode, write a new journal header into the
3084       ** journal file. This is done to avoid ever modifying a journal
3085       ** header that is involved in the rollback of pages that have
3086       ** already been written to the database (in case the header is
3087       ** trashed when the nRec field is updated).
3088       */
3089       pPager->nRec = 0;
3090       assert( pPager->journalOff > 0 );
3091       assert( pPager->doNotSync==0 );
3092       rc = writeJournalHdr(pPager);
3093       if( rc!=0 ){
3094         return rc;
3095       }
3096     }
3097     pPg = pPager->lru.pFirst;
3098   }
3099 
3100   assert( pPg->nRef==0 );
3101 
3102   /* Write the page to the database file if it is dirty.
3103   */
3104   if( pPg->dirty ){
3105     int rc;
3106     assert( pPg->needSync==0 );
3107     makeClean(pPg);
3108     pPg->dirty = 1;
3109     pPg->pDirty = 0;
3110     rc = pager_write_pagelist( pPg );
3111     pPg->dirty = 0;
3112     if( rc!=SQLITE_OK ){
3113       return rc;
3114     }
3115   }
3116   assert( pPg->dirty==0 );
3117 
3118   /* If the page we are recycling is marked as alwaysRollback, then
3119   ** set the global alwaysRollback flag, thus disabling the
3120   ** sqlite3PagerDontRollback() optimization for the rest of this transaction.
3121   ** It is necessary to do this because the page marked alwaysRollback
3122   ** might be reloaded at a later time but at that point we won't remember
3123   ** that is was marked alwaysRollback.  This means that all pages must
3124   ** be marked as alwaysRollback from here on out.
3125   */
3126   if( pPg->alwaysRollback ){
3127     IOTRACE(("ALWAYS_ROLLBACK %p\n", pPager))
3128     pPager->alwaysRollback = 1;
3129   }
3130 
3131   /* Unlink the old page from the free list and the hash table
3132   */
3133   unlinkPage(pPg);
3134   assert( pPg->pgno==0 );
3135 
3136   *ppPg = pPg;
3137   return SQLITE_OK;
3138 }
3139 
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** Free superfluous dynamically allocated memory held by the pager
** system.  Memory in use by any SQLite pager allocated by the current
** thread may be sqlite3_free()ed.
**
** nReq is the number of bytes of memory required.  Once this much has
** been released the function returns.  A negative nReq means "release as
** much as possible".  The return value is the total number of bytes of
** memory released.
**
** Errors hit while recycling a page are not returned to the caller.
** They are recorded on the affected pager through pager_error() and
** stop the release loop.
*/
int sqlite3PagerReleaseMemory(int nReq){
  int nReleased = 0;          /* Bytes of memory released so far */
  sqlite3_mutex *mutex;       /* The MEM2 mutex */
  Pager *pPager;              /* For looping over pagers */
  BusyHandler *savedBusy;     /* Saved copy of the busy handler */
  int rc = SQLITE_OK;

  /* Acquire the memory-management mutex
  */
  mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_MEM2);
  sqlite3_mutex_enter(mutex);

  /* Tell all database connections that memory management wants
  ** access to the pagers.
  */
  for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
     pPager->iInUseMM = 1;
  }

  while( rc==SQLITE_OK && (nReq<0 || nReleased<nReq) ){
    PgHdr *pPg;               /* Candidate page */
    PgHdr *pRecycled;         /* Page handed back by pager_recycle() */
    PgHdr **ppLink;           /* For unlinking pPg from the pAll list */

    /* Look for a page that can be recycled without a sync().  If there
    ** is none, settle for one that does require a sync().  Pages of
    ** pagers that are in use by a database connection are skipped.
    */
    sqlite3_mutex_enter(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));
    for(pPg=sqlite3LruPageList.pFirstSynced;
        pPg && (pPg->needSync || pPg->pPager->iInUseDB);
        pPg=pPg->gfree.pNext
    ){}
    if( !pPg ){
      for(pPg=sqlite3LruPageList.pFirst;
          pPg && pPg->pPager->iInUseDB;
          pPg=pPg->gfree.pNext
      ){}
    }
    sqlite3_mutex_leave(sqlite3_mutex_alloc(SQLITE_MUTEX_STATIC_LRU));

    /* No candidate at all: no further memory can be released. */
    if( !pPg ) break;

    pPager = pPg->pPager;
    assert(!pPg->needSync || pPg==pPager->lru.pFirst);
    assert(pPg->needSync || pPg==pPager->lru.pFirstSynced);

    /* Recycle with the busy handler disabled. */
    savedBusy = pPager->pBusyHandler;
    pPager->pBusyHandler = 0;
    rc = pager_recycle(pPager, &pRecycled);
    pPager->pBusyHandler = savedBusy;
    assert(pRecycled==pPg || rc!=SQLITE_OK);

    if( rc!=SQLITE_OK ){
      /* An error occurred whilst writing to the database file or
      ** journal in pager_recycle().  The error is not returned to the
      ** caller of this function.  Instead, set the Pager.errCode
      ** variable.  The error will be returned to the user (or users, in
      ** the case of a shared pager cache) of the pager for which the
      ** error occurred.
      */
      assert(
          (rc&0xff)==SQLITE_IOERR ||
          rc==SQLITE_FULL ||
          rc==SQLITE_BUSY
      );
      assert( pPager->state>=PAGER_RESERVED );
      pager_error(pPager, rc);
      break;
    }

    /* A page to free has been found.  At this point it has been removed
    ** from the page hash-table, free-list and synced-list
    ** (pFirstSynced).  It is still in the all pages (pAll) list and has
    ** to be removed from that list before it is freed.
    **
    ** Todo: Check the Pager.pStmt list to make sure this is Ok.  It
    ** probably is though.
    */
    assert( pPg );
    for(ppLink=&pPager->pAll; *ppLink!=pPg; ppLink=&(*ppLink)->pNextAll){}
    *ppLink = pPg->pNextAll;

    nReleased += (
        sizeof(*pPg) + pPager->pageSize
        + sizeof(u32) + pPager->nExtra
        + MEMDB*sizeof(PgHistory)
    );
    IOTRACE(("PGFREE %p %d *\n", pPager, pPg->pgno));
    PAGER_INCR(sqlite3_pager_pgfree_count);
    sqlite3_free(pPg);
    pPager->nPage--;
  }

  /* Clear the memory management flags and release the mutex
  */
  for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
     pPager->iInUseMM = 0;
  }
  sqlite3_mutex_leave(mutex);

  /* Return the number of bytes released
  */
  return nReleased;
}
#endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
3259 
3260 /*
3261 ** Read the content of page pPg out of the database file.
3262 */
3263 static int readDbPage(Pager *pPager, PgHdr *pPg, Pgno pgno){
3264   int rc;
3265   i64 offset;
3266   assert( MEMDB==0 );
3267   assert(pPager->fd->pMethods||pPager->tempFile);
3268   if( !pPager->fd->pMethods ){
3269     return SQLITE_IOERR_SHORT_READ;
3270   }
3271   offset = (pgno-1)*(i64)pPager->pageSize;
3272   rc = sqlite3OsRead(pPager->fd, PGHDR_TO_DATA(pPg), pPager->pageSize, offset);
3273   PAGER_INCR(sqlite3_pager_readdb_count);
3274   PAGER_INCR(pPager->nRead);
3275   IOTRACE(("PGIN %p %d\n", pPager, pgno));
3276   if( pgno==1 ){
3277     memcpy(&pPager->dbFileVers, &((u8*)PGHDR_TO_DATA(pPg))[24],
3278                                               sizeof(pPager->dbFileVers));
3279   }
3280   CODEC1(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
3281   PAGERTRACE4("FETCH %d page %d hash(%08x)\n",
3282                PAGERID(pPager), pPg->pgno, pager_pagehash(pPg));
3283   return rc;
3284 }
3285 
3286 
/*
** This function is called to obtain the shared lock required before
** data may be read from the pager cache. If the shared lock has already
** been obtained, this function is a no-op.
**
** Immediately after obtaining the shared lock (if required), this function
** checks for a hot-journal file. If one is found, an emergency rollback
** is performed immediately.
**
** Finally, if pages are already cached (pPager->pAll is not NULL), the
** version bytes stored at offset 24 of the database file are compared
** with the copy held in pPager->dbFileVers and the cache is discarded
** with pager_reset() if they differ.
**
** Return values:
**
**   *  SQLITE_OK on success.
**   *  pPager->errCode if the pager is in an error state other than
**      SQLITE_FULL, or if sqlite3PagerPagecount() leaves an error code
**      behind.
**   *  The value of pager_error(pPager, rc) if a lock cannot be obtained
**      or if the journal playback fails.
**   *  SQLITE_BUSY (or the original SQLITE_NOMEM / SQLITE_IOERR_NOMEM
**      code) if the hot journal cannot be opened.
**   *  The raw sqlite3OsRead() code if the version bytes cannot be read.
*/
static int pagerSharedLock(Pager *pPager){
  int rc = SQLITE_OK;
  int isHot = 0;      /* True to treat an already-open journal as hot */

  /* If this database is opened for exclusive access, has no outstanding
  ** page references and is in an error-state, now is the chance to clear
  ** the error. Discard the contents of the pager-cache and treat any
  ** open journal file as a hot-journal.
  */
  if( !MEMDB && pPager->exclusiveMode && pPager->nRef==0 && pPager->errCode ){
    if( pPager->journalOpen ){
      isHot = 1;
    }
    pager_reset(pPager);
    pPager->errCode = SQLITE_OK;
  }

  /* If the pager is still in an error state, do not proceed. The error
  ** state will be cleared at some point in the future when all page
  ** references are dropped and the cache can be discarded.
  ** SQLITE_FULL is deliberately excluded from this test.
  */
  if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
    return pPager->errCode;
  }

  if( pPager->state==PAGER_UNLOCK || isHot ){
    sqlite3_vfs *pVfs = pPager->pVfs;
    if( !MEMDB ){
      assert( pPager->nRef==0 );
      /* No file lock is requested when the noReadlock flag is set */
      if( !pPager->noReadlock ){
        rc = pager_wait_on_lock(pPager, SHARED_LOCK);
        if( rc!=SQLITE_OK ){
          return pager_error(pPager, rc);
        }
        assert( pPager->state>=SHARED_LOCK );
      }

      /* If a journal file exists, and there is no RESERVED lock on the
      ** database file, then it either needs to be played back or deleted.
      */
      if( hasHotJournal(pPager) || isHot ){
        /* Get an EXCLUSIVE lock on the database file. At this point it is
        ** important that a RESERVED lock is not obtained on the way to the
        ** EXCLUSIVE lock. If it were, another process might open the
        ** database file, detect the RESERVED lock, and conclude that the
        ** database is safe to read while this process is still rolling it
        ** back.
        **
        ** Because the intermediate RESERVED lock is not requested, the
        ** second process will get to this point in the code and fail to
        ** obtain its own EXCLUSIVE lock on the database file.
        */
        if( pPager->state<EXCLUSIVE_LOCK ){
          rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
          if( rc!=SQLITE_OK ){
            pager_unlock(pPager);
            return pager_error(pPager, rc);
          }
          pPager->state = PAGER_EXCLUSIVE;
        }

        /* Open the journal file.  Return SQLITE_BUSY if we are unable
        ** to open it, or if it no longer exists by the time it is
        ** checked for here.
        **
        ** The journal file does not need to be locked itself.  The
        ** journal file is never open unless the main database file holds
        ** a write lock, so there is never any chance of two or more
        ** processes opening the journal at the same time.
        **
        ** The journal is opened for read/write access rather than
        ** read-only. This is because in exclusive-access mode the file
        ** descriptor will be kept open and possibly used for a
        ** transaction later on. On some systems, the OsTruncate() call
        ** used in exclusive-access mode also requires a read/write file
        ** handle.  If the VFS reports that it could only open the file
        ** read-only, the handle is closed and SQLITE_BUSY is used.
        **
        ** When isHot is set the journal is already open, so this step
        ** is skipped.
        */
        if( !isHot ){
          rc = SQLITE_BUSY;
          if( sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS) ){
            int fout = 0;
            int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL;
            assert( !pPager->tempFile );
            rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);
            assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
            if( fout&SQLITE_OPEN_READONLY ){
              rc = SQLITE_BUSY;
              sqlite3OsClose(pPager->jfd);
            }
          }
        }
        if( rc!=SQLITE_OK ){
          /* Only out-of-memory codes are passed through unchanged; every
          ** other failure is reported to the caller as SQLITE_BUSY. */
          pager_unlock(pPager);
          return ((rc==SQLITE_NOMEM||rc==SQLITE_IOERR_NOMEM)?rc:SQLITE_BUSY);
        }
        pPager->journalOpen = 1;
        pPager->journalStarted = 0;
        pPager->journalOff = 0;
        pPager->setMaster = 0;
        pPager->journalHdr = 0;

        /* Playback and delete the journal.  Drop the database write
        ** lock and reacquire the read lock.
        */
        rc = pager_playback(pPager, 1);
        if( rc!=SQLITE_OK ){
          return pager_error(pPager, rc);
        }
        assert(pPager->state==PAGER_SHARED ||
            (pPager->exclusiveMode && pPager->state>PAGER_SHARED)
        );
      }

      if( pPager->pAll ){
        /* The shared-lock has just been acquired on the database file
        ** and there are already pages in the cache (from a previous
        ** read or write transaction).  Check to see if the database
        ** has been modified.  If the database has changed, flush the
        ** cache.
        **
        ** Database changes are detected by looking at the
        ** sizeof(pPager->dbFileVers) bytes beginning at offset 24 into
        ** the file.  The first 4 of these bytes are a 32-bit counter
        ** that is incremented with each change.  The other bytes change
        ** randomly with each file change when a codec is in use.
        **
        ** There is a vanishingly small chance that a change will not be
        ** detected.  The chance of an undetected change is so small that
        ** it can be neglected.
        */
        char dbFileVers[sizeof(pPager->dbFileVers)];
        sqlite3PagerPagecount(pPager);

        if( pPager->errCode ){
          return pPager->errCode;
        }

        if( pPager->dbSize>0 ){
          IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));
          rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);
          if( rc!=SQLITE_OK ){
            return rc;
          }
        }else{
          /* An empty database is compared against all-zero version bytes */
          memset(dbFileVers, 0, sizeof(dbFileVers));
        }

        if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){
          pager_reset(pPager);
        }
      }
    }
    assert( pPager->exclusiveMode || pPager->state<=PAGER_SHARED );
    /* The state is still PAGER_UNLOCK here if no file lock was requested
    ** above (in-memory database or noReadlock set). */
    if( pPager->state==PAGER_UNLOCK ){
      pPager->state = PAGER_SHARED;
    }
  }

  return rc;
}
3454 
3455 /*
3456 ** Allocate a PgHdr object.   Either create a new one or reuse
3457 ** an existing one that is not otherwise in use.
3458 **
3459 ** A new PgHdr structure is created if any of the following are
3460 ** true:
3461 **
3462 **     (1)  We have not exceeded our maximum allocated cache size
3463 **          as set by the "PRAGMA cache_size" command.
3464 **
3465 **     (2)  There are no unused PgHdr objects available at this time.
3466 **
3467 **     (3)  This is an in-memory database.
3468 **
3469 **     (4)  There are no PgHdr objects that do not require a journal
3470 **          file sync and a sync of the journal file is currently
3471 **          prohibited.
3472 **
3473 ** Otherwise, reuse an existing PgHdr.  In other words, reuse an
3474 ** existing PgHdr if all of the following are true:
3475 **
3476 **     (1)  We have reached or exceeded the maximum cache size
3477 **          allowed by "PRAGMA cache_size".
3478 **
3479 **     (2)  There is a PgHdr available with PgHdr->nRef==0
3480 **
3481 **     (3)  We are not in an in-memory database
3482 **
3483 **     (4)  Either there is an available PgHdr that does not need
3484 **          to be synced to disk or else disk syncing is currently
3485 **          allowed.
3486 */
3487 static int pagerAllocatePage(Pager *pPager, PgHdr **ppPg){
3488   int rc = SQLITE_OK;
3489   PgHdr *pPg;
3490   int nByteHdr;
3491 
3492   /* Create a new PgHdr if any of the four conditions defined
3493   ** above are met: */
3494   if( pPager->nPage<pPager->mxPage
3495    || pPager->lru.pFirst==0
3496    || MEMDB
3497    || (pPager->lru.pFirstSynced==0 && pPager->doNotSync)
3498   ){
3499     if( pPager->nPage>=pPager->nHash ){
3500       pager_resize_hash_table(pPager,
3501          pPager->nHash<256 ? 256 : pPager->nHash*2);
3502       if( pPager->nHash==0 ){
3503         rc = SQLITE_NOMEM;
3504         goto pager_allocate_out;
3505       }
3506     }
3507     pagerLeave(pPager);
3508     nByteHdr = sizeof(*pPg) + sizeof(u32) + pPager->nExtra
3509               + MEMDB*sizeof(PgHistory);
3510     pPg = sqlite3_malloc( nByteHdr + pPager->pageSize );
3511     pagerEnter(pPager);
3512     if( pPg==0 ){
3513       rc = SQLITE_NOMEM;
3514       goto pager_allocate_out;
3515     }
3516     memset(pPg, 0, nByteHdr);
3517     pPg->pData = (void*)(nByteHdr + (char*)pPg);
3518     pPg->pPager = pPager;
3519     pPg->pNextAll = pPager->pAll;
3520     pPager->pAll = pPg;
3521     pPager->nPage++;
3522   }else{
3523     /* Recycle an existing page with a zero ref-count. */
3524     rc = pager_recycle(pPager, &pPg);
3525     if( rc==SQLITE_BUSY ){
3526       rc = SQLITE_IOERR_BLOCKED;
3527     }
3528     if( rc!=SQLITE_OK ){
3529       goto pager_allocate_out;
3530     }
3531     assert( pPager->state>=SHARED_LOCK );
3532     assert(pPg);
3533   }
3534   *ppPg = pPg;
3535 
3536 pager_allocate_out:
3537   return rc;
3538 }
3539 
3540 /*
3541 ** Make sure we have the content for a page.  If the page was
3542 ** previously acquired with noContent==1, then the content was
3543 ** just initialized to zeros instead of being read from disk.
3544 ** But now we need the real data off of disk.  So make sure we
3545 ** have it.  Read it in if we do not have it already.
3546 */
3547 static int pager_get_content(PgHdr *pPg){
3548   if( pPg->needRead ){
3549     int rc = readDbPage(pPg->pPager, pPg, pPg->pgno);
3550     if( rc==SQLITE_OK ){
3551       pPg->needRead = 0;
3552     }else{
3553       return rc;
3554     }
3555   }
3556   return SQLITE_OK;
3557 }
3558 
3559 /*
3560 ** Acquire a page.
3561 **
3562 ** A read lock on the disk file is obtained when the first page is acquired.
3563 ** This read lock is dropped when the last page is released.
3564 **
3565 ** This routine works for any page number greater than 0.  If the database
3566 ** file is smaller than the requested page, then no actual disk
3567 ** read occurs and the memory image of the page is initialized to
3568 ** all zeros.  The extra data appended to a page is always initialized
3569 ** to zeros the first time a page is loaded into memory.
3570 **
3571 ** The acquisition might fail for several reasons.  In all cases,
3572 ** an appropriate error code is returned and *ppPage is set to NULL.
3573 **
3574 ** See also sqlite3PagerLookup().  Both this routine and Lookup() attempt
3575 ** to find a page in the in-memory cache first.  If the page is not already
3576 ** in memory, this routine goes to disk to read it in whereas Lookup()
3577 ** just returns 0.  This routine acquires a read-lock the first time it
3578 ** has to go to disk, and could also playback an old journal if necessary.
3579 ** Since Lookup() never goes to disk, it never has to deal with locks
3580 ** or journal files.
3581 **
3582 ** If noContent is false, the page contents are actually read from disk.
3583 ** If noContent is true, it means that we do not care about the contents
3584 ** of the page at this time, so do not do a disk read.  Just fill in the
3585 ** page content with zeros.  But mark the fact that we have not read the
3586 ** content by setting the PgHdr.needRead flag.  Later on, if
3587 ** sqlite3PagerWrite() is called on this page or if this routine is
3588 ** called again with noContent==0, that means that the content is needed
3589 ** and the disk read should occur at that point.
3590 */
3591 static int pagerAcquire(
3592   Pager *pPager,      /* The pager open on the database file */
3593   Pgno pgno,          /* Page number to fetch */
3594   DbPage **ppPage,    /* Write a pointer to the page here */
3595   int noContent       /* Do not bother reading content from disk if true */
3596 ){
3597   PgHdr *pPg;
3598   int rc;
3599 
3600   assert( pPager->state==PAGER_UNLOCK || pPager->nRef>0 || pgno==1 );
3601 
3602   /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
3603   ** number greater than this, or zero, is requested.
3604   */
3605   if( pgno>PAGER_MAX_PGNO || pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
3606     return SQLITE_CORRUPT_BKPT;
3607   }
3608 
3609   /* Make sure we have not hit any critical errors.
3610   */
3611   assert( pPager!=0 );
3612   *ppPage = 0;
3613 
3614   /* If this is the first page accessed, then get a SHARED lock
3615   ** on the database file. pagerSharedLock() is a no-op if
3616   ** a database lock is already held.
3617   */
3618   rc = pagerSharedLock(pPager);
3619   if( rc!=SQLITE_OK ){
3620     return rc;
3621   }
3622   assert( pPager->state!=PAGER_UNLOCK );
3623 
3624   pPg = pager_lookup(pPager, pgno);
3625   if( pPg==0 ){
3626     /* The requested page is not in the page cache. */
3627     int nMax;
3628     int h;
3629     PAGER_INCR(pPager->nMiss);
3630     rc = pagerAllocatePage(pPager, &pPg);
3631     if( rc!=SQLITE_OK ){
3632       return rc;
3633     }
3634 
3635     pPg->pgno = pgno;
3636     assert( !MEMDB || pgno>pPager->stmtSize );
3637     if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
3638 #if 0
3639       sqlite3CheckMemory(pPager->aInJournal, pgno/8);
3640 #endif
3641       assert( pPager->journalOpen );
3642       pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
3643       pPg->needSync = 0;
3644     }else{
3645       pPg->inJournal = 0;
3646       pPg->needSync = 0;
3647     }
3648 
3649     makeClean(pPg);
3650     pPg->nRef = 1;
3651     REFINFO(pPg);
3652 
3653     pPager->nRef++;
3654     if( pPager->nExtra>0 ){
3655       memset(PGHDR_TO_EXTRA(pPg, pPager), 0, pPager->nExtra);
3656     }
3657     nMax = sqlite3PagerPagecount(pPager);
3658     if( pPager->errCode ){
3659       rc = pPager->errCode;
3660       sqlite3PagerUnref(pPg);
3661       return rc;
3662     }
3663 
3664     /* Populate the page with data, either by reading from the database
3665     ** file, or by setting the entire page to zero.
3666     */
3667     if( nMax<(int)pgno || MEMDB || (noContent && !pPager->alwaysRollback) ){
3668       if( pgno>pPager->mxPgno ){
3669         sqlite3PagerUnref(pPg);
3670         return SQLITE_FULL;
3671       }
3672       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
3673       pPg->needRead = noContent && !pPager->alwaysRollback;
3674       IOTRACE(("ZERO %p %d\n", pPager, pgno));
3675     }else{
3676       rc = readDbPage(pPager, pPg, pgno);
3677       if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){
3678         pPg->pgno = 0;
3679         sqlite3PagerUnref(pPg);
3680         return rc;
3681       }
3682       pPg->needRead = 0;
3683     }
3684 
3685     /* Link the page into the page hash table */
3686     h = pgno & (pPager->nHash-1);
3687     assert( pgno!=0 );
3688     pPg->pNextHash = pPager->aHash[h];
3689     pPager->aHash[h] = pPg;
3690     if( pPg->pNextHash ){
3691       assert( pPg->pNextHash->pPrevHash==0 );
3692       pPg->pNextHash->pPrevHash = pPg;
3693     }
3694 
3695 #ifdef SQLITE_CHECK_PAGES
3696     pPg->pageHash = pager_pagehash(pPg);
3697 #endif
3698   }else{
3699     /* The requested page is in the page cache. */
3700     assert(pPager->nRef>0 || pgno==1);
3701     PAGER_INCR(pPager->nHit);
3702     if( !noContent ){
3703       rc = pager_get_content(pPg);
3704       if( rc ){
3705         return rc;
3706       }
3707     }
3708     page_ref(pPg);
3709   }
3710   *ppPage = pPg;
3711   return SQLITE_OK;
3712 }
3713 int sqlite3PagerAcquire(
3714   Pager *pPager,      /* The pager open on the database file */
3715   Pgno pgno,          /* Page number to fetch */
3716   DbPage **ppPage,    /* Write a pointer to the page here */
3717   int noContent       /* Do not bother reading content from disk if true */
3718 ){
3719   int rc;
3720   pagerEnter(pPager);
3721   rc = pagerAcquire(pPager, pgno, ppPage, noContent);
3722   pagerLeave(pPager);
3723   return rc;
3724 }
3725 
3726 
3727 /*
3728 ** Acquire a page if it is already in the in-memory cache.  Do
3729 ** not read the page from disk.  Return a pointer to the page,
3730 ** or 0 if the page is not in cache.
3731 **
3732 ** See also sqlite3PagerGet().  The difference between this routine
3733 ** and sqlite3PagerGet() is that _get() will go to the disk and read
3734 ** in the page if the page is not already in cache.  This routine
3735 ** returns NULL if the page is not in cache or if a disk I/O error
3736 ** has ever happened.
3737 */
3738 DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){
3739   PgHdr *pPg = 0;
3740 
3741   assert( pPager!=0 );
3742   assert( pgno!=0 );
3743 
3744   pagerEnter(pPager);
3745   if( pPager->state==PAGER_UNLOCK ){
3746     assert( !pPager->pAll || pPager->exclusiveMode );
3747   }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
3748     /* Do nothing */
3749   }else if( (pPg = pager_lookup(pPager, pgno))!=0 ){
3750     page_ref(pPg);
3751   }
3752   pagerLeave(pPager);
3753   return pPg;
3754 }
3755 
3756 /*
3757 ** Release a page.
3758 **
3759 ** If the number of references to the page drop to zero, then the
3760 ** page is added to the LRU list.  When all references to all pages
3761 ** are released, a rollback occurs and the lock on the database is
3762 ** removed.
3763 */
3764 int sqlite3PagerUnref(DbPage *pPg){
3765   Pager *pPager = pPg->pPager;
3766 
3767   /* Decrement the reference count for this page
3768   */
3769   assert( pPg->nRef>0 );
3770   pagerEnter(pPg->pPager);
3771   pPg->nRef--;
3772   REFINFO(pPg);
3773 
3774   CHECK_PAGE(pPg);
3775 
3776   /* When the number of references to a page reach 0, call the
3777   ** destructor and add the page to the freelist.
3778   */
3779   if( pPg->nRef==0 ){
3780 
3781     lruListAdd(pPg);
3782     if( pPager->xDestructor ){
3783       pPager->xDestructor(pPg, pPager->pageSize);
3784     }
3785 
3786     /* When all pages reach the freelist, drop the read lock from
3787     ** the database file.
3788     */
3789     pPager->nRef--;
3790     assert( pPager->nRef>=0 );
3791     if( pPager->nRef==0 && (!pPager->exclusiveMode || pPager->journalOff>0) ){
3792       pagerUnlockAndRollback(pPager);
3793     }
3794   }
3795   pagerLeave(pPager);
3796   return SQLITE_OK;
3797 }
3798 
/*
** Create a journal file for pPager.  There should already be a RESERVED
** or EXCLUSIVE lock on the database file when this routine is called.
**
** Return SQLITE_OK if everything goes well.  Return an error code if
** anything goes wrong:
**
**   *  SQLITE_NOMEM if the aInJournal bitmap cannot be allocated.
**   *  The error from opening the journal file.
**   *  pPager->errCode if the pager is in an error state.
**   *  For a failure of writeJournalHdr() or sqlite3PagerStmtBegin()
**      that is not an out-of-memory error, the transaction is ended
**      with pager_end_transaction() and its result is returned, or
**      SQLITE_FULL if ending the transaction succeeded.
**
** On the first three failures the aInJournal bitmap is released again.
*/
static int pager_open_journal(Pager *pPager){
  sqlite3_vfs *pVfs = pPager->pVfs;
  int flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE);

  int rc;
  assert( !MEMDB );
  assert( pPager->state>=PAGER_RESERVED );
  assert( pPager->journalOpen==0 );
  assert( pPager->useJournal );
  assert( pPager->aInJournal==0 );
  /* Refresh the page count, then allocate the in-journal bitmap: one bit
  ** per database page.  The allocation is made between pagerLeave() and
  ** pagerEnter(). */
  sqlite3PagerPagecount(pPager);
  pagerLeave(pPager);
  pPager->aInJournal = sqlite3MallocZero( pPager->dbSize/8 + 1 );
  pagerEnter(pPager);
  if( pPager->aInJournal==0 ){
    rc = SQLITE_NOMEM;
    goto failed_to_open_journal;
  }

  /* The journal of a temporary database is itself a temporary file that
  ** is deleted when closed. */
  if( pPager->tempFile ){
    flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL);
  }else{
    flags |= (SQLITE_OPEN_MAIN_JOURNAL);
  }
#ifdef SQLITE_ENABLE_ATOMIC_WRITE
  rc = sqlite3JournalOpen(
      pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)
  );
#else
  rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);
#endif
  assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
  pPager->journalOff = 0;
  pPager->setMaster = 0;
  pPager->journalHdr = 0;
  if( rc!=SQLITE_OK ){
    /* After an out-of-memory failure, delete the journal file by name */
    if( rc==SQLITE_NOMEM ){
      sqlite3OsDelete(pVfs, pPager->zJournal, 0);
    }
    goto failed_to_open_journal;
  }
  pPager->journalOpen = 1;
  pPager->journalStarted = 0;
  pPager->needSync = 0;
  pPager->alwaysRollback = 0;
  pPager->nRec = 0;
  if( pPager->errCode ){
    rc = pPager->errCode;
    goto failed_to_open_journal;
  }
  /* Remember the size of the database at the start of the transaction */
  pPager->origDbSize = pPager->dbSize;

  rc = writeJournalHdr(pPager);

  if( pPager->stmtAutoopen && rc==SQLITE_OK ){
    rc = sqlite3PagerStmtBegin(pPager);
  }
  /* Any failure other than out-of-memory ends the transaction.  If
  ** that succeeds, the failure is reported as SQLITE_FULL. */
  if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_NOMEM ){
    rc = pager_end_transaction(pPager);
    if( rc==SQLITE_OK ){
      rc = SQLITE_FULL;
    }
  }
  return rc;

failed_to_open_journal:
  sqlite3_free(pPager->aInJournal);
  pPager->aInJournal = 0;
  return rc;
}
3876 
/*
** Acquire a write-lock on the database.  The lock is removed when
** any of the following happen:
**
**   *  sqlite3PagerCommitPhaseTwo() is called.
**   *  sqlite3PagerRollback() is called.
**   *  sqlite3PagerClose() is called.
**   *  sqlite3PagerUnref() is called on every outstanding page.
**
** The first parameter to this routine is a pointer to any open page of the
** database file.  Nothing changes about the page - it is used merely to
** acquire a pointer to the Pager structure and as proof that there is
** already a read-lock on the database.
**
** A journal file is opened if the pager uses a journal and this is not a
** temporary file.  For temporary files, the opening of the journal file
** is deferred until there is an actual need to write to the journal.
**
** If the database is already reserved for writing, this routine is a
** no-op, with one exception: if a journal file is open but empty
** (journalOff==0), the in-journal bitmap is allocated and a new journal
** header is written.
**
** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file
** immediately instead of waiting until we try to flush the cache.  The
** exFlag is ignored if a transaction is already active.
**
** Returns SQLITE_OK on success.  Otherwise the result is the error from
** obtaining the lock, from pager_open_journal() or from
** writeJournalHdr(), or SQLITE_NOMEM if the bitmap cannot be allocated.
*/
int sqlite3PagerBegin(DbPage *pPg, int exFlag){
  Pager *pPager = pPg->pPager;
  int rc = SQLITE_OK;
  pagerEnter(pPager);
  assert( pPg->nRef>0 );
  assert( pPager->state!=PAGER_UNLOCK );
  if( pPager->state==PAGER_SHARED ){
    assert( pPager->aInJournal==0 );
    if( MEMDB ){
      /* An in-memory database has no file to lock: just change state */
      pPager->state = PAGER_EXCLUSIVE;
      pPager->origDbSize = pPager->dbSize;
    }else{
      rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
      if( rc==SQLITE_OK ){
        pPager->state = PAGER_RESERVED;
        if( exFlag ){
          rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
        }
      }
      if( rc!=SQLITE_OK ){
        pagerLeave(pPager);
        return rc;
      }
      pPager->dirtyCache = 0;
      PAGERTRACE2("TRANSACTION %d\n", PAGERID(pPager));
      /* The journal of a temporary file is not opened here */
      if( pPager->useJournal && !pPager->tempFile ){
        rc = pager_open_journal(pPager);
      }
    }
  }else if( pPager->journalOpen && pPager->journalOff==0 ){
    /* This happens when the pager was in exclusive-access mode last
    ** time a (read or write) transaction was successfully concluded
    ** by this connection. Instead of deleting the journal file it was
    ** kept open and truncated to 0 bytes.
    */
    assert( pPager->nRec==0 );
    assert( pPager->origDbSize==0 );
    assert( pPager->aInJournal==0 );
    /* Refresh the page count, then allocate one bit per database page.
    ** The allocation is made between pagerLeave() and pagerEnter(). */
    sqlite3PagerPagecount(pPager);
    pagerLeave(pPager);
    pPager->aInJournal = sqlite3MallocZero( pPager->dbSize/8 + 1 );
    pagerEnter(pPager);
    if( !pPager->aInJournal ){
      rc = SQLITE_NOMEM;
    }else{
      pPager->origDbSize = pPager->dbSize;
      rc = writeJournalHdr(pPager);
    }
  }
  /* On success, an open journal always has content past offset zero */
  assert( !pPager->journalOpen || pPager->journalOff>0 || rc!=SQLITE_OK );
  pagerLeave(pPager);
  return rc;
}
3957 
3958 /*
3959 ** Make a page dirty.  Set its dirty flag and add it to the dirty
3960 ** page list.
3961 */
3962 static void makeDirty(PgHdr *pPg){
3963   if( pPg->dirty==0 ){
3964     Pager *pPager = pPg->pPager;
3965     pPg->dirty = 1;
3966     pPg->pDirty = pPager->pDirty;
3967     if( pPager->pDirty ){
3968       pPager->pDirty->pPrevDirty = pPg;
3969     }
3970     pPg->pPrevDirty = 0;
3971     pPager->pDirty = pPg;
3972   }
3973 }
3974 
3975 /*
3976 ** Make a page clean.  Clear its dirty bit and remove it from the
3977 ** dirty page list.
3978 */
3979 static void makeClean(PgHdr *pPg){
3980   if( pPg->dirty ){
3981     pPg->dirty = 0;
3982     if( pPg->pDirty ){
3983       assert( pPg->pDirty->pPrevDirty==pPg );
3984       pPg->pDirty->pPrevDirty = pPg->pPrevDirty;
3985     }
3986     if( pPg->pPrevDirty ){
3987       assert( pPg->pPrevDirty->pDirty==pPg );
3988       pPg->pPrevDirty->pDirty = pPg->pDirty;
3989     }else{
3990       assert( pPg->pPager->pDirty==pPg );
3991       pPg->pPager->pDirty = pPg->pDirty;
3992     }
3993   }
3994 }
3995 
3996 
3997 /*
3998 ** Mark a data page as writeable.  The page is written into the journal
3999 ** if it is not there already.  This routine must be called before making
4000 ** changes to a page.
4001 **
4002 ** The first time this routine is called, the pager creates a new
4003 ** journal and acquires a RESERVED lock on the database.  If the RESERVED
4004 ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
4005 ** calling routine must check for that return value and be careful not to
4006 ** change any page data until this routine returns SQLITE_OK.
4007 **
4008 ** If the journal file could not be written because the disk is full,
4009 ** then this routine returns SQLITE_FULL and does an immediate rollback.
4010 ** All subsequent write attempts also return SQLITE_FULL until there
4011 ** is a call to sqlite3PagerCommit() or sqlite3PagerRollback() to
4012 ** reset.
4013 */
4014 static int pager_write(PgHdr *pPg){
4015   void *pData = PGHDR_TO_DATA(pPg);
4016   Pager *pPager = pPg->pPager;
4017   int rc = SQLITE_OK;
4018 
4019   /* Check for errors
4020   */
4021   if( pPager->errCode ){
4022     return pPager->errCode;
4023   }
4024   if( pPager->readOnly ){
4025     return SQLITE_PERM;
4026   }
4027 
4028   assert( !pPager->setMaster );
4029 
4030   CHECK_PAGE(pPg);
4031 
4032   /* If this page was previously acquired with noContent==1, that means
4033   ** we didn't really read in the content of the page.  This can happen
4034   ** (for example) when the page is being moved to the freelist.  But
4035   ** now we are (perhaps) moving the page off of the freelist for
4036   ** reuse and we need to know its original content so that content
4037   ** can be stored in the rollback journal.  So do the read at this
4038   ** time.
4039   */
4040   rc = pager_get_content(pPg);
4041   if( rc ){
4042     return rc;
4043   }
4044 
4045   /* Mark the page as dirty.  If the page has already been written
4046   ** to the journal then we can return right away.
4047   */
4048   makeDirty(pPg);
4049   if( pPg->inJournal && (pageInStatement(pPg) || pPager->stmtInUse==0) ){
4050     pPager->dirtyCache = 1;
4051   }else{
4052 
4053     /* If we get this far, it means that the page needs to be
4054     ** written to the transaction journal or the ckeckpoint journal
4055     ** or both.
4056     **
4057     ** First check to see that the transaction journal exists and
4058     ** create it if it does not.
4059     */
4060     assert( pPager->state!=PAGER_UNLOCK );
4061     rc = sqlite3PagerBegin(pPg, 0);
4062     if( rc!=SQLITE_OK ){
4063       return rc;
4064     }
4065     assert( pPager->state>=PAGER_RESERVED );
4066     if( !pPager->journalOpen && pPager->useJournal ){
4067       rc = pager_open_journal(pPager);
4068       if( rc!=SQLITE_OK ) return rc;
4069     }
4070     assert( pPager->journalOpen || !pPager->useJournal );
4071     pPager->dirtyCache = 1;
4072 
4073     /* The transaction journal now exists and we have a RESERVED or an
4074     ** EXCLUSIVE lock on the main database file.  Write the current page to
4075     ** the transaction journal if it is not there already.
4076     */
4077     if( !pPg->inJournal && (pPager->useJournal || MEMDB) ){
4078       if( (int)pPg->pgno <= pPager->origDbSize ){
4079         if( MEMDB ){
4080           PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4081           PAGERTRACE3("JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
4082           assert( pHist->pOrig==0 );
4083           pHist->pOrig = sqlite3_malloc( pPager->pageSize );
4084           if( !pHist->pOrig ){
4085             return SQLITE_NOMEM;
4086           }
4087           memcpy(pHist->pOrig, PGHDR_TO_DATA(pPg), pPager->pageSize);
4088         }else{
4089           u32 cksum;
4090           char *pData2;
4091 
4092           /* We should never write to the journal file the page that
4093           ** contains the database locks.  The following assert verifies
4094           ** that we do not. */
4095           assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
4096           pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
4097           cksum = pager_cksum(pPager, (u8*)pData2);
4098           rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);
4099           if( rc==SQLITE_OK ){
4100             rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize,
4101                                 pPager->journalOff + 4);
4102             pPager->journalOff += pPager->pageSize+4;
4103           }
4104           if( rc==SQLITE_OK ){
4105             rc = write32bits(pPager->jfd, pPager->journalOff, cksum);
4106             pPager->journalOff += 4;
4107           }
4108           IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno,
4109                    pPager->journalOff, pPager->pageSize));
4110           PAGER_INCR(sqlite3_pager_writej_count);
4111           PAGERTRACE5("JOURNAL %d page %d needSync=%d hash(%08x)\n",
4112                PAGERID(pPager), pPg->pgno, pPg->needSync, pager_pagehash(pPg));
4113 
4114           /* An error has occured writing to the journal file. The
4115           ** transaction will be rolled back by the layer above.
4116           */
4117           if( rc!=SQLITE_OK ){
4118             return rc;
4119           }
4120 
4121           pPager->nRec++;
4122           assert( pPager->aInJournal!=0 );
4123           pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4124           pPg->needSync = !pPager->noSync;
4125           if( pPager->stmtInUse ){
4126             pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4127           }
4128         }
4129       }else{
4130         pPg->needSync = !pPager->journalStarted && !pPager->noSync;
4131         PAGERTRACE4("APPEND %d page %d needSync=%d\n",
4132                 PAGERID(pPager), pPg->pgno, pPg->needSync);
4133       }
4134       if( pPg->needSync ){
4135         pPager->needSync = 1;
4136       }
4137       pPg->inJournal = 1;
4138     }
4139 
4140     /* If the statement journal is open and the page is not in it,
4141     ** then write the current page to the statement journal.  Note that
4142     ** the statement journal format differs from the standard journal format
4143     ** in that it omits the checksums and the header.
4144     */
4145     if( pPager->stmtInUse
4146      && !pageInStatement(pPg)
4147      && (int)pPg->pgno<=pPager->stmtSize
4148     ){
4149       assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
4150       if( MEMDB ){
4151         PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4152         assert( pHist->pStmt==0 );
4153         pHist->pStmt = sqlite3_malloc( pPager->pageSize );
4154         if( pHist->pStmt ){
4155           memcpy(pHist->pStmt, PGHDR_TO_DATA(pPg), pPager->pageSize);
4156         }
4157         PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
4158         page_add_to_stmt_list(pPg);
4159       }else{
4160         i64 offset = pPager->stmtNRec*(4+pPager->pageSize);
4161         char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
4162         rc = write32bits(pPager->stfd, offset, pPg->pgno);
4163         if( rc==SQLITE_OK ){
4164           rc = sqlite3OsWrite(pPager->stfd, pData2, pPager->pageSize, offset+4);
4165         }
4166         PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
4167         if( rc!=SQLITE_OK ){
4168           return rc;
4169         }
4170         pPager->stmtNRec++;
4171         assert( pPager->aInStmt!=0 );
4172         pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4173       }
4174     }
4175   }
4176 
4177   /* Update the database size and return.
4178   */
4179   assert( pPager->state>=PAGER_SHARED );
4180   if( pPager->dbSize<(int)pPg->pgno ){
4181     pPager->dbSize = pPg->pgno;
4182     if( !MEMDB && pPager->dbSize==PENDING_BYTE/pPager->pageSize ){
4183       pPager->dbSize++;
4184     }
4185   }
4186   return rc;
4187 }
4188 
4189 /*
4190 ** This function is used to mark a data-page as writable. It uses
4191 ** pager_write() to open a journal file (if it is not already open)
4192 ** and write the page *pData to the journal.
4193 **
4194 ** The difference between this function and pager_write() is that this
4195 ** function also deals with the special case where 2 or more pages
4196 ** fit on a single disk sector. In this case all co-resident pages
4197 ** must have been written to the journal file before returning.
4198 */
4199 int sqlite3PagerWrite(DbPage *pDbPage){
4200   int rc = SQLITE_OK;
4201 
4202   PgHdr *pPg = pDbPage;
4203   Pager *pPager = pPg->pPager;
4204   Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);
4205 
4206   pagerEnter(pPager);
4207   if( !MEMDB && nPagePerSector>1 ){
4208     Pgno nPageCount;          /* Total number of pages in database file */
4209     Pgno pg1;                 /* First page of the sector pPg is located on. */
4210     int nPage;                /* Number of pages starting at pg1 to journal */
4211     int ii;
4212     int needSync = 0;
4213 
4214     /* Set the doNotSync flag to 1. This is because we cannot allow a journal
4215     ** header to be written between the pages journaled by this function.
4216     */
4217     assert( pPager->doNotSync==0 );
4218     pPager->doNotSync = 1;
4219 
4220     /* This trick assumes that both the page-size and sector-size are
4221     ** an integer power of 2. It sets variable pg1 to the identifier
4222     ** of the first page of the sector pPg is located on.
4223     */
4224     pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;
4225 
4226     nPageCount = sqlite3PagerPagecount(pPager);
4227     if( pPg->pgno>nPageCount ){
4228       nPage = (pPg->pgno - pg1)+1;
4229     }else if( (pg1+nPagePerSector-1)>nPageCount ){
4230       nPage = nPageCount+1-pg1;
4231     }else{
4232       nPage = nPagePerSector;
4233     }
4234     assert(nPage>0);
4235     assert(pg1<=pPg->pgno);
4236     assert((pg1+nPage)>pPg->pgno);
4237 
4238     for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){
4239       Pgno pg = pg1+ii;
4240       PgHdr *pPage;
4241       if( !pPager->aInJournal || pg==pPg->pgno ||
4242           pg>pPager->origDbSize || !(pPager->aInJournal[pg/8]&(1<<(pg&7)))
4243       ) {
4244         if( pg!=PAGER_MJ_PGNO(pPager) ){
4245           rc = sqlite3PagerGet(pPager, pg, &pPage);
4246           if( rc==SQLITE_OK ){
4247             rc = pager_write(pPage);
4248             if( pPage->needSync ){
4249               needSync = 1;
4250             }
4251             sqlite3PagerUnref(pPage);
4252           }
4253         }
4254       }else if( (pPage = pager_lookup(pPager, pg)) ){
4255         if( pPage->needSync ){
4256           needSync = 1;
4257         }
4258       }
4259     }
4260 
4261     /* If the PgHdr.needSync flag is set for any of the nPage pages
4262     ** starting at pg1, then it needs to be set for all of them. Because
4263     ** writing to any of these nPage pages may damage the others, the
4264     ** journal file must contain sync()ed copies of all of them
4265     ** before any of them can be written out to the database file.
4266     */
4267     if( needSync ){
4268       for(ii=0; ii<nPage && needSync; ii++){
4269         PgHdr *pPage = pager_lookup(pPager, pg1+ii);
4270         if( pPage ) pPage->needSync = 1;
4271       }
4272       assert(pPager->needSync);
4273     }
4274 
4275     assert( pPager->doNotSync==1 );
4276     pPager->doNotSync = 0;
4277   }else{
4278     rc = pager_write(pDbPage);
4279   }
4280   pagerLeave(pPager);
4281   return rc;
4282 }
4283 
4284 /*
4285 ** Return TRUE if the page given in the argument was previously passed
4286 ** to sqlite3PagerWrite().  In other words, return TRUE if it is ok
4287 ** to change the content of the page.
4288 */
4289 #ifndef NDEBUG
4290 int sqlite3PagerIswriteable(DbPage *pPg){
4291   return pPg->dirty;
4292 }
4293 #endif
4294 
4295 #ifndef SQLITE_OMIT_VACUUM
4296 /*
4297 ** Replace the content of a single page with the information in the third
4298 ** argument.
4299 */
4300 int sqlite3PagerOverwrite(Pager *pPager, Pgno pgno, void *pData){
4301   PgHdr *pPg;
4302   int rc;
4303 
4304   pagerEnter(pPager);
4305   rc = sqlite3PagerGet(pPager, pgno, &pPg);
4306   if( rc==SQLITE_OK ){
4307     rc = sqlite3PagerWrite(pPg);
4308     if( rc==SQLITE_OK ){
4309       memcpy(sqlite3PagerGetData(pPg), pData, pPager->pageSize);
4310     }
4311     sqlite3PagerUnref(pPg);
4312   }
4313   pagerLeave(pPager);
4314   return rc;
4315 }
4316 #endif
4317 
4318 /*
4319 ** A call to this routine tells the pager that it is not necessary to
4320 ** write the information on page pPg back to the disk, even though
4321 ** that page might be marked as dirty.
4322 **
4323 ** The overlying software layer calls this routine when all of the data
4324 ** on the given page is unused.  The pager marks the page as clean so
4325 ** that it does not get written to disk.
4326 **
4327 ** Tests show that this optimization, together with the
4328 ** sqlite3PagerDontRollback() below, more than double the speed
4329 ** of large INSERT operations and quadruple the speed of large DELETEs.
4330 **
4331 ** When this routine is called, set the alwaysRollback flag to true.
4332 ** Subsequent calls to sqlite3PagerDontRollback() for the same page
4333 ** will thereafter be ignored.  This is necessary to avoid a problem
4334 ** where a page with data is added to the freelist during one part of
4335 ** a transaction then removed from the freelist during a later part
4336 ** of the same transaction and reused for some other purpose.  When it
4337 ** is first added to the freelist, this routine is called.  When reused,
4338 ** the sqlite3PagerDontRollback() routine is called.  But because the
4339 ** page contains critical data, we still need to be sure it gets
4340 ** rolled back in spite of the sqlite3PagerDontRollback() call.
4341 */
4342 void sqlite3PagerDontWrite(DbPage *pDbPage){
4343   PgHdr *pPg = pDbPage;
4344   Pager *pPager = pPg->pPager;
4345 
4346   if( MEMDB ) return;
4347   pagerEnter(pPager);
4348   pPg->alwaysRollback = 1;
4349   if( pPg->dirty && !pPager->stmtInUse ){
4350     assert( pPager->state>=PAGER_SHARED );
4351     if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
4352       /* If this pages is the last page in the file and the file has grown
4353       ** during the current transaction, then do NOT mark the page as clean.
4354       ** When the database file grows, we must make sure that the last page
4355       ** gets written at least once so that the disk file will be the correct
4356       ** size. If you do not write this page and the size of the file
4357       ** on the disk ends up being too small, that can lead to database
4358       ** corruption during the next transaction.
4359       */
4360     }else{
4361       PAGERTRACE3("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager));
4362       IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))
4363       makeClean(pPg);
4364 #ifdef SQLITE_CHECK_PAGES
4365       pPg->pageHash = pager_pagehash(pPg);
4366 #endif
4367     }
4368   }
4369   pagerLeave(pPager);
4370 }
4371 
4372 /*
4373 ** A call to this routine tells the pager that if a rollback occurs,
4374 ** it is not necessary to restore the data on the given page.  This
4375 ** means that the pager does not have to record the given page in the
4376 ** rollback journal.
4377 **
4378 ** If we have not yet actually read the content of this page (if
4379 ** the PgHdr.needRead flag is set) then this routine acts as a promise
4380 ** that we will never need to read the page content in the future.
4381 ** so the needRead flag can be cleared at this point.
4382 */
4383 void sqlite3PagerDontRollback(DbPage *pPg){
4384   Pager *pPager = pPg->pPager;
4385 
4386   pagerEnter(pPager);
4387   assert( pPager->state>=PAGER_RESERVED );
4388   if( pPager->journalOpen==0 ) return;
4389   if( pPg->alwaysRollback || pPager->alwaysRollback || MEMDB ) return;
4390   if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
4391     assert( pPager->aInJournal!=0 );
4392     pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4393     pPg->inJournal = 1;
4394     pPg->needRead = 0;
4395     if( pPager->stmtInUse ){
4396       pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4397     }
4398     PAGERTRACE3("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager));
4399     IOTRACE(("GARBAGE %p %d\n", pPager, pPg->pgno))
4400   }
4401   if( pPager->stmtInUse
4402    && !pageInStatement(pPg)
4403    && (int)pPg->pgno<=pPager->stmtSize
4404   ){
4405     assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
4406     assert( pPager->aInStmt!=0 );
4407     pPager->aInStmt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
4408   }
4409   pagerLeave(pPager);
4410 }
4411 
4412 
4413 /*
4414 ** This routine is called to increment the database file change-counter,
4415 ** stored at byte 24 of the pager file.
4416 */
4417 static int pager_incr_changecounter(Pager *pPager, int isDirect){
4418   PgHdr *pPgHdr;
4419   u32 change_counter;
4420   int rc = SQLITE_OK;
4421 
4422   if( !pPager->changeCountDone ){
4423     /* Open page 1 of the file for writing. */
4424     rc = sqlite3PagerGet(pPager, 1, &pPgHdr);
4425     if( rc!=SQLITE_OK ) return rc;
4426 
4427     if( !isDirect ){
4428       rc = sqlite3PagerWrite(pPgHdr);
4429       if( rc!=SQLITE_OK ){
4430         sqlite3PagerUnref(pPgHdr);
4431         return rc;
4432       }
4433     }
4434 
4435     /* Increment the value just read and write it back to byte 24. */
4436     change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers);
4437     change_counter++;
4438     put32bits(((char*)PGHDR_TO_DATA(pPgHdr))+24, change_counter);
4439 
4440     if( isDirect && pPager->fd->pMethods ){
4441       const void *zBuf = PGHDR_TO_DATA(pPgHdr);
4442       rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);
4443     }
4444 
4445     /* Release the page reference. */
4446     sqlite3PagerUnref(pPgHdr);
4447     pPager->changeCountDone = 1;
4448   }
4449   return rc;
4450 }
4451 
4452 /*
4453 ** Sync the database file for the pager pPager. zMaster points to the name
4454 ** of a master journal file that should be written into the individual
4455 ** journal file. zMaster may be NULL, which is interpreted as no master
4456 ** journal (a single database transaction).
4457 **
4458 ** This routine ensures that the journal is synced, all dirty pages written
4459 ** to the database file and the database file synced. The only thing that
4460 ** remains to commit the transaction is to delete the journal file (or
4461 ** master journal file if specified).
4462 **
4463 ** Note that if zMaster==NULL, this does not overwrite a previous value
4464 ** passed to an sqlite3PagerCommitPhaseOne() call.
4465 **
4466 ** If parameter nTrunc is non-zero, then the pager file is truncated to
4467 ** nTrunc pages (this is used by auto-vacuum databases).
4468 */
4469 int sqlite3PagerCommitPhaseOne(Pager *pPager, const char *zMaster, Pgno nTrunc){
4470   int rc = SQLITE_OK;
4471 
4472   PAGERTRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n",
4473       pPager->zFilename, zMaster, nTrunc);
4474   pagerEnter(pPager);
4475 
4476   /* If this is an in-memory db, or no pages have been written to, or this
4477   ** function has already been called, it is a no-op.
4478   */
4479   if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
4480     PgHdr *pPg;
4481 
4482 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
4483     /* The atomic-write optimization can be used if all of the
4484     ** following are true:
4485     **
4486     **    + The file-system supports the atomic-write property for
4487     **      blocks of size page-size, and
4488     **    + This commit is not part of a multi-file transaction, and
4489     **    + Exactly one page has been modified and store in the journal file.
4490     **
4491     ** If the optimization can be used, then the journal file will never
4492     ** be created for this transaction.
4493     */
4494     int useAtomicWrite = (
4495         !zMaster &&
4496         pPager->journalOff==jrnlBufferSize(pPager) &&
4497         nTrunc==0 &&
4498         (0==pPager->pDirty || 0==pPager->pDirty->pDirty)
4499     );
4500     if( useAtomicWrite ){
4501       /* Update the nRec field in the journal file. */
4502       int offset = pPager->journalHdr + sizeof(aJournalMagic);
4503       assert(pPager->nRec==1);
4504       rc = write32bits(pPager->jfd, offset, pPager->nRec);
4505 
4506       /* Update the db file change counter. The following call will modify
4507       ** the in-memory representation of page 1 to include the updated
4508       ** change counter and then write page 1 directly to the database
4509       ** file. Because of the atomic-write property of the host file-system,
4510       ** this is safe.
4511       */
4512       if( rc==SQLITE_OK ){
4513         rc = pager_incr_changecounter(pPager, 1);
4514       }
4515     }else{
4516       rc = sqlite3JournalCreate(pPager->jfd);
4517     }
4518 
4519     if( !useAtomicWrite && rc==SQLITE_OK )
4520 #endif
4521 
4522     /* If a master journal file name has already been written to the
4523     ** journal file, then no sync is required. This happens when it is
4524     ** written, then the process fails to upgrade from a RESERVED to an
4525     ** EXCLUSIVE lock. The next time the process tries to commit the
4526     ** transaction the m-j name will have already been written.
4527     */
4528     if( !pPager->setMaster ){
4529       assert( pPager->journalOpen );
4530       rc = pager_incr_changecounter(pPager, 0);
4531       if( rc!=SQLITE_OK ) goto sync_exit;
4532 #ifndef SQLITE_OMIT_AUTOVACUUM
4533       if( nTrunc!=0 ){
4534         /* If this transaction has made the database smaller, then all pages
4535         ** being discarded by the truncation must be written to the journal
4536         ** file.
4537         */
4538         Pgno i;
4539         int iSkip = PAGER_MJ_PGNO(pPager);
4540         for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
4541           if( !(pPager->aInJournal[i/8] & (1<<(i&7))) && i!=iSkip ){
4542             rc = sqlite3PagerGet(pPager, i, &pPg);
4543             if( rc!=SQLITE_OK ) goto sync_exit;
4544             rc = sqlite3PagerWrite(pPg);
4545             sqlite3PagerUnref(pPg);
4546             if( rc!=SQLITE_OK ) goto sync_exit;
4547           }
4548         }
4549       }
4550 #endif
4551       rc = writeMasterJournal(pPager, zMaster);
4552       if( rc!=SQLITE_OK ) goto sync_exit;
4553       rc = syncJournal(pPager);
4554     }
4555     if( rc!=SQLITE_OK ) goto sync_exit;
4556 
4557 #ifndef SQLITE_OMIT_AUTOVACUUM
4558     if( nTrunc!=0 ){
4559       rc = sqlite3PagerTruncate(pPager, nTrunc);
4560       if( rc!=SQLITE_OK ) goto sync_exit;
4561     }
4562 #endif
4563 
4564     /* Write all dirty pages to the database file */
4565     pPg = pager_get_all_dirty_pages(pPager);
4566     rc = pager_write_pagelist(pPg);
4567     if( rc!=SQLITE_OK ){
4568       while( pPg && !pPg->dirty ){ pPg = pPg->pDirty; }
4569       pPager->pDirty = pPg;
4570       goto sync_exit;
4571     }
4572     pPager->pDirty = 0;
4573 
4574     /* Sync the database file. */
4575     if( !pPager->noSync ){
4576       rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
4577     }
4578     IOTRACE(("DBSYNC %p\n", pPager))
4579 
4580     pPager->state = PAGER_SYNCED;
4581   }else if( MEMDB && nTrunc!=0 ){
4582     rc = sqlite3PagerTruncate(pPager, nTrunc);
4583   }
4584 
4585 sync_exit:
4586   if( rc==SQLITE_IOERR_BLOCKED ){
4587     /* pager_incr_changecounter() may attempt to obtain an exclusive
4588      * lock to spill the cache and return IOERR_BLOCKED. But since
4589      * there is no chance the cache is inconsistent, it is
4590      * better to return SQLITE_BUSY.
4591      */
4592     rc = SQLITE_BUSY;
4593   }
4594   pagerLeave(pPager);
4595   return rc;
4596 }
4597 
4598 
4599 /*
4600 ** Commit all changes to the database and release the write lock.
4601 **
4602 ** If the commit fails for any reason, a rollback attempt is made
4603 ** and an error code is returned.  If the commit worked, SQLITE_OK
4604 ** is returned.
4605 */
4606 int sqlite3PagerCommitPhaseTwo(Pager *pPager){
4607   int rc;
4608   PgHdr *pPg;
4609 
4610   if( pPager->errCode ){
4611     return pPager->errCode;
4612   }
4613   if( pPager->state<PAGER_RESERVED ){
4614     return SQLITE_ERROR;
4615   }
4616   pagerEnter(pPager);
4617   PAGERTRACE2("COMMIT %d\n", PAGERID(pPager));
4618   if( MEMDB ){
4619     pPg = pager_get_all_dirty_pages(pPager);
4620     while( pPg ){
4621       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4622       clearHistory(pHist);
4623       pPg->dirty = 0;
4624       pPg->inJournal = 0;
4625       pHist->inStmt = 0;
4626       pPg->needSync = 0;
4627       pHist->pPrevStmt = pHist->pNextStmt = 0;
4628       pPg = pPg->pDirty;
4629     }
4630     pPager->pDirty = 0;
4631 #ifndef NDEBUG
4632     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
4633       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4634       assert( !pPg->alwaysRollback );
4635       assert( !pHist->pOrig );
4636       assert( !pHist->pStmt );
4637     }
4638 #endif
4639     pPager->pStmt = 0;
4640     pPager->state = PAGER_SHARED;
4641     return SQLITE_OK;
4642   }
4643   assert( pPager->journalOpen || !pPager->dirtyCache );
4644   assert( pPager->state==PAGER_SYNCED || !pPager->dirtyCache );
4645   rc = pager_end_transaction(pPager);
4646   rc = pager_error(pPager, rc);
4647   pagerLeave(pPager);
4648   return rc;
4649 }
4650 
4651 /*
4652 ** Rollback all changes.  The database falls back to PAGER_SHARED mode.
4653 ** All in-memory cache pages revert to their original data contents.
4654 ** The journal is deleted.
4655 **
4656 ** This routine cannot fail unless some other process is not following
4657 ** the correct locking protocol or unless some other
4658 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
4659 ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
4660 ** codes are returned for all these occasions.  Otherwise,
4661 ** SQLITE_OK is returned.
4662 */
4663 int sqlite3PagerRollback(Pager *pPager){
4664   int rc;
4665   PAGERTRACE2("ROLLBACK %d\n", PAGERID(pPager));
4666   if( MEMDB ){
4667     PgHdr *p;
4668     for(p=pPager->pAll; p; p=p->pNextAll){
4669       PgHistory *pHist;
4670       assert( !p->alwaysRollback );
4671       if( !p->dirty ){
4672         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pOrig );
4673         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pStmt );
4674         continue;
4675       }
4676 
4677       pHist = PGHDR_TO_HIST(p, pPager);
4678       if( pHist->pOrig ){
4679         memcpy(PGHDR_TO_DATA(p), pHist->pOrig, pPager->pageSize);
4680         PAGERTRACE3("ROLLBACK-PAGE %d of %d\n", p->pgno, PAGERID(pPager));
4681       }else{
4682         PAGERTRACE3("PAGE %d is clean on %d\n", p->pgno, PAGERID(pPager));
4683       }
4684       clearHistory(pHist);
4685       p->dirty = 0;
4686       p->inJournal = 0;
4687       pHist->inStmt = 0;
4688       pHist->pPrevStmt = pHist->pNextStmt = 0;
4689       if( pPager->xReiniter ){
4690         pPager->xReiniter(p, pPager->pageSize);
4691       }
4692     }
4693     pPager->pDirty = 0;
4694     pPager->pStmt = 0;
4695     pPager->dbSize = pPager->origDbSize;
4696     pager_truncate_cache(pPager);
4697     pPager->stmtInUse = 0;
4698     pPager->state = PAGER_SHARED;
4699     return SQLITE_OK;
4700   }
4701 
4702   pagerEnter(pPager);
4703   if( !pPager->dirtyCache || !pPager->journalOpen ){
4704     rc = pager_end_transaction(pPager);
4705     pagerLeave(pPager);
4706     return rc;
4707   }
4708 
4709   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
4710     if( pPager->state>=PAGER_EXCLUSIVE ){
4711       pager_playback(pPager, 0);
4712     }
4713     pagerLeave(pPager);
4714     return pPager->errCode;
4715   }
4716   if( pPager->state==PAGER_RESERVED ){
4717     int rc2;
4718     rc = pager_playback(pPager, 0);
4719     rc2 = pager_end_transaction(pPager);
4720     if( rc==SQLITE_OK ){
4721       rc = rc2;
4722     }
4723   }else{
4724     rc = pager_playback(pPager, 0);
4725   }
4726   /* pager_reset(pPager); */
4727   pPager->dbSize = -1;
4728 
4729   /* If an error occurs during a ROLLBACK, we can no longer trust the pager
4730   ** cache. So call pager_error() on the way out to make any error
4731   ** persistent.
4732   */
4733   rc = pager_error(pPager, rc);
4734   pagerLeave(pPager);
4735   return rc;
4736 }
4737 
4738 /*
4739 ** Return TRUE if the database file is opened read-only.  Return FALSE
4740 ** if the database is (in theory) writable.
4741 */
4742 int sqlite3PagerIsreadonly(Pager *pPager){
4743   return pPager->readOnly;
4744 }
4745 
4746 /*
4747 ** Return the number of references to the pager.
4748 */
4749 int sqlite3PagerRefcount(Pager *pPager){
4750   return pPager->nRef;
4751 }
4752 
#ifdef SQLITE_TEST
/*
** Testing and analysis aid, only compiled when SQLITE_TEST is defined.
**
** Copies a snapshot of several pager fields into a function-local static
** array of 11 integers and returns a pointer to that array.  Because the
** storage is static, each call overwrites the values produced by the
** previous one.
*/
int *sqlite3PagerStats(Pager *pPager){
  static int aStat[11];

  aStat[0]  = pPager->nRef;      /* nRef field */
  aStat[1]  = pPager->nPage;     /* nPage field */
  aStat[2]  = pPager->mxPage;    /* mxPage field */
  aStat[3]  = pPager->dbSize;    /* dbSize field */
  aStat[4]  = pPager->state;     /* state field */
  aStat[5]  = pPager->errCode;   /* errCode field */
  aStat[6]  = pPager->nHit;      /* nHit field */
  aStat[7]  = pPager->nMiss;     /* nMiss field */
  aStat[8]  = 0;                 /* Used to be pPager->nOvfl */
  aStat[9]  = pPager->nRead;     /* nRead field */
  aStat[10] = pPager->nWrite;    /* nWrite field */
  return aStat;
}
#endif
4773 
4774 /*
4775 ** Set the statement rollback point.
4776 **
4777 ** This routine should be called with the transaction journal already
4778 ** open.  A new statement journal is created that can be used to rollback
4779 ** changes of a single SQL command within a larger transaction.
4780 */
4781 static int pagerStmtBegin(Pager *pPager){
4782   int rc;
4783   assert( !pPager->stmtInUse );
4784   assert( pPager->state>=PAGER_SHARED );
4785   assert( pPager->dbSize>=0 );
4786   PAGERTRACE2("STMT-BEGIN %d\n", PAGERID(pPager));
4787   if( MEMDB ){
4788     pPager->stmtInUse = 1;
4789     pPager->stmtSize = pPager->dbSize;
4790     return SQLITE_OK;
4791   }
4792   if( !pPager->journalOpen ){
4793     pPager->stmtAutoopen = 1;
4794     return SQLITE_OK;
4795   }
4796   assert( pPager->journalOpen );
4797   pagerLeave(pPager);
4798   assert( pPager->aInStmt==0 );
4799   pPager->aInStmt = sqlite3MallocZero( pPager->dbSize/8 + 1 );
4800   pagerEnter(pPager);
4801   if( pPager->aInStmt==0 ){
4802     /* sqlite3OsLock(pPager->fd, SHARED_LOCK); */
4803     return SQLITE_NOMEM;
4804   }
4805 #ifndef NDEBUG
4806   rc = sqlite3OsFileSize(pPager->jfd, &pPager->stmtJSize);
4807   if( rc ) goto stmt_begin_failed;
4808   assert( pPager->stmtJSize == pPager->journalOff );
4809 #endif
4810   pPager->stmtJSize = pPager->journalOff;
4811   pPager->stmtSize = pPager->dbSize;
4812   pPager->stmtHdrOff = 0;
4813   pPager->stmtCksum = pPager->cksumInit;
4814   if( !pPager->stmtOpen ){
4815     rc = sqlite3PagerOpentemp(pPager->pVfs, pPager->stfd, pPager->zStmtJrnl,
4816                               SQLITE_OPEN_SUBJOURNAL);
4817     if( rc ){
4818       goto stmt_begin_failed;
4819     }
4820     pPager->stmtOpen = 1;
4821     pPager->stmtNRec = 0;
4822   }
4823   pPager->stmtInUse = 1;
4824   return SQLITE_OK;
4825 
4826 stmt_begin_failed:
4827   if( pPager->aInStmt ){
4828     sqlite3_free(pPager->aInStmt);
4829     pPager->aInStmt = 0;
4830   }
4831   return rc;
4832 }
4833 int sqlite3PagerStmtBegin(Pager *pPager){
4834   int rc;
4835   pagerEnter(pPager);
4836   rc = pagerStmtBegin(pPager);
4837   pagerLeave(pPager);
4838   return rc;
4839 }
4840 
4841 /*
4842 ** Commit a statement.
4843 */
4844 int sqlite3PagerStmtCommit(Pager *pPager){
4845   pagerEnter(pPager);
4846   if( pPager->stmtInUse ){
4847     PgHdr *pPg, *pNext;
4848     PAGERTRACE2("STMT-COMMIT %d\n", PAGERID(pPager));
4849     if( !MEMDB ){
4850       /* sqlite3OsTruncate(pPager->stfd, 0); */
4851       sqlite3_free( pPager->aInStmt );
4852       pPager->aInStmt = 0;
4853     }else{
4854       for(pPg=pPager->pStmt; pPg; pPg=pNext){
4855         PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
4856         pNext = pHist->pNextStmt;
4857         assert( pHist->inStmt );
4858         pHist->inStmt = 0;
4859         pHist->pPrevStmt = pHist->pNextStmt = 0;
4860         sqlite3_free(pHist->pStmt);
4861         pHist->pStmt = 0;
4862       }
4863     }
4864     pPager->stmtNRec = 0;
4865     pPager->stmtInUse = 0;
4866     pPager->pStmt = 0;
4867   }
4868   pPager->stmtAutoopen = 0;
4869   pagerLeave(pPager);
4870   return SQLITE_OK;
4871 }
4872 
4873 /*
4874 ** Rollback a statement.
4875 */
4876 int sqlite3PagerStmtRollback(Pager *pPager){
4877   int rc;
4878   pagerEnter(pPager);
4879   if( pPager->stmtInUse ){
4880     PAGERTRACE2("STMT-ROLLBACK %d\n", PAGERID(pPager));
4881     if( MEMDB ){
4882       PgHdr *pPg;
4883       PgHistory *pHist;
4884       for(pPg=pPager->pStmt; pPg; pPg=pHist->pNextStmt){
4885         pHist = PGHDR_TO_HIST(pPg, pPager);
4886         if( pHist->pStmt ){
4887           memcpy(PGHDR_TO_DATA(pPg), pHist->pStmt, pPager->pageSize);
4888           sqlite3_free(pHist->pStmt);
4889           pHist->pStmt = 0;
4890         }
4891       }
4892       pPager->dbSize = pPager->stmtSize;
4893       pager_truncate_cache(pPager);
4894       rc = SQLITE_OK;
4895     }else{
4896       rc = pager_stmt_playback(pPager);
4897     }
4898     sqlite3PagerStmtCommit(pPager);
4899   }else{
4900     rc = SQLITE_OK;
4901   }
4902   pPager->stmtAutoopen = 0;
4903   pagerLeave(pPager);
4904   return rc;
4905 }
4906 
4907 /*
4908 ** Return the full pathname of the database file.
4909 */
4910 const char *sqlite3PagerFilename(Pager *pPager){
4911   return pPager->zFilename;
4912 }
4913 
4914 /*
4915 ** Return the VFS structure for the pager.
4916 */
4917 const sqlite3_vfs *sqlite3PagerVfs(Pager *pPager){
4918   return pPager->pVfs;
4919 }
4920 
4921 /*
4922 ** Return the file handle for the database file associated
4923 ** with the pager.  This might return NULL if the file has
4924 ** not yet been opened.
4925 */
4926 sqlite3_file *sqlite3PagerFile(Pager *pPager){
4927   return pPager->fd;
4928 }
4929 
4930 /*
4931 ** Return the directory of the database file.
4932 */
4933 const char *sqlite3PagerDirname(Pager *pPager){
4934   return pPager->zDirectory;
4935 }
4936 
4937 /*
4938 ** Return the full pathname of the journal file.
4939 */
4940 const char *sqlite3PagerJournalname(Pager *pPager){
4941   return pPager->zJournal;
4942 }
4943 
4944 /*
4945 ** Return true if fsync() calls are disabled for this pager.  Return FALSE
4946 ** if fsync()s are executed normally.
4947 */
4948 int sqlite3PagerNosync(Pager *pPager){
4949   return pPager->noSync;
4950 }
4951 
4952 #ifdef SQLITE_HAS_CODEC
4953 /*
4954 ** Set the codec for this pager
4955 */
4956 void sqlite3PagerSetCodec(
4957   Pager *pPager,
4958   void *(*xCodec)(void*,void*,Pgno,int),
4959   void *pCodecArg
4960 ){
4961   pPager->xCodec = xCodec;
4962   pPager->pCodecArg = pCodecArg;
4963 }
4964 #endif
4965 
4966 #ifndef SQLITE_OMIT_AUTOVACUUM
4967 /*
4968 ** Move the page pPg to location pgno in the file.
4969 **
4970 ** There must be no references to the page previously located at
4971 ** pgno (which we call pPgOld) though that page is allowed to be
4972 ** in cache.  If the page previously located at pgno is not already
4973 ** in the rollback journal, it is not put there by this routine.
4974 **
4975 ** References to the page pPg remain valid. Updating any
4976 ** meta-data associated with pPg (i.e. data stored in the nExtra bytes
4977 ** allocated along with the page) is the responsibility of the caller.
4978 **
4979 ** A transaction must be active when this routine is called. It used to be
4980 ** required that a statement transaction was not active, but this restriction
4981 ** has been removed (CREATE INDEX needs to move a page when a statement
4982 ** transaction is active).
4983 */
4984 int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno){
4985   PgHdr *pPgOld;  /* The page being overwritten. */
4986   int h;
4987   Pgno needSyncPgno = 0;
4988 
4989   pagerEnter(pPager);
4990   assert( pPg->nRef>0 );
4991 
4992   PAGERTRACE5("MOVE %d page %d (needSync=%d) moves to %d\n",
4993       PAGERID(pPager), pPg->pgno, pPg->needSync, pgno);
4994   IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))
4995 
4996   pager_get_content(pPg);
4997   if( pPg->needSync ){
4998     needSyncPgno = pPg->pgno;
4999     assert( pPg->inJournal || (int)pgno>pPager->origDbSize );
5000     assert( pPg->dirty );
5001     assert( pPager->needSync );
5002   }
5003 
5004   /* Unlink pPg from its hash-chain */
5005   unlinkHashChain(pPager, pPg);
5006 
5007   /* If the cache contains a page with page-number pgno, remove it
5008   ** from its hash chain. Also, if the PgHdr.needSync was set for
5009   ** page pgno before the 'move' operation, it needs to be retained
5010   ** for the page moved there.
5011   */
5012   pPg->needSync = 0;
5013   pPgOld = pager_lookup(pPager, pgno);
5014   if( pPgOld ){
5015     assert( pPgOld->nRef==0 );
5016     unlinkHashChain(pPager, pPgOld);
5017     makeClean(pPgOld);
5018     pPg->needSync = pPgOld->needSync;
5019   }else{
5020     pPg->needSync = 0;
5021   }
5022   if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
5023     pPg->inJournal =  (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
5024   }else{
5025     pPg->inJournal = 0;
5026     assert( pPg->needSync==0 || (int)pgno>pPager->origDbSize );
5027   }
5028 
5029   /* Change the page number for pPg and insert it into the new hash-chain. */
5030   assert( pgno!=0 );
5031   pPg->pgno = pgno;
5032   h = pgno & (pPager->nHash-1);
5033   if( pPager->aHash[h] ){
5034     assert( pPager->aHash[h]->pPrevHash==0 );
5035     pPager->aHash[h]->pPrevHash = pPg;
5036   }
5037   pPg->pNextHash = pPager->aHash[h];
5038   pPager->aHash[h] = pPg;
5039   pPg->pPrevHash = 0;
5040 
5041   makeDirty(pPg);
5042   pPager->dirtyCache = 1;
5043 
5044   if( needSyncPgno ){
5045     /* If needSyncPgno is non-zero, then the journal file needs to be
5046     ** sync()ed before any data is written to database file page needSyncPgno.
5047     ** Currently, no such page exists in the page-cache and the
5048     ** Pager.aInJournal bit has been set. This needs to be remedied by loading
5049     ** the page into the pager-cache and setting the PgHdr.needSync flag.
5050     **
5051     ** The sqlite3PagerGet() call may cause the journal to sync. So make
5052     ** sure the Pager.needSync flag is set too.
5053     */
5054     int rc;
5055     PgHdr *pPgHdr;
5056     assert( pPager->needSync );
5057     rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr);
5058     if( rc!=SQLITE_OK ) return rc;
5059     pPager->needSync = 1;
5060     pPgHdr->needSync = 1;
5061     pPgHdr->inJournal = 1;
5062     makeDirty(pPgHdr);
5063     sqlite3PagerUnref(pPgHdr);
5064   }
5065 
5066   pagerLeave(pPager);
5067   return SQLITE_OK;
5068 }
5069 #endif
5070 
5071 /*
5072 ** Return a pointer to the data for the specified page.
5073 */
5074 void *sqlite3PagerGetData(DbPage *pPg){
5075   return PGHDR_TO_DATA(pPg);
5076 }
5077 
5078 /*
5079 ** Return a pointer to the Pager.nExtra bytes of "extra" space
5080 ** allocated along with the specified page.
5081 */
5082 void *sqlite3PagerGetExtra(DbPage *pPg){
5083   Pager *pPager = pPg->pPager;
5084   return (pPager?PGHDR_TO_EXTRA(pPg, pPager):0);
5085 }
5086 
5087 /*
5088 ** Get/set the locking-mode for this pager. Parameter eMode must be one
5089 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or
5090 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then
5091 ** the locking-mode is set to the value specified.
5092 **
5093 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or
5094 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)
5095 ** locking-mode.
5096 */
5097 int sqlite3PagerLockingMode(Pager *pPager, int eMode){
5098   assert( eMode==PAGER_LOCKINGMODE_QUERY
5099             || eMode==PAGER_LOCKINGMODE_NORMAL
5100             || eMode==PAGER_LOCKINGMODE_EXCLUSIVE );
5101   assert( PAGER_LOCKINGMODE_QUERY<0 );
5102   assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );
5103   if( eMode>=0 && !pPager->tempFile ){
5104     pPager->exclusiveMode = eMode;
5105   }
5106   return (int)pPager->exclusiveMode;
5107 }
5108 
#ifdef SQLITE_TEST
/*
** Testing aid: walk the pager's list of all pages (Pager.pAll) and
** print one line, giving page number, data address and reference
** count, for every page whose reference count is greater than zero.
*/
void sqlite3PagerRefdump(Pager *pPager){
  PgHdr *p = pPager->pAll;
  while( p ){
    if( p->nRef>0 ){
      sqlite3DebugPrintf("PAGE %3d addr=%p nRef=%d\n",
         p->pgno, PGHDR_TO_DATA(p), p->nRef);
    }
    p = p->pNextAll;
  }
}
#endif
5122 
5123 #endif /* SQLITE_OMIT_DISKIO */
5124