xref: /sqlite-3.40.0/src/wal.c (revision 60176fa9)
1 /*
2 ** 2010 February 1
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This file contains the implementation of a write-ahead log (WAL) used in
14 ** "journal_mode=WAL" mode.
15 **
16 ** WRITE-AHEAD LOG (WAL) FILE FORMAT
17 **
18 ** A WAL file consists of a header followed by zero or more "frames".
19 ** Each frame records the revised content of a single page from the
20 ** database file.  All changes to the database are recorded by writing
21 ** frames into the WAL.  Transactions commit when a frame is written that
22 ** contains a commit marker.  A single WAL can and usually does record
23 ** multiple transactions.  Periodically, the content of the WAL is
24 ** transferred back into the database file in an operation called a
25 ** "checkpoint".
26 **
27 ** A single WAL file can be used multiple times.  In other words, the
28 ** WAL can fill up with frames and then be checkpointed and then new
29 ** frames can overwrite the old ones.  A WAL always grows from beginning
30 ** toward the end.  Checksums and counters attached to each frame are
31 ** used to determine which frames within the WAL are valid and which
32 ** are leftovers from prior checkpoints.
33 **
34 ** The WAL header is 32 bytes in size and consists of the following eight
35 ** big-endian 32-bit unsigned integer values:
36 **
37 **     0: Magic number.  0x377f0682 or 0x377f0683
38 **     4: File format version.  Currently 3007000
39 **     8: Database page size.  Example: 1024
40 **    12: Checkpoint sequence number
41 **    16: Salt-1, random integer incremented with each checkpoint
42 **    20: Salt-2, a different random integer changing with each ckpt
43 **    24: Checksum-1 (first part of checksum for first 24 bytes of header).
44 **    28: Checksum-2 (second part of checksum for first 24 bytes of header).
45 **
46 ** Immediately following the wal-header are zero or more frames. Each
47 ** frame consists of a 24-byte frame-header followed by a <page-size> bytes
48 ** of page data. The frame-header is six big-endian 32-bit unsigned
49 ** integer values, as follows:
50 **
51 **     0: Page number.
52 **     4: For commit records, the size of the database image in pages
53 **        after the commit. For all other records, zero.
54 **     8: Salt-1 (copied from the header)
55 **    12: Salt-2 (copied from the header)
56 **    16: Checksum-1.
57 **    20: Checksum-2.
58 **
59 ** A frame is considered valid if and only if the following conditions are
60 ** true:
61 **
62 **    (1) The salt-1 and salt-2 values in the frame-header match
63 **        salt values in the wal-header
64 **
65 **    (2) The checksum values in the final 8 bytes of the frame-header
66 **        exactly match the checksum computed consecutively on the
67 **        WAL header and the first 8 bytes and the content of all frames
68 **        up to and including the current frame.
69 **
70 ** The checksum is computed using 32-bit big-endian integers if the
71 ** magic number in the first 4 bytes of the WAL is 0x377f0683 and it
72 ** is computed using little-endian if the magic number is 0x377f0682.
73 ** The checksum values are always stored in the frame header in a
74 ** big-endian format regardless of which byte order is used to compute
75 ** the checksum.  The checksum is computed by interpreting the input as
76 ** an even number of unsigned 32-bit integers: x[0] through x[N].  The
77 ** algorithm used for the checksum is as follows:
78 **
79 **   for i from 0 to n-1 step 2:
80 **     s0 += x[i] + s1;
81 **     s1 += x[i+1] + s0;
82 **   endfor
83 **
84 ** Note that s0 and s1 are both weighted checksums using fibonacci weights
85 ** in reverse order (the largest fibonacci weight occurs on the first element
86 ** of the sequence being summed.)  The s1 value spans all 32-bit
87 ** terms of the sequence whereas s0 omits the final term.
88 **
89 ** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the
90 ** WAL is transferred into the database, then the database is VFS.xSync-ed.
91 ** The VFS.xSync operations serve as write barriers - all writes launched
92 ** before the xSync must complete before any write that launches after the
93 ** xSync begins.
94 **
95 ** After each checkpoint, the salt-1 value is incremented and the salt-2
96 ** value is randomized.  This prevents old and new frames in the WAL from
97 ** being considered valid at the same time and being checkpointing together
98 ** following a crash.
99 **
100 ** READER ALGORITHM
101 **
102 ** To read a page from the database (call it page number P), a reader
103 ** first checks the WAL to see if it contains page P.  If so, then the
104 ** last valid instance of page P that is a followed by a commit frame
105 ** or is a commit frame itself becomes the value read.  If the WAL
106 ** contains no copies of page P that are valid and which are a commit
107 ** frame or are followed by a commit frame, then page P is read from
108 ** the database file.
109 **
110 ** To start a read transaction, the reader records the index of the last
111 ** valid frame in the WAL.  The reader uses this recorded "mxFrame" value
112 ** for all subsequent read operations.  New transactions can be appended
113 ** to the WAL, but as long as the reader uses its original mxFrame value
114 ** and ignores the newly appended content, it will see a consistent snapshot
115 ** of the database from a single point in time.  This technique allows
116 ** multiple concurrent readers to view different versions of the database
117 ** content simultaneously.
118 **
119 ** The reader algorithm in the previous paragraphs works correctly, but
120 ** because frames for page P can appear anywhere within the WAL, the
121 ** reader has to scan the entire WAL looking for page P frames.  If the
122 ** WAL is large (multiple megabytes is typical) that scan can be slow,
123 ** and read performance suffers.  To overcome this problem, a separate
124 ** data structure called the wal-index is maintained to expedite the
125 ** search for frames of a particular page.
126 **
127 ** WAL-INDEX FORMAT
128 **
129 ** Conceptually, the wal-index is shared memory, though VFS implementations
130 ** might choose to implement the wal-index using a mmapped file.  Because
131 ** the wal-index is shared memory, SQLite does not support journal_mode=WAL
132 ** on a network filesystem.  All users of the database must be able to
133 ** share memory.
134 **
135 ** The wal-index is transient.  After a crash, the wal-index can (and should
136 ** be) reconstructed from the original WAL file.  In fact, the VFS is required
137 ** to either truncate or zero the header of the wal-index when the last
138 ** connection to it closes.  Because the wal-index is transient, it can
139 ** use an architecture-specific format; it does not have to be cross-platform.
140 ** Hence, unlike the database and WAL file formats which store all values
141 ** as big endian, the wal-index can store multi-byte values in the native
142 ** byte order of the host computer.
143 **
144 ** The purpose of the wal-index is to answer this question quickly:  Given
145 ** a page number P, return the index of the last frame for page P in the WAL,
146 ** or return NULL if there are no frames for page P in the WAL.
147 **
148 ** The wal-index consists of a header region, followed by an one or
149 ** more index blocks.
150 **
151 ** The wal-index header contains the total number of frames within the WAL
152 ** in the the mxFrame field.
153 **
154 ** Each index block except for the first contains information on
155 ** HASHTABLE_NPAGE frames. The first index block contains information on
156 ** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and
157 ** HASHTABLE_NPAGE are selected so that together the wal-index header and
158 ** first index block are the same size as all other index blocks in the
159 ** wal-index.
160 **
161 ** Each index block contains two sections, a page-mapping that contains the
162 ** database page number associated with each wal frame, and a hash-table
163 ** that allows readers to query an index block for a specific page number.
164 ** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
165 ** for the first index block) 32-bit page numbers. The first entry in the
166 ** first index-block contains the database page number corresponding to the
167 ** first frame in the WAL file. The first entry in the second index block
168 ** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
169 ** the log, and so on.
170 **
171 ** The last index block in a wal-index usually contains less than the full
172 ** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
173 ** depending on the contents of the WAL file. This does not change the
174 ** allocated size of the page-mapping array - the page-mapping array merely
175 ** contains unused entries.
176 **
177 ** Even without using the hash table, the last frame for page P
178 ** can be found by scanning the page-mapping sections of each index block
179 ** starting with the last index block and moving toward the first, and
180 ** within each index block, starting at the end and moving toward the
181 ** beginning.  The first entry that equals P corresponds to the frame
182 ** holding the content for that page.
183 **
184 ** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
185 ** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
186 ** hash table for each page number in the mapping section, so the hash
187 ** table is never more than half full.  The expected number of collisions
188 ** prior to finding a match is 1.  Each entry of the hash table is an
189 ** 1-based index of an entry in the mapping section of the same
190 ** index block.   Let K be the 1-based index of the largest entry in
191 ** the mapping section.  (For index blocks other than the last, K will
192 ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
193 ** K will be (mxFrame%HASHTABLE_NPAGE).)  Unused slots of the hash table
194 ** contain a value of 0.
195 **
196 ** To look for page P in the hash table, first compute a hash iKey on
197 ** P as follows:
198 **
199 **      iKey = (P * 383) % HASHTABLE_NSLOT
200 **
201 ** Then start scanning entries of the hash table, starting with iKey
202 ** (wrapping around to the beginning when the end of the hash table is
203 ** reached) until an unused hash slot is found. Let the first unused slot
204 ** be at index iUnused.  (iUnused might be less than iKey if there was
205 ** wrap-around.) Because the hash table is never more than half full,
206 ** the search is guaranteed to eventually hit an unused entry.  Let
207 ** iMax be the value between iKey and iUnused, closest to iUnused,
208 ** where aHash[iMax]==P.  If there is no iMax entry (if there exists
209 ** no hash slot such that aHash[i]==p) then page P is not in the
210 ** current index block.  Otherwise the iMax-th mapping entry of the
211 ** current index block corresponds to the last entry that references
212 ** page P.
213 **
214 ** A hash search begins with the last index block and moves toward the
215 ** first index block, looking for entries corresponding to page P.  On
216 ** average, only two or three slots in each index block need to be
217 ** examined in order to either find the last entry for page P, or to
218 ** establish that no such entry exists in the block.  Each index block
219 ** holds over 4000 entries.  So two or three index blocks are sufficient
220 ** to cover a typical 10 megabyte WAL file, assuming 1K pages.  8 or 10
221 ** comparisons (on average) suffice to either locate a frame in the
222 ** WAL or to establish that the frame does not exist in the WAL.  This
223 ** is much faster than scanning the entire 10MB WAL.
224 **
225 ** Note that entries are added in order of increasing K.  Hence, one
226 ** reader might be using some value K0 and a second reader that started
227 ** at a later time (after additional transactions were added to the WAL
228 ** and to the wal-index) might be using a different value K1, where K1>K0.
229 ** Both readers can use the same hash table and mapping section to get
230 ** the correct result.  There may be entries in the hash table with
231 ** K>K0 but to the first reader, those entries will appear to be unused
232 ** slots in the hash table and so the first reader will get an answer as
233 ** if no values greater than K0 had ever been inserted into the hash table
234 ** in the first place - which is what reader one wants.  Meanwhile, the
235 ** second reader using K1 will see additional values that were inserted
236 ** later, which is exactly what reader two wants.
237 **
238 ** When a rollback occurs, the value of K is decreased. Hash table entries
239 ** that correspond to frames greater than the new K value are removed
240 ** from the hash table at this point.
241 */
242 #ifndef SQLITE_OMIT_WAL
243 
244 #include "wal.h"
245 
246 /*
247 ** Trace output macros
248 */
249 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
250 int sqlite3WalTrace = 0;
251 # define WALTRACE(X)  if(sqlite3WalTrace) sqlite3DebugPrintf X
252 #else
253 # define WALTRACE(X)
254 #endif
255 
256 /*
257 ** The maximum (and only) versions of the wal and wal-index formats
258 ** that may be interpreted by this version of SQLite.
259 **
260 ** If a client begins recovering a WAL file and finds that (a) the checksum
261 ** values in the wal-header are correct and (b) the version field is not
262 ** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.
263 **
264 ** Similarly, if a client successfully reads a wal-index header (i.e. the
265 ** checksum test is successful) and finds that the version field is not
266 ** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite
267 ** returns SQLITE_CANTOPEN.
268 */
269 #define WAL_MAX_VERSION      3007000
270 #define WALINDEX_MAX_VERSION 3007000
271 
272 /*
273 ** Indices of various locking bytes.   WAL_NREADER is the number
274 ** of available reader locks and should be at least 3.
275 */
276 #define WAL_WRITE_LOCK         0
277 #define WAL_ALL_BUT_WRITE      1
278 #define WAL_CKPT_LOCK          1
279 #define WAL_RECOVER_LOCK       2
280 #define WAL_READ_LOCK(I)       (3+(I))
281 #define WAL_NREADER            (SQLITE_SHM_NLOCK-3)
282 
283 
284 /* Object declarations */
285 typedef struct WalIndexHdr WalIndexHdr;
286 typedef struct WalIterator WalIterator;
287 typedef struct WalCkptInfo WalCkptInfo;
288 
289 
290 /*
291 ** The following object holds a copy of the wal-index header content.
292 **
293 ** The actual header in the wal-index consists of two copies of this
294 ** object.
295 */
296 struct WalIndexHdr {
297   u32 iVersion;                   /* Wal-index version */
298   u32 unused;                     /* Unused (padding) field */
299   u32 iChange;                    /* Counter incremented each transaction */
300   u8 isInit;                      /* 1 when initialized */
301   u8 bigEndCksum;                 /* True if checksums in WAL are big-endian */
302   u16 szPage;                     /* Database page size in bytes */
303   u32 mxFrame;                    /* Index of last valid frame in the WAL */
304   u32 nPage;                      /* Size of database in pages */
305   u32 aFrameCksum[2];             /* Checksum of last frame in log */
306   u32 aSalt[2];                   /* Two salt values copied from WAL header */
307   u32 aCksum[2];                  /* Checksum over all prior fields */
308 };
309 
310 /*
311 ** A copy of the following object occurs in the wal-index immediately
312 ** following the second copy of the WalIndexHdr.  This object stores
313 ** information used by checkpoint.
314 **
315 ** nBackfill is the number of frames in the WAL that have been written
316 ** back into the database. (We call the act of moving content from WAL to
317 ** database "backfilling".)  The nBackfill number is never greater than
318 ** WalIndexHdr.mxFrame.  nBackfill can only be increased by threads
319 ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
320 ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
321 ** mxFrame back to zero when the WAL is reset.
322 **
323 ** There is one entry in aReadMark[] for each reader lock.  If a reader
324 ** holds read-lock K, then the value in aReadMark[K] is no greater than
325 ** the mxFrame for that reader.  The value READMARK_NOT_USED (0xffffffff)
326 ** for any aReadMark[] means that entry is unused.  aReadMark[0] is
327 ** a special case; its value is never used and it exists as a place-holder
328 ** to avoid having to offset aReadMark[] indexs by one.  Readers holding
329 ** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
330 ** directly from the database.
331 **
332 ** The value of aReadMark[K] may only be changed by a thread that
333 ** is holding an exclusive lock on WAL_READ_LOCK(K).  Thus, the value of
334 ** aReadMark[K] cannot changed while there is a reader is using that mark
335 ** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
336 **
337 ** The checkpointer may only transfer frames from WAL to database where
338 ** the frame numbers are less than or equal to every aReadMark[] that is
339 ** in use (that is, every aReadMark[j] for which there is a corresponding
340 ** WAL_READ_LOCK(j)).  New readers (usually) pick the aReadMark[] with the
341 ** largest value and will increase an unused aReadMark[] to mxFrame if there
342 ** is not already an aReadMark[] equal to mxFrame.  The exception to the
343 ** previous sentence is when nBackfill equals mxFrame (meaning that everything
344 ** in the WAL has been backfilled into the database) then new readers
345 ** will choose aReadMark[0] which has value 0 and hence such reader will
346 ** get all their all content directly from the database file and ignore
347 ** the WAL.
348 **
349 ** Writers normally append new frames to the end of the WAL.  However,
350 ** if nBackfill equals mxFrame (meaning that all WAL content has been
351 ** written back into the database) and if no readers are using the WAL
352 ** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
353 ** the writer will first "reset" the WAL back to the beginning and start
354 ** writing new content beginning at frame 1.
355 **
356 ** We assume that 32-bit loads are atomic and so no locks are needed in
357 ** order to read from any aReadMark[] entries.
358 */
359 struct WalCkptInfo {
360   u32 nBackfill;                  /* Number of WAL frames backfilled into DB */
361   u32 aReadMark[WAL_NREADER];     /* Reader marks */
362 };
363 #define READMARK_NOT_USED  0xffffffff
364 
365 
366 /* A block of WALINDEX_LOCK_RESERVED bytes beginning at
367 ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
368 ** only support mandatory file-locks, we do not read or write data
369 ** from the region of the file on which locks are applied.
370 */
371 #define WALINDEX_LOCK_OFFSET   (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
372 #define WALINDEX_LOCK_RESERVED 16
373 #define WALINDEX_HDR_SIZE      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
374 
375 /* Size of header before each frame in wal */
376 #define WAL_FRAME_HDRSIZE 24
377 
378 /* Size of write ahead log header, including checksum. */
379 /* #define WAL_HDRSIZE 24 */
380 #define WAL_HDRSIZE 32
381 
382 /* WAL magic value. Either this value, or the same value with the least
383 ** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit
384 ** big-endian format in the first 4 bytes of a WAL file.
385 **
386 ** If the LSB is set, then the checksums for each frame within the WAL
387 ** file are calculated by treating all data as an array of 32-bit
388 ** big-endian words. Otherwise, they are calculated by interpreting
389 ** all data as 32-bit little-endian words.
390 */
391 #define WAL_MAGIC 0x377f0682
392 
393 /*
394 ** Return the offset of frame iFrame in the write-ahead log file,
395 ** assuming a database page size of szPage bytes. The offset returned
396 ** is to the start of the write-ahead log frame-header.
397 */
398 #define walFrameOffset(iFrame, szPage) (                               \
399   WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE)         \
400 )
401 
402 /*
403 ** An open write-ahead log file is represented by an instance of the
404 ** following object.
405 */
406 struct Wal {
407   sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
408   sqlite3_file *pDbFd;       /* File handle for the database file */
409   sqlite3_file *pWalFd;      /* File handle for WAL file */
410   u32 iCallback;             /* Value to pass to log callback (or 0) */
411   int nWiData;               /* Size of array apWiData */
412   volatile u32 **apWiData;   /* Pointer to wal-index content in memory */
413   u16 szPage;                /* Database page size */
414   i16 readLock;              /* Which read lock is being held.  -1 for none */
415   u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
416   u8 writeLock;              /* True if in a write transaction */
417   u8 ckptLock;               /* True if holding a checkpoint lock */
418   u8 readOnly;               /* True if the WAL file is open read-only */
419   WalIndexHdr hdr;           /* Wal-index header for current transaction */
420   const char *zWalName;      /* Name of WAL file */
421   u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
422 #ifdef SQLITE_DEBUG
423   u8 lockError;              /* True if a locking error has occurred */
424 #endif
425 };
426 
427 /*
428 ** Each page of the wal-index mapping contains a hash-table made up of
429 ** an array of HASHTABLE_NSLOT elements of the following type.
430 */
431 typedef u16 ht_slot;
432 
433 /*
434 ** This structure is used to implement an iterator that loops through
435 ** all frames in the WAL in database page order. Where two or more frames
436 ** correspond to the same database page, the iterator visits only the
437 ** frame most recently written to the WAL (in other words, the frame with
438 ** the largest index).
439 **
440 ** The internals of this structure are only accessed by:
441 **
442 **   walIteratorInit() - Create a new iterator,
443 **   walIteratorNext() - Step an iterator,
444 **   walIteratorFree() - Free an iterator.
445 **
446 ** This functionality is used by the checkpoint code (see walCheckpoint()).
447 */
448 struct WalIterator {
449   int iPrior;                     /* Last result returned from the iterator */
450   int nSegment;                   /* Size of the aSegment[] array */
451   struct WalSegment {
452     int iNext;                    /* Next slot in aIndex[] not yet returned */
453     ht_slot *aIndex;              /* i0, i1, i2... such that aPgno[iN] ascend */
454     u32 *aPgno;                   /* Array of page numbers. */
455     int nEntry;                   /* Max size of aPgno[] and aIndex[] arrays */
456     int iZero;                    /* Frame number associated with aPgno[0] */
457   } aSegment[1];                  /* One for every 32KB page in the WAL */
458 };
459 
460 /*
461 ** Define the parameters of the hash tables in the wal-index file. There
462 ** is a hash-table following every HASHTABLE_NPAGE page numbers in the
463 ** wal-index.
464 **
465 ** Changing any of these constants will alter the wal-index format and
466 ** create incompatibilities.
467 */
468 #define HASHTABLE_NPAGE      4096                 /* Must be power of 2 */
469 #define HASHTABLE_HASH_1     383                  /* Should be prime */
470 #define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
471 
472 /*
473 ** The block of page numbers associated with the first hash-table in a
474 ** wal-index is smaller than usual. This is so that there is a complete
475 ** hash-table on each aligned 32KB page of the wal-index.
476 */
477 #define HASHTABLE_NPAGE_ONE  (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
478 
479 /* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
480 #define WALINDEX_PGSZ   (                                         \
481     sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
482 )
483 
484 /*
485 ** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
486 ** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
487 ** numbered from zero.
488 **
489 ** If this call is successful, *ppPage is set to point to the wal-index
490 ** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
491 ** then an SQLite error code is returned and *ppPage is set to 0.
492 */
493 static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
494   int rc = SQLITE_OK;
495 
496   /* Enlarge the pWal->apWiData[] array if required */
497   if( pWal->nWiData<=iPage ){
498     int nByte = sizeof(u32*)*(iPage+1);
499     volatile u32 **apNew;
500     apNew = (volatile u32 **)sqlite3_realloc((void *)pWal->apWiData, nByte);
501     if( !apNew ){
502       *ppPage = 0;
503       return SQLITE_NOMEM;
504     }
505     memset((void*)&apNew[pWal->nWiData], 0,
506            sizeof(u32*)*(iPage+1-pWal->nWiData));
507     pWal->apWiData = apNew;
508     pWal->nWiData = iPage+1;
509   }
510 
511   /* Request a pointer to the required page from the VFS */
512   if( pWal->apWiData[iPage]==0 ){
513     rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ,
514         pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
515     );
516   }
517 
518   *ppPage = pWal->apWiData[iPage];
519   assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
520   return rc;
521 }
522 
523 /*
524 ** Return a pointer to the WalCkptInfo structure in the wal-index.
525 */
526 static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
527   assert( pWal->nWiData>0 && pWal->apWiData[0] );
528   return (volatile WalCkptInfo*)&(pWal->apWiData[0][sizeof(WalIndexHdr)/2]);
529 }
530 
531 /*
532 ** Return a pointer to the WalIndexHdr structure in the wal-index.
533 */
534 static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
535   assert( pWal->nWiData>0 && pWal->apWiData[0] );
536   return (volatile WalIndexHdr*)pWal->apWiData[0];
537 }
538 
539 /*
540 ** The argument to this macro must be of type u32. On a little-endian
541 ** architecture, it returns the u32 value that results from interpreting
542 ** the 4 bytes as a big-endian value. On a big-endian architecture, it
543 ** returns the value that would be produced by intepreting the 4 bytes
544 ** of the input value as a little-endian integer.
545 */
546 #define BYTESWAP32(x) ( \
547     (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8)  \
548   + (((x)&0x00FF0000)>>8)  + (((x)&0xFF000000)>>24) \
549 )
550 
551 /*
552 ** Generate or extend an 8 byte checksum based on the data in
553 ** array aByte[] and the initial values of aIn[0] and aIn[1] (or
554 ** initial values of 0 and 0 if aIn==NULL).
555 **
556 ** The checksum is written back into aOut[] before returning.
557 **
558 ** nByte must be a positive multiple of 8.
559 */
560 static void walChecksumBytes(
561   int nativeCksum, /* True for native byte-order, false for non-native */
562   u8 *a,           /* Content to be checksummed */
563   int nByte,       /* Bytes of content in a[].  Must be a multiple of 8. */
564   const u32 *aIn,  /* Initial checksum value input */
565   u32 *aOut        /* OUT: Final checksum value output */
566 ){
567   u32 s1, s2;
568   u32 *aData = (u32 *)a;
569   u32 *aEnd = (u32 *)&a[nByte];
570 
571   if( aIn ){
572     s1 = aIn[0];
573     s2 = aIn[1];
574   }else{
575     s1 = s2 = 0;
576   }
577 
578   assert( nByte>=8 );
579   assert( (nByte&0x00000007)==0 );
580 
581   if( nativeCksum ){
582     do {
583       s1 += *aData++ + s2;
584       s2 += *aData++ + s1;
585     }while( aData<aEnd );
586   }else{
587     do {
588       s1 += BYTESWAP32(aData[0]) + s2;
589       s2 += BYTESWAP32(aData[1]) + s1;
590       aData += 2;
591     }while( aData<aEnd );
592   }
593 
594   aOut[0] = s1;
595   aOut[1] = s2;
596 }
597 
598 /*
599 ** Write the header information in pWal->hdr into the wal-index.
600 **
601 ** The checksum on pWal->hdr is updated before it is written.
602 */
603 static void walIndexWriteHdr(Wal *pWal){
604   volatile WalIndexHdr *aHdr = walIndexHdr(pWal);
605   const int nCksum = offsetof(WalIndexHdr, aCksum);
606 
607   assert( pWal->writeLock );
608   pWal->hdr.isInit = 1;
609   pWal->hdr.iVersion = WALINDEX_MAX_VERSION;
610   walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);
611   memcpy((void *)&aHdr[1], (void *)&pWal->hdr, sizeof(WalIndexHdr));
612   sqlite3OsShmBarrier(pWal->pDbFd);
613   memcpy((void *)&aHdr[0], (void *)&pWal->hdr, sizeof(WalIndexHdr));
614 }
615 
616 /*
617 ** This function encodes a single frame header and writes it to a buffer
618 ** supplied by the caller. A frame-header is made up of a series of
619 ** 4-byte big-endian integers, as follows:
620 **
621 **     0: Page number.
622 **     4: For commit records, the size of the database image in pages
623 **        after the commit. For all other records, zero.
624 **     8: Salt-1 (copied from the wal-header)
625 **    12: Salt-2 (copied from the wal-header)
626 **    16: Checksum-1.
627 **    20: Checksum-2.
628 */
629 static void walEncodeFrame(
630   Wal *pWal,                      /* The write-ahead log */
631   u32 iPage,                      /* Database page number for frame */
632   u32 nTruncate,                  /* New db size (or 0 for non-commit frames) */
633   u8 *aData,                      /* Pointer to page data */
634   u8 *aFrame                      /* OUT: Write encoded frame here */
635 ){
636   int nativeCksum;                /* True for native byte-order checksums */
637   u32 *aCksum = pWal->hdr.aFrameCksum;
638   assert( WAL_FRAME_HDRSIZE==24 );
639   sqlite3Put4byte(&aFrame[0], iPage);
640   sqlite3Put4byte(&aFrame[4], nTruncate);
641   memcpy(&aFrame[8], pWal->hdr.aSalt, 8);
642 
643   nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
644   walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
645   walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);
646 
647   sqlite3Put4byte(&aFrame[16], aCksum[0]);
648   sqlite3Put4byte(&aFrame[20], aCksum[1]);
649 }
650 
651 /*
652 ** Check to see if the frame with header in aFrame[] and content
653 ** in aData[] is valid.  If it is a valid frame, fill *piPage and
654 ** *pnTruncate and return true.  Return if the frame is not valid.
655 */
656 static int walDecodeFrame(
657   Wal *pWal,                      /* The write-ahead log */
658   u32 *piPage,                    /* OUT: Database page number for frame */
659   u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
660   u8 *aData,                      /* Pointer to page data (for checksum) */
661   u8 *aFrame                      /* Frame data */
662 ){
663   int nativeCksum;                /* True for native byte-order checksums */
664   u32 *aCksum = pWal->hdr.aFrameCksum;
665   u32 pgno;                       /* Page number of the frame */
666   assert( WAL_FRAME_HDRSIZE==24 );
667 
668   /* A frame is only valid if the salt values in the frame-header
669   ** match the salt values in the wal-header.
670   */
671   if( memcmp(&pWal->hdr.aSalt, &aFrame[8], 8)!=0 ){
672     return 0;
673   }
674 
675   /* A frame is only valid if the page number is creater than zero.
676   */
677   pgno = sqlite3Get4byte(&aFrame[0]);
678   if( pgno==0 ){
679     return 0;
680   }
681 
682   /* A frame is only valid if a checksum of the WAL header,
683   ** all prior frams, the first 16 bytes of this frame-header,
684   ** and the frame-data matches the checksum in the last 8
685   ** bytes of this frame-header.
686   */
687   nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
688   walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
689   walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);
690   if( aCksum[0]!=sqlite3Get4byte(&aFrame[16])
691    || aCksum[1]!=sqlite3Get4byte(&aFrame[20])
692   ){
693     /* Checksum failed. */
694     return 0;
695   }
696 
697   /* If we reach this point, the frame is valid.  Return the page number
698   ** and the new database size.
699   */
700   *piPage = pgno;
701   *pnTruncate = sqlite3Get4byte(&aFrame[4]);
702   return 1;
703 }
704 
705 
706 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
707 /*
708 ** Names of locks.  This routine is used to provide debugging output and is not
709 ** a part of an ordinary build.
710 */
711 static const char *walLockName(int lockIdx){
712   if( lockIdx==WAL_WRITE_LOCK ){
713     return "WRITE-LOCK";
714   }else if( lockIdx==WAL_CKPT_LOCK ){
715     return "CKPT-LOCK";
716   }else if( lockIdx==WAL_RECOVER_LOCK ){
717     return "RECOVER-LOCK";
718   }else{
719     static char zName[15];
720     sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
721                      lockIdx-WAL_READ_LOCK(0));
722     return zName;
723   }
724 }
725 #endif /*defined(SQLITE_TEST) || defined(SQLITE_DEBUG) */
726 
727 
728 /*
729 ** Set or release locks on the WAL.  Locks are either shared or exclusive.
730 ** A lock cannot be moved directly between shared and exclusive - it must go
731 ** through the unlocked state first.
732 **
733 ** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
734 */
735 static int walLockShared(Wal *pWal, int lockIdx){
736   int rc;
737   if( pWal->exclusiveMode ) return SQLITE_OK;
738   rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
739                         SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
740   WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
741             walLockName(lockIdx), rc ? "failed" : "ok"));
742   VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
743   return rc;
744 }
745 static void walUnlockShared(Wal *pWal, int lockIdx){
746   if( pWal->exclusiveMode ) return;
747   (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
748                          SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
749   WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
750 }
751 static int walLockExclusive(Wal *pWal, int lockIdx, int n){
752   int rc;
753   if( pWal->exclusiveMode ) return SQLITE_OK;
754   rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
755                         SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
756   WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
757             walLockName(lockIdx), n, rc ? "failed" : "ok"));
758   VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )
759   return rc;
760 }
761 static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
762   if( pWal->exclusiveMode ) return;
763   (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
764                          SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
765   WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
766              walLockName(lockIdx), n));
767 }
768 
769 /*
770 ** Compute a hash on a page number.  The resulting hash value must land
771 ** between 0 and (HASHTABLE_NSLOT-1).  The walHashNext() function advances
772 ** the hash to the next value in the event of a collision.
773 */
774 static int walHash(u32 iPage){
775   assert( iPage>0 );
776   assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
777   return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
778 }
779 static int walNextHash(int iPriorHash){
780   return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
781 }
782 
783 /*
784 ** Return pointers to the hash table and page number array stored on
785 ** page iHash of the wal-index. The wal-index is broken into 32KB pages
786 ** numbered starting from 0.
787 **
788 ** Set output variable *paHash to point to the start of the hash table
789 ** in the wal-index file. Set *piZero to one less than the frame
790 ** number of the first frame indexed by this hash table. If a
791 ** slot in the hash table is set to N, it refers to frame number
792 ** (*piZero+N) in the log.
793 **
794 ** Finally, set *paPgno so that *paPgno[1] is the page number of the
795 ** first frame indexed by the hash table, frame (*piZero+1).
796 */
797 static int walHashGet(
798   Wal *pWal,                      /* WAL handle */
799   int iHash,                      /* Find the iHash'th table */
800   volatile ht_slot **paHash,      /* OUT: Pointer to hash index */
801   volatile u32 **paPgno,          /* OUT: Pointer to page number array */
802   u32 *piZero                     /* OUT: Frame associated with *paPgno[0] */
803 ){
804   int rc;                         /* Return code */
805   volatile u32 *aPgno;
806 
807   rc = walIndexPage(pWal, iHash, &aPgno);
808   assert( rc==SQLITE_OK || iHash>0 );
809 
810   if( rc==SQLITE_OK ){
811     u32 iZero;
812     volatile ht_slot *aHash;
813 
814     aHash = (volatile ht_slot *)&aPgno[HASHTABLE_NPAGE];
815     if( iHash==0 ){
816       aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];
817       iZero = 0;
818     }else{
819       iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
820     }
821 
822     *paPgno = &aPgno[-1];
823     *paHash = aHash;
824     *piZero = iZero;
825   }
826   return rc;
827 }
828 
829 /*
830 ** Return the number of the wal-index page that contains the hash-table
831 ** and page-number array that contain entries corresponding to WAL frame
832 ** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages
833 ** are numbered starting from 0.
834 */
835 static int walFramePage(u32 iFrame){
836   int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;
837   assert( (iHash==0 || iFrame>HASHTABLE_NPAGE_ONE)
838        && (iHash>=1 || iFrame<=HASHTABLE_NPAGE_ONE)
839        && (iHash<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE))
840        && (iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE)
841        && (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE))
842   );
843   return iHash;
844 }
845 
846 /*
847 ** Return the page number associated with frame iFrame in this WAL.
848 */
849 static u32 walFramePgno(Wal *pWal, u32 iFrame){
850   int iHash = walFramePage(iFrame);
851   if( iHash==0 ){
852     return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
853   }
854   return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
855 }
856 
857 /*
858 ** Remove entries from the hash table that point to WAL slots greater
859 ** than pWal->hdr.mxFrame.
860 **
861 ** This function is called whenever pWal->hdr.mxFrame is decreased due
862 ** to a rollback or savepoint.
863 **
864 ** At most only the hash table containing pWal->hdr.mxFrame needs to be
865 ** updated.  Any later hash tables will be automatically cleared when
866 ** pWal->hdr.mxFrame advances to the point where those hash tables are
867 ** actually needed.
868 */
869 static void walCleanupHash(Wal *pWal){
870   volatile ht_slot *aHash = 0;    /* Pointer to hash table to clear */
871   volatile u32 *aPgno = 0;        /* Page number array for hash table */
872   u32 iZero = 0;                  /* frame == (aHash[x]+iZero) */
873   int iLimit = 0;                 /* Zero values greater than this */
874   int nByte;                      /* Number of bytes to zero in aPgno[] */
875   int i;                          /* Used to iterate through aHash[] */
876 
877   assert( pWal->writeLock );
878   testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );
879   testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE );
880   testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );
881 
882   if( pWal->hdr.mxFrame==0 ) return;
883 
884   /* Obtain pointers to the hash-table and page-number array containing
885   ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed
886   ** that the page said hash-table and array reside on is already mapped.
887   */
888   assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) );
889   assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] );
890   walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &aHash, &aPgno, &iZero);
891 
892   /* Zero all hash-table entries that correspond to frame numbers greater
893   ** than pWal->hdr.mxFrame.
894   */
895   iLimit = pWal->hdr.mxFrame - iZero;
896   assert( iLimit>0 );
897   for(i=0; i<HASHTABLE_NSLOT; i++){
898     if( aHash[i]>iLimit ){
899       aHash[i] = 0;
900     }
901   }
902 
903   /* Zero the entries in the aPgno array that correspond to frames with
904   ** frame numbers greater than pWal->hdr.mxFrame.
905   */
906   nByte = (int)((char *)aHash - (char *)&aPgno[iLimit+1]);
907   memset((void *)&aPgno[iLimit+1], 0, nByte);
908 
909 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
910   /* Verify that the every entry in the mapping region is still reachable
911   ** via the hash table even after the cleanup.
912   */
913   if( iLimit ){
914     int i;           /* Loop counter */
915     int iKey;        /* Hash key */
916     for(i=1; i<=iLimit; i++){
917       for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
918         if( aHash[iKey]==i ) break;
919       }
920       assert( aHash[iKey]==i );
921     }
922   }
923 #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
924 }
925 
926 
927 /*
928 ** Set an entry in the wal-index that will map database page number
929 ** pPage into WAL frame iFrame.
930 */
931 static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
932   int rc;                         /* Return code */
933   u32 iZero = 0;                  /* One less than frame number of aPgno[1] */
934   volatile u32 *aPgno = 0;        /* Page number array */
935   volatile ht_slot *aHash = 0;    /* Hash table */
936 
937   rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);
938 
939   /* Assuming the wal-index file was successfully mapped, populate the
940   ** page number array and hash table entry.
941   */
942   if( rc==SQLITE_OK ){
943     int iKey;                     /* Hash table key */
944     int idx;                      /* Value to write to hash-table slot */
945     int nCollide;                 /* Number of hash collisions */
946 
947     idx = iFrame - iZero;
948     assert( idx <= HASHTABLE_NSLOT/2 + 1 );
949 
950     /* If this is the first entry to be added to this hash-table, zero the
951     ** entire hash table and aPgno[] array before proceding.
952     */
953     if( idx==1 ){
954       int nByte = (int)((u8 *)&aHash[HASHTABLE_NSLOT] - (u8 *)&aPgno[1]);
955       memset((void*)&aPgno[1], 0, nByte);
956     }
957 
958     /* If the entry in aPgno[] is already set, then the previous writer
959     ** must have exited unexpectedly in the middle of a transaction (after
960     ** writing one or more dirty pages to the WAL to free up memory).
961     ** Remove the remnants of that writers uncommitted transaction from
962     ** the hash-table before writing any new entries.
963     */
964     if( aPgno[idx] ){
965       walCleanupHash(pWal);
966       assert( !aPgno[idx] );
967     }
968 
969     /* Write the aPgno[] array entry and the hash-table slot. */
970     nCollide = idx;
971     for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){
972       if( (nCollide--)==0 ) return SQLITE_CORRUPT_BKPT;
973     }
974     aPgno[idx] = iPage;
975     aHash[iKey] = (ht_slot)idx;
976 
977 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
978     /* Verify that the number of entries in the hash table exactly equals
979     ** the number of entries in the mapping region.
980     */
981     {
982       int i;           /* Loop counter */
983       int nEntry = 0;  /* Number of entries in the hash table */
984       for(i=0; i<HASHTABLE_NSLOT; i++){ if( aHash[i] ) nEntry++; }
985       assert( nEntry==idx );
986     }
987 
988     /* Verify that the every entry in the mapping region is reachable
989     ** via the hash table.  This turns out to be a really, really expensive
990     ** thing to check, so only do this occasionally - not on every
991     ** iteration.
992     */
993     if( (idx&0x3ff)==0 ){
994       int i;           /* Loop counter */
995       for(i=1; i<=idx; i++){
996         for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
997           if( aHash[iKey]==i ) break;
998         }
999         assert( aHash[iKey]==i );
1000       }
1001     }
1002 #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
1003   }
1004 
1005 
1006   return rc;
1007 }
1008 
1009 
1010 /*
1011 ** Recover the wal-index by reading the write-ahead log file.
1012 **
1013 ** This routine first tries to establish an exclusive lock on the
1014 ** wal-index to prevent other threads/processes from doing anything
1015 ** with the WAL or wal-index while recovery is running.  The
1016 ** WAL_RECOVER_LOCK is also held so that other threads will know
1017 ** that this thread is running recovery.  If unable to establish
1018 ** the necessary locks, this routine returns SQLITE_BUSY.
1019 */
1020 static int walIndexRecover(Wal *pWal){
1021   int rc;                         /* Return Code */
1022   i64 nSize;                      /* Size of log file */
1023   u32 aFrameCksum[2] = {0, 0};
1024   int iLock;                      /* Lock offset to lock for checkpoint */
1025   int nLock;                      /* Number of locks to hold */
1026 
1027   /* Obtain an exclusive lock on all byte in the locking range not already
1028   ** locked by the caller. The caller is guaranteed to have locked the
1029   ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
1030   ** If successful, the same bytes that are locked here are unlocked before
1031   ** this function returns.
1032   */
1033   assert( pWal->ckptLock==1 || pWal->ckptLock==0 );
1034   assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
1035   assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
1036   assert( pWal->writeLock );
1037   iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock;
1038   nLock = SQLITE_SHM_NLOCK - iLock;
1039   rc = walLockExclusive(pWal, iLock, nLock);
1040   if( rc ){
1041     return rc;
1042   }
1043   WALTRACE(("WAL%p: recovery begin...\n", pWal));
1044 
1045   memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
1046 
1047   rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);
1048   if( rc!=SQLITE_OK ){
1049     goto recovery_error;
1050   }
1051 
1052   if( nSize>WAL_HDRSIZE ){
1053     u8 aBuf[WAL_HDRSIZE];         /* Buffer to load WAL header into */
1054     u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
1055     int szFrame;                  /* Number of bytes in buffer aFrame[] */
1056     u8 *aData;                    /* Pointer to data part of aFrame buffer */
1057     int iFrame;                   /* Index of last frame read */
1058     i64 iOffset;                  /* Next offset to read from log file */
1059     int szPage;                   /* Page size according to the log */
1060     u32 magic;                    /* Magic value read from WAL header */
1061     u32 version;                  /* Magic value read from WAL header */
1062 
1063     /* Read in the WAL header. */
1064     rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
1065     if( rc!=SQLITE_OK ){
1066       goto recovery_error;
1067     }
1068 
1069     /* If the database page size is not a power of two, or is greater than
1070     ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid
1071     ** data. Similarly, if the 'magic' value is invalid, ignore the whole
1072     ** WAL file.
1073     */
1074     magic = sqlite3Get4byte(&aBuf[0]);
1075     szPage = sqlite3Get4byte(&aBuf[8]);
1076     if( (magic&0xFFFFFFFE)!=WAL_MAGIC
1077      || szPage&(szPage-1)
1078      || szPage>SQLITE_MAX_PAGE_SIZE
1079      || szPage<512
1080     ){
1081       goto finished;
1082     }
1083     pWal->hdr.bigEndCksum = (u8)(magic&0x00000001);
1084     pWal->szPage = (u16)szPage;
1085     pWal->nCkpt = sqlite3Get4byte(&aBuf[12]);
1086     memcpy(&pWal->hdr.aSalt, &aBuf[16], 8);
1087 
1088     /* Verify that the WAL header checksum is correct */
1089     walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN,
1090         aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum
1091     );
1092     if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])
1093      || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])
1094     ){
1095       goto finished;
1096     }
1097 
1098     /* Verify that the version number on the WAL format is one that
1099     ** are able to understand */
1100     version = sqlite3Get4byte(&aBuf[4]);
1101     if( version!=WAL_MAX_VERSION ){
1102       rc = SQLITE_CANTOPEN_BKPT;
1103       goto finished;
1104     }
1105 
1106     /* Malloc a buffer to read frames into. */
1107     szFrame = szPage + WAL_FRAME_HDRSIZE;
1108     aFrame = (u8 *)sqlite3_malloc(szFrame);
1109     if( !aFrame ){
1110       rc = SQLITE_NOMEM;
1111       goto recovery_error;
1112     }
1113     aData = &aFrame[WAL_FRAME_HDRSIZE];
1114 
1115     /* Read all frames from the log file. */
1116     iFrame = 0;
1117     for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
1118       u32 pgno;                   /* Database page number for frame */
1119       u32 nTruncate;              /* dbsize field from frame header */
1120       int isValid;                /* True if this frame is valid */
1121 
1122       /* Read and decode the next log frame. */
1123       rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
1124       if( rc!=SQLITE_OK ) break;
1125       isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
1126       if( !isValid ) break;
1127       rc = walIndexAppend(pWal, ++iFrame, pgno);
1128       if( rc!=SQLITE_OK ) break;
1129 
1130       /* If nTruncate is non-zero, this is a commit record. */
1131       if( nTruncate ){
1132         pWal->hdr.mxFrame = iFrame;
1133         pWal->hdr.nPage = nTruncate;
1134         pWal->hdr.szPage = (u16)szPage;
1135         aFrameCksum[0] = pWal->hdr.aFrameCksum[0];
1136         aFrameCksum[1] = pWal->hdr.aFrameCksum[1];
1137       }
1138     }
1139 
1140     sqlite3_free(aFrame);
1141   }
1142 
1143 finished:
1144   if( rc==SQLITE_OK ){
1145     volatile WalCkptInfo *pInfo;
1146     int i;
1147     pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
1148     pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
1149     walIndexWriteHdr(pWal);
1150 
1151     /* Reset the checkpoint-header. This is safe because this thread is
1152     ** currently holding locks that exclude all other readers, writers and
1153     ** checkpointers.
1154     */
1155     pInfo = walCkptInfo(pWal);
1156     pInfo->nBackfill = 0;
1157     pInfo->aReadMark[0] = 0;
1158     for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
1159   }
1160 
1161 recovery_error:
1162   WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
1163   walUnlockExclusive(pWal, iLock, nLock);
1164   return rc;
1165 }
1166 
1167 /*
1168 ** Close an open wal-index.
1169 */
1170 static void walIndexClose(Wal *pWal, int isDelete){
1171   sqlite3OsShmUnmap(pWal->pDbFd, isDelete);
1172 }
1173 
1174 /*
1175 ** Open a connection to the WAL file zWalName. The database file must
1176 ** already be opened on connection pDbFd. The buffer that zWalName points
1177 ** to must remain valid for the lifetime of the returned Wal* handle.
1178 **
1179 ** A SHARED lock should be held on the database file when this function
1180 ** is called. The purpose of this SHARED lock is to prevent any other
1181 ** client from unlinking the WAL or wal-index file. If another process
1182 ** were to do this just after this client opened one of these files, the
1183 ** system would be badly broken.
1184 **
1185 ** If the log file is successfully opened, SQLITE_OK is returned and
1186 ** *ppWal is set to point to a new WAL handle. If an error occurs,
1187 ** an SQLite error code is returned and *ppWal is left unmodified.
1188 */
1189 int sqlite3WalOpen(
1190   sqlite3_vfs *pVfs,              /* vfs module to open wal and wal-index */
1191   sqlite3_file *pDbFd,            /* The open database file */
1192   const char *zWalName,           /* Name of the WAL file */
1193   Wal **ppWal                     /* OUT: Allocated Wal handle */
1194 ){
1195   int rc;                         /* Return Code */
1196   Wal *pRet;                      /* Object to allocate and return */
1197   int flags;                      /* Flags passed to OsOpen() */
1198 
1199   assert( zWalName && zWalName[0] );
1200   assert( pDbFd );
1201 
1202   /* In the amalgamation, the os_unix.c and os_win.c source files come before
1203   ** this source file.  Verify that the #defines of the locking byte offsets
1204   ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.
1205   */
1206 #ifdef WIN_SHM_BASE
1207   assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );
1208 #endif
1209 #ifdef UNIX_SHM_BASE
1210   assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET );
1211 #endif
1212 
1213 
1214   /* Allocate an instance of struct Wal to return. */
1215   *ppWal = 0;
1216   pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile);
1217   if( !pRet ){
1218     return SQLITE_NOMEM;
1219   }
1220 
1221   pRet->pVfs = pVfs;
1222   pRet->pWalFd = (sqlite3_file *)&pRet[1];
1223   pRet->pDbFd = pDbFd;
1224   pRet->readLock = -1;
1225   pRet->zWalName = zWalName;
1226 
1227   /* Open file handle on the write-ahead log file. */
1228   flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
1229   rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
1230   if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
1231     pRet->readOnly = 1;
1232   }
1233 
1234   if( rc!=SQLITE_OK ){
1235     walIndexClose(pRet, 0);
1236     sqlite3OsClose(pRet->pWalFd);
1237     sqlite3_free(pRet);
1238   }else{
1239     *ppWal = pRet;
1240     WALTRACE(("WAL%d: opened\n", pRet));
1241   }
1242   return rc;
1243 }
1244 
1245 /*
1246 ** Find the smallest page number out of all pages held in the WAL that
1247 ** has not been returned by any prior invocation of this method on the
1248 ** same WalIterator object.   Write into *piFrame the frame index where
1249 ** that page was last written into the WAL.  Write into *piPage the page
1250 ** number.
1251 **
1252 ** Return 0 on success.  If there are no pages in the WAL with a page
1253 ** number larger than *piPage, then return 1.
1254 */
1255 static int walIteratorNext(
1256   WalIterator *p,               /* Iterator */
1257   u32 *piPage,                  /* OUT: The page number of the next page */
1258   u32 *piFrame                  /* OUT: Wal frame index of next page */
1259 ){
1260   u32 iMin;                     /* Result pgno must be greater than iMin */
1261   u32 iRet = 0xFFFFFFFF;        /* 0xffffffff is never a valid page number */
1262   int i;                        /* For looping through segments */
1263 
1264   iMin = p->iPrior;
1265   assert( iMin<0xffffffff );
1266   for(i=p->nSegment-1; i>=0; i--){
1267     struct WalSegment *pSegment = &p->aSegment[i];
1268     while( pSegment->iNext<pSegment->nEntry ){
1269       u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];
1270       if( iPg>iMin ){
1271         if( iPg<iRet ){
1272           iRet = iPg;
1273           *piFrame = pSegment->iZero + pSegment->aIndex[pSegment->iNext];
1274         }
1275         break;
1276       }
1277       pSegment->iNext++;
1278     }
1279   }
1280 
1281   *piPage = p->iPrior = iRet;
1282   return (iRet==0xFFFFFFFF);
1283 }
1284 
1285 /*
1286 ** This function merges two sorted lists into a single sorted list.
1287 */
1288 static void walMerge(
1289   u32 *aContent,                  /* Pages in wal */
1290   ht_slot *aLeft,                 /* IN: Left hand input list */
1291   int nLeft,                      /* IN: Elements in array *paLeft */
1292   ht_slot **paRight,              /* IN/OUT: Right hand input list */
1293   int *pnRight,                   /* IN/OUT: Elements in *paRight */
1294   ht_slot *aTmp                   /* Temporary buffer */
1295 ){
1296   int iLeft = 0;                  /* Current index in aLeft */
1297   int iRight = 0;                 /* Current index in aRight */
1298   int iOut = 0;                   /* Current index in output buffer */
1299   int nRight = *pnRight;
1300   ht_slot *aRight = *paRight;
1301 
1302   assert( nLeft>0 && nRight>0 );
1303   while( iRight<nRight || iLeft<nLeft ){
1304     ht_slot logpage;
1305     Pgno dbpage;
1306 
1307     if( (iLeft<nLeft)
1308      && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
1309     ){
1310       logpage = aLeft[iLeft++];
1311     }else{
1312       logpage = aRight[iRight++];
1313     }
1314     dbpage = aContent[logpage];
1315 
1316     aTmp[iOut++] = logpage;
1317     if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
1318 
1319     assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
1320     assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
1321   }
1322 
1323   *paRight = aLeft;
1324   *pnRight = iOut;
1325   memcpy(aLeft, aTmp, sizeof(aTmp[0])*iOut);
1326 }
1327 
1328 /*
1329 ** Sort the elements in list aList, removing any duplicates.
1330 */
1331 static void walMergesort(
1332   u32 *aContent,                  /* Pages in wal */
1333   ht_slot *aBuffer,               /* Buffer of at least *pnList items to use */
1334   ht_slot *aList,                 /* IN/OUT: List to sort */
1335   int *pnList                     /* IN/OUT: Number of elements in aList[] */
1336 ){
1337   struct Sublist {
1338     int nList;                    /* Number of elements in aList */
1339     ht_slot *aList;               /* Pointer to sub-list content */
1340   };
1341 
1342   const int nList = *pnList;      /* Size of input list */
1343   int nMerge = 0;                 /* Number of elements in list aMerge */
1344   ht_slot *aMerge = 0;            /* List to be merged */
1345   int iList;                      /* Index into input list */
1346   int iSub = 0;                   /* Index into aSub array */
1347   struct Sublist aSub[13];        /* Array of sub-lists */
1348 
1349   memset(aSub, 0, sizeof(aSub));
1350   assert( nList<=HASHTABLE_NPAGE && nList>0 );
1351   assert( HASHTABLE_NPAGE==(1<<(ArraySize(aSub)-1)) );
1352 
1353   for(iList=0; iList<nList; iList++){
1354     nMerge = 1;
1355     aMerge = &aList[iList];
1356     for(iSub=0; iList & (1<<iSub); iSub++){
1357       struct Sublist *p = &aSub[iSub];
1358       assert( p->aList && p->nList<=(1<<iSub) );
1359       assert( p->aList==&aList[iList&~((2<<iSub)-1)] );
1360       walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
1361     }
1362     aSub[iSub].aList = aMerge;
1363     aSub[iSub].nList = nMerge;
1364   }
1365 
1366   for(iSub++; iSub<ArraySize(aSub); iSub++){
1367     if( nList & (1<<iSub) ){
1368       struct Sublist *p = &aSub[iSub];
1369       assert( p->nList<=(1<<iSub) );
1370       assert( p->aList==&aList[nList&~((2<<iSub)-1)] );
1371       walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
1372     }
1373   }
1374   assert( aMerge==aList );
1375   *pnList = nMerge;
1376 
1377 #ifdef SQLITE_DEBUG
1378   {
1379     int i;
1380     for(i=1; i<*pnList; i++){
1381       assert( aContent[aList[i]] > aContent[aList[i-1]] );
1382     }
1383   }
1384 #endif
1385 }
1386 
1387 /*
1388 ** Free an iterator allocated by walIteratorInit().
1389 */
1390 static void walIteratorFree(WalIterator *p){
1391   sqlite3ScratchFree(p);
1392 }
1393 
1394 /*
1395 ** Construct a WalInterator object that can be used to loop over all
1396 ** pages in the WAL in ascending order. The caller must hold the checkpoint
1397 **
1398 ** On success, make *pp point to the newly allocated WalInterator object
1399 ** return SQLITE_OK. Otherwise, return an error code. If this routine
1400 ** returns an error, the value of *pp is undefined.
1401 **
1402 ** The calling routine should invoke walIteratorFree() to destroy the
1403 ** WalIterator object when it has finished with it.
1404 */
1405 static int walIteratorInit(Wal *pWal, WalIterator **pp){
1406   WalIterator *p;                 /* Return value */
1407   int nSegment;                   /* Number of segments to merge */
1408   u32 iLast;                      /* Last frame in log */
1409   int nByte;                      /* Number of bytes to allocate */
1410   int i;                          /* Iterator variable */
1411   ht_slot *aTmp;                  /* Temp space used by merge-sort */
1412   int rc = SQLITE_OK;             /* Return Code */
1413 
1414   /* This routine only runs while holding the checkpoint lock. And
1415   ** it only runs if there is actually content in the log (mxFrame>0).
1416   */
1417   assert( pWal->ckptLock && pWal->hdr.mxFrame>0 );
1418   iLast = pWal->hdr.mxFrame;
1419 
1420   /* Allocate space for the WalIterator object. */
1421   nSegment = walFramePage(iLast) + 1;
1422   nByte = sizeof(WalIterator)
1423         + (nSegment-1)*sizeof(struct WalSegment)
1424         + iLast*sizeof(ht_slot);
1425   p = (WalIterator *)sqlite3ScratchMalloc(nByte);
1426   if( !p ){
1427     return SQLITE_NOMEM;
1428   }
1429   memset(p, 0, nByte);
1430   p->nSegment = nSegment;
1431 
1432   /* Allocate temporary space used by the merge-sort routine. This block
1433   ** of memory will be freed before this function returns.
1434   */
1435   aTmp = (ht_slot *)sqlite3ScratchMalloc(
1436       sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)
1437   );
1438   if( !aTmp ){
1439     rc = SQLITE_NOMEM;
1440   }
1441 
1442   for(i=0; rc==SQLITE_OK && i<nSegment; i++){
1443     volatile ht_slot *aHash;
1444     u32 iZero;
1445     volatile u32 *aPgno;
1446 
1447     rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);
1448     if( rc==SQLITE_OK ){
1449       int j;                      /* Counter variable */
1450       int nEntry;                 /* Number of entries in this segment */
1451       ht_slot *aIndex;            /* Sorted index for this segment */
1452 
1453       aPgno++;
1454       if( (i+1)==nSegment ){
1455         nEntry = (int)(iLast - iZero);
1456       }else{
1457         nEntry = (int)((u32*)aHash - (u32*)aPgno);
1458       }
1459       aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[iZero];
1460       iZero++;
1461 
1462       for(j=0; j<nEntry; j++){
1463         aIndex[j] = (ht_slot)j;
1464       }
1465       walMergesort((u32 *)aPgno, aTmp, aIndex, &nEntry);
1466       p->aSegment[i].iZero = iZero;
1467       p->aSegment[i].nEntry = nEntry;
1468       p->aSegment[i].aIndex = aIndex;
1469       p->aSegment[i].aPgno = (u32 *)aPgno;
1470     }
1471   }
1472   sqlite3ScratchFree(aTmp);
1473 
1474   if( rc!=SQLITE_OK ){
1475     walIteratorFree(p);
1476   }
1477   *pp = p;
1478   return rc;
1479 }
1480 
1481 /*
1482 ** Copy as much content as we can from the WAL back into the database file
1483 ** in response to an sqlite3_wal_checkpoint() request or the equivalent.
1484 **
1485 ** The amount of information copies from WAL to database might be limited
1486 ** by active readers.  This routine will never overwrite a database page
1487 ** that a concurrent reader might be using.
1488 **
1489 ** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
1490 ** SQLite is in WAL-mode in synchronous=NORMAL.  That means that if
1491 ** checkpoints are always run by a background thread or background
1492 ** process, foreground threads will never block on a lengthy fsync call.
1493 **
1494 ** Fsync is called on the WAL before writing content out of the WAL and
1495 ** into the database.  This ensures that if the new content is persistent
1496 ** in the WAL and can be recovered following a power-loss or hard reset.
1497 **
1498 ** Fsync is also called on the database file if (and only if) the entire
1499 ** WAL content is copied into the database file.  This second fsync makes
1500 ** it safe to delete the WAL since the new content will persist in the
1501 ** database file.
1502 **
1503 ** This routine uses and updates the nBackfill field of the wal-index header.
1504 ** This is the only routine tha will increase the value of nBackfill.
1505 ** (A WAL reset or recovery will revert nBackfill to zero, but not increase
1506 ** its value.)
1507 **
1508 ** The caller must be holding sufficient locks to ensure that no other
1509 ** checkpoint is running (in any other thread or process) at the same
1510 ** time.
1511 */
1512 static int walCheckpoint(
1513   Wal *pWal,                      /* Wal connection */
1514   int sync_flags,                 /* Flags for OsSync() (or 0) */
1515   int nBuf,                       /* Size of zBuf in bytes */
1516   u8 *zBuf                        /* Temporary buffer to use */
1517 ){
1518   int rc;                         /* Return code */
1519   int szPage = pWal->hdr.szPage;  /* Database page-size */
1520   WalIterator *pIter = 0;         /* Wal iterator context */
1521   u32 iDbpage = 0;                /* Next database page to write */
1522   u32 iFrame = 0;                 /* Wal frame containing data for iDbpage */
1523   u32 mxSafeFrame;                /* Max frame that can be backfilled */
1524   int i;                          /* Loop counter */
1525   volatile WalCkptInfo *pInfo;    /* The checkpoint status information */
1526 
1527   if( pWal->hdr.mxFrame==0 ) return SQLITE_OK;
1528 
1529   /* Allocate the iterator */
1530   rc = walIteratorInit(pWal, &pIter);
1531   if( rc!=SQLITE_OK ){
1532     return rc;
1533   }
1534   assert( pIter );
1535 
1536   /*** TODO:  Move this test out to the caller.  Make it an assert() here ***/
1537   if( pWal->hdr.szPage!=nBuf ){
1538     rc = SQLITE_CORRUPT_BKPT;
1539     goto walcheckpoint_out;
1540   }
1541 
1542   /* Compute in mxSafeFrame the index of the last frame of the WAL that is
1543   ** safe to write into the database.  Frames beyond mxSafeFrame might
1544   ** overwrite database pages that are in use by active readers and thus
1545   ** cannot be backfilled from the WAL.
1546   */
1547   mxSafeFrame = pWal->hdr.mxFrame;
1548   pInfo = walCkptInfo(pWal);
1549   for(i=1; i<WAL_NREADER; i++){
1550     u32 y = pInfo->aReadMark[i];
1551     if( mxSafeFrame>=y ){
1552       assert( y<=pWal->hdr.mxFrame );
1553       rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
1554       if( rc==SQLITE_OK ){
1555         pInfo->aReadMark[i] = READMARK_NOT_USED;
1556         walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
1557       }else if( rc==SQLITE_BUSY ){
1558         mxSafeFrame = y;
1559       }else{
1560         goto walcheckpoint_out;
1561       }
1562     }
1563   }
1564 
1565   if( pInfo->nBackfill<mxSafeFrame
1566    && (rc = walLockExclusive(pWal, WAL_READ_LOCK(0), 1))==SQLITE_OK
1567   ){
1568     u32 nBackfill = pInfo->nBackfill;
1569 
1570     /* Sync the WAL to disk */
1571     if( sync_flags ){
1572       rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
1573     }
1574 
1575     /* Iterate through the contents of the WAL, copying data to the db file. */
1576     while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
1577       i64 iOffset;
1578       assert( walFramePgno(pWal, iFrame)==iDbpage );
1579       if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue;
1580       iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
1581       /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
1582       rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
1583       if( rc!=SQLITE_OK ) break;
1584       iOffset = (iDbpage-1)*(i64)szPage;
1585       testcase( IS_BIG_INT(iOffset) );
1586       rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
1587       if( rc!=SQLITE_OK ) break;
1588     }
1589 
1590     /* If work was actually accomplished... */
1591     if( rc==SQLITE_OK ){
1592       if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
1593         i64 szDb = pWal->hdr.nPage*(i64)szPage;
1594         testcase( IS_BIG_INT(szDb) );
1595         rc = sqlite3OsTruncate(pWal->pDbFd, szDb);
1596         if( rc==SQLITE_OK && sync_flags ){
1597           rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
1598         }
1599       }
1600       if( rc==SQLITE_OK ){
1601         pInfo->nBackfill = mxSafeFrame;
1602       }
1603     }
1604 
1605     /* Release the reader lock held while backfilling */
1606     walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
1607   }else if( rc==SQLITE_BUSY ){
1608     /* Reset the return code so as not to report a checkpoint failure
1609     ** just because active readers prevent any backfill.
1610     */
1611     rc = SQLITE_OK;
1612   }
1613 
1614  walcheckpoint_out:
1615   walIteratorFree(pIter);
1616   return rc;
1617 }
1618 
1619 /*
1620 ** Close a connection to a log file.
1621 */
1622 int sqlite3WalClose(
1623   Wal *pWal,                      /* Wal to close */
1624   int sync_flags,                 /* Flags to pass to OsSync() (or 0) */
1625   int nBuf,
1626   u8 *zBuf                        /* Buffer of at least nBuf bytes */
1627 ){
1628   int rc = SQLITE_OK;
1629   if( pWal ){
1630     int isDelete = 0;             /* True to unlink wal and wal-index files */
1631 
1632     /* If an EXCLUSIVE lock can be obtained on the database file (using the
1633     ** ordinary, rollback-mode locking methods, this guarantees that the
1634     ** connection associated with this log file is the only connection to
1635     ** the database. In this case checkpoint the database and unlink both
1636     ** the wal and wal-index files.
1637     **
1638     ** The EXCLUSIVE lock is not released before returning.
1639     */
1640     rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);
1641     if( rc==SQLITE_OK ){
1642       pWal->exclusiveMode = 1;
1643       rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf);
1644       if( rc==SQLITE_OK ){
1645         isDelete = 1;
1646       }
1647     }
1648 
1649     walIndexClose(pWal, isDelete);
1650     sqlite3OsClose(pWal->pWalFd);
1651     if( isDelete ){
1652       sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
1653     }
1654     WALTRACE(("WAL%p: closed\n", pWal));
1655     sqlite3_free((void *)pWal->apWiData);
1656     sqlite3_free(pWal);
1657   }
1658   return rc;
1659 }
1660 
1661 /*
1662 ** Try to read the wal-index header.  Return 0 on success and 1 if
1663 ** there is a problem.
1664 **
1665 ** The wal-index is in shared memory.  Another thread or process might
1666 ** be writing the header at the same time this procedure is trying to
1667 ** read it, which might result in inconsistency.  A dirty read is detected
1668 ** by verifying that both copies of the header are the same and also by
1669 ** a checksum on the header.
1670 **
1671 ** If and only if the read is consistent and the header is different from
1672 ** pWal->hdr, then pWal->hdr is updated to the content of the new header
1673 ** and *pChanged is set to 1.
1674 **
1675 ** If the checksum cannot be verified return non-zero. If the header
1676 ** is read successfully and the checksum verified, return zero.
1677 */
1678 static int walIndexTryHdr(Wal *pWal, int *pChanged){
1679   u32 aCksum[2];                  /* Checksum on the header content */
1680   WalIndexHdr h1, h2;             /* Two copies of the header content */
1681   WalIndexHdr volatile *aHdr;     /* Header in shared memory */
1682 
1683   /* The first page of the wal-index must be mapped at this point. */
1684   assert( pWal->nWiData>0 && pWal->apWiData[0] );
1685 
1686   /* Read the header. This might happen currently with a write to the
1687   ** same area of shared memory on a different CPU in a SMP,
1688   ** meaning it is possible that an inconsistent snapshot is read
1689   ** from the file. If this happens, return non-zero.
1690   **
1691   ** There are two copies of the header at the beginning of the wal-index.
1692   ** When reading, read [0] first then [1].  Writes are in the reverse order.
1693   ** Memory barriers are used to prevent the compiler or the hardware from
1694   ** reordering the reads and writes.
1695   */
1696   aHdr = walIndexHdr(pWal);
1697   memcpy(&h1, (void *)&aHdr[0], sizeof(h1));
1698   sqlite3OsShmBarrier(pWal->pDbFd);
1699   memcpy(&h2, (void *)&aHdr[1], sizeof(h2));
1700 
1701   if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
1702     return 1;   /* Dirty read */
1703   }
1704   if( h1.isInit==0 ){
1705     return 1;   /* Malformed header - probably all zeros */
1706   }
1707   walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
1708   if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
1709     return 1;   /* Checksum does not match */
1710   }
1711 
1712   if( memcmp(&pWal->hdr, &h1, sizeof(WalIndexHdr)) ){
1713     *pChanged = 1;
1714     memcpy(&pWal->hdr, &h1, sizeof(WalIndexHdr));
1715     pWal->szPage = pWal->hdr.szPage;
1716   }
1717 
1718   /* The header was successfully read. Return zero. */
1719   return 0;
1720 }
1721 
1722 /*
1723 ** Read the wal-index header from the wal-index and into pWal->hdr.
1724 ** If the wal-header appears to be corrupt, try to reconstruct the
1725 ** wal-index from the WAL before returning.
1726 **
1727 ** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
1728 ** changed by this opertion.  If pWal->hdr is unchanged, set *pChanged
1729 ** to 0.
1730 **
1731 ** If the wal-index header is successfully read, return SQLITE_OK.
1732 ** Otherwise an SQLite error code.
1733 */
1734 static int walIndexReadHdr(Wal *pWal, int *pChanged){
1735   int rc;                         /* Return code */
1736   int badHdr;                     /* True if a header read failed */
1737   volatile u32 *page0;            /* Chunk of wal-index containing header */
1738 
1739   /* Ensure that page 0 of the wal-index (the page that contains the
1740   ** wal-index header) is mapped. Return early if an error occurs here.
1741   */
1742   assert( pChanged );
1743   rc = walIndexPage(pWal, 0, &page0);
1744   if( rc!=SQLITE_OK ){
1745     return rc;
1746   };
1747   assert( page0 || pWal->writeLock==0 );
1748 
1749   /* If the first page of the wal-index has been mapped, try to read the
1750   ** wal-index header immediately, without holding any lock. This usually
1751   ** works, but may fail if the wal-index header is corrupt or currently
1752   ** being modified by another thread or process.
1753   */
1754   badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
1755 
1756   /* If the first attempt failed, it might have been due to a race
1757   ** with a writer.  So get a WRITE lock and try again.
1758   */
1759   assert( badHdr==0 || pWal->writeLock==0 );
1760   if( badHdr && SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
1761     pWal->writeLock = 1;
1762     if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
1763       badHdr = walIndexTryHdr(pWal, pChanged);
1764       if( badHdr ){
1765         /* If the wal-index header is still malformed even while holding
1766         ** a WRITE lock, it can only mean that the header is corrupted and
1767         ** needs to be reconstructed.  So run recovery to do exactly that.
1768         */
1769         rc = walIndexRecover(pWal);
1770         *pChanged = 1;
1771       }
1772     }
1773     pWal->writeLock = 0;
1774     walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
1775   }
1776 
1777   /* If the header is read successfully, check the version number to make
1778   ** sure the wal-index was not constructed with some future format that
1779   ** this version of SQLite cannot understand.
1780   */
1781   if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){
1782     rc = SQLITE_CANTOPEN_BKPT;
1783   }
1784 
1785   return rc;
1786 }
1787 
1788 /*
1789 ** This is the value that walTryBeginRead returns when it needs to
1790 ** be retried.
1791 */
1792 #define WAL_RETRY  (-1)
1793 
1794 /*
1795 ** Attempt to start a read transaction.  This might fail due to a race or
1796 ** other transient condition.  When that happens, it returns WAL_RETRY to
1797 ** indicate to the caller that it is safe to retry immediately.
1798 **
1799 ** On success return SQLITE_OK.  On a permanent failure (such an
1800 ** I/O error or an SQLITE_BUSY because another process is running
1801 ** recovery) return a positive error code.
1802 **
1803 ** The useWal parameter is true to force the use of the WAL and disable
1804 ** the case where the WAL is bypassed because it has been completely
1805 ** checkpointed.  If useWal==0 then this routine calls walIndexReadHdr()
1806 ** to make a copy of the wal-index header into pWal->hdr.  If the
1807 ** wal-index header has changed, *pChanged is set to 1 (as an indication
1808 ** to the caller that the local paget cache is obsolete and needs to be
1809 ** flushed.)  When useWal==1, the wal-index header is assumed to already
1810 ** be loaded and the pChanged parameter is unused.
1811 **
1812 ** The caller must set the cnt parameter to the number of prior calls to
1813 ** this routine during the current read attempt that returned WAL_RETRY.
1814 ** This routine will start taking more aggressive measures to clear the
1815 ** race conditions after multiple WAL_RETRY returns, and after an excessive
1816 ** number of errors will ultimately return SQLITE_PROTOCOL.  The
1817 ** SQLITE_PROTOCOL return indicates that some other process has gone rogue
1818 ** and is not honoring the locking protocol.  There is a vanishingly small
1819 ** chance that SQLITE_PROTOCOL could be returned because of a run of really
1820 ** bad luck when there is lots of contention for the wal-index, but that
1821 ** possibility is so small that it can be safely neglected, we believe.
1822 **
1823 ** On success, this routine obtains a read lock on
1824 ** WAL_READ_LOCK(pWal->readLock).  The pWal->readLock integer is
1825 ** in the range 0 <= pWal->readLock < WAL_NREADER.  If pWal->readLock==(-1)
1826 ** that means the Wal does not hold any read lock.  The reader must not
1827 ** access any database page that is modified by a WAL frame up to and
1828 ** including frame number aReadMark[pWal->readLock].  The reader will
1829 ** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0
1830 ** Or if pWal->readLock==0, then the reader will ignore the WAL
1831 ** completely and get all content directly from the database file.
1832 ** If the useWal parameter is 1 then the WAL will never be ignored and
1833 ** this routine will always set pWal->readLock>0 on success.
1834 ** When the read transaction is completed, the caller must release the
1835 ** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.
1836 **
1837 ** This routine uses the nBackfill and aReadMark[] fields of the header
1838 ** to select a particular WAL_READ_LOCK() that strives to let the
1839 ** checkpoint process do as much work as possible.  This routine might
1840 ** update values of the aReadMark[] array in the header, but if it does
1841 ** so it takes care to hold an exclusive lock on the corresponding
1842 ** WAL_READ_LOCK() while changing values.
1843 */
1844 static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
1845   volatile WalCkptInfo *pInfo;    /* Checkpoint information in wal-index */
1846   u32 mxReadMark;                 /* Largest aReadMark[] value */
1847   int mxI;                        /* Index of largest aReadMark[] value */
1848   int i;                          /* Loop counter */
1849   int rc = SQLITE_OK;             /* Return code  */
1850 
1851   assert( pWal->readLock<0 );     /* Not currently locked */
1852 
1853   /* Take steps to avoid spinning forever if there is a protocol error. */
1854   if( cnt>5 ){
1855     if( cnt>100 ) return SQLITE_PROTOCOL;
1856     sqlite3OsSleep(pWal->pVfs, 1);
1857   }
1858 
1859   if( !useWal ){
1860     rc = walIndexReadHdr(pWal, pChanged);
1861     if( rc==SQLITE_BUSY ){
1862       /* If there is not a recovery running in another thread or process
1863       ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
1864       ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
1865       ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
1866       ** would be technically correct.  But the race is benign since with
1867       ** WAL_RETRY this routine will be called again and will probably be
1868       ** right on the second iteration.
1869       */
1870       if( pWal->apWiData[0]==0 ){
1871         /* This branch is taken when the xShmMap() method returns SQLITE_BUSY.
1872         ** We assume this is a transient condition, so return WAL_RETRY. The
1873         ** xShmMap() implementation used by the default unix and win32 VFS
1874         ** modules may return SQLITE_BUSY due to a race condition in the
1875         ** code that determines whether or not the shared-memory region
1876         ** must be zeroed before the requested page is returned.
1877         */
1878         rc = WAL_RETRY;
1879       }else if( SQLITE_OK==(rc = walLockShared(pWal, WAL_RECOVER_LOCK)) ){
1880         walUnlockShared(pWal, WAL_RECOVER_LOCK);
1881         rc = WAL_RETRY;
1882       }else if( rc==SQLITE_BUSY ){
1883         rc = SQLITE_BUSY_RECOVERY;
1884       }
1885     }
1886     if( rc!=SQLITE_OK ){
1887       return rc;
1888     }
1889   }
1890 
1891   pInfo = walCkptInfo(pWal);
1892   if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){
1893     /* The WAL has been completely backfilled (or it is empty).
1894     ** and can be safely ignored.
1895     */
1896     rc = walLockShared(pWal, WAL_READ_LOCK(0));
1897     sqlite3OsShmBarrier(pWal->pDbFd);
1898     if( rc==SQLITE_OK ){
1899       if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){
1900         /* It is not safe to allow the reader to continue here if frames
1901         ** may have been appended to the log before READ_LOCK(0) was obtained.
1902         ** When holding READ_LOCK(0), the reader ignores the entire log file,
1903         ** which implies that the database file contains a trustworthy
1904         ** snapshoT. Since holding READ_LOCK(0) prevents a checkpoint from
1905         ** happening, this is usually correct.
1906         **
1907         ** However, if frames have been appended to the log (or if the log
1908         ** is wrapped and written for that matter) before the READ_LOCK(0)
1909         ** is obtained, that is not necessarily true. A checkpointer may
1910         ** have started to backfill the appended frames but crashed before
1911         ** it finished. Leaving a corrupt image in the database file.
1912         */
1913         walUnlockShared(pWal, WAL_READ_LOCK(0));
1914         return WAL_RETRY;
1915       }
1916       pWal->readLock = 0;
1917       return SQLITE_OK;
1918     }else if( rc!=SQLITE_BUSY ){
1919       return rc;
1920     }
1921   }
1922 
1923   /* If we get this far, it means that the reader will want to use
1924   ** the WAL to get at content from recent commits.  The job now is
1925   ** to select one of the aReadMark[] entries that is closest to
1926   ** but not exceeding pWal->hdr.mxFrame and lock that entry.
1927   */
1928   mxReadMark = 0;
1929   mxI = 0;
1930   for(i=1; i<WAL_NREADER; i++){
1931     u32 thisMark = pInfo->aReadMark[i];
1932     if( mxReadMark<=thisMark && thisMark<=pWal->hdr.mxFrame ){
1933       assert( thisMark!=READMARK_NOT_USED );
1934       mxReadMark = thisMark;
1935       mxI = i;
1936     }
1937   }
1938   if( mxI==0 ){
1939     /* If we get here, it means that all of the aReadMark[] entries between
1940     ** 1 and WAL_NREADER-1 are zero.  Try to initialize aReadMark[1] to
1941     ** be mxFrame, then retry.
1942     */
1943     rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
1944     if( rc==SQLITE_OK ){
1945       pInfo->aReadMark[1] = pWal->hdr.mxFrame;
1946       walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
1947       rc = WAL_RETRY;
1948     }else if( rc==SQLITE_BUSY ){
1949       rc = WAL_RETRY;
1950     }
1951     return rc;
1952   }else{
1953     if( mxReadMark < pWal->hdr.mxFrame ){
1954       for(i=1; i<WAL_NREADER; i++){
1955         rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
1956         if( rc==SQLITE_OK ){
1957           mxReadMark = pInfo->aReadMark[i] = pWal->hdr.mxFrame;
1958           mxI = i;
1959           walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
1960           break;
1961         }else if( rc!=SQLITE_BUSY ){
1962           return rc;
1963         }
1964       }
1965     }
1966 
1967     rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
1968     if( rc ){
1969       return rc==SQLITE_BUSY ? WAL_RETRY : rc;
1970     }
1971     /* Now that the read-lock has been obtained, check that neither the
1972     ** value in the aReadMark[] array or the contents of the wal-index
1973     ** header have changed.
1974     **
1975     ** It is necessary to check that the wal-index header did not change
1976     ** between the time it was read and when the shared-lock was obtained
1977     ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility
1978     ** that the log file may have been wrapped by a writer, or that frames
1979     ** that occur later in the log than pWal->hdr.mxFrame may have been
1980     ** copied into the database by a checkpointer. If either of these things
1981     ** happened, then reading the database with the current value of
1982     ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry
1983     ** instead.
1984     **
1985     ** This does not guarantee that the copy of the wal-index header is up to
1986     ** date before proceeding. That would not be possible without somehow
1987     ** blocking writers. It only guarantees that a dangerous checkpoint or
1988     ** log-wrap (either of which would require an exclusive lock on
1989     ** WAL_READ_LOCK(mxI)) has not occurred since the snapshot was valid.
1990     */
1991     sqlite3OsShmBarrier(pWal->pDbFd);
1992     if( pInfo->aReadMark[mxI]!=mxReadMark
1993      || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
1994     ){
1995       walUnlockShared(pWal, WAL_READ_LOCK(mxI));
1996       return WAL_RETRY;
1997     }else{
1998       assert( mxReadMark<=pWal->hdr.mxFrame );
1999       pWal->readLock = (i16)mxI;
2000     }
2001   }
2002   return rc;
2003 }
2004 
2005 /*
2006 ** Begin a read transaction on the database.
2007 **
2008 ** This routine used to be called sqlite3OpenSnapshot() and with good reason:
2009 ** it takes a snapshot of the state of the WAL and wal-index for the current
2010 ** instant in time.  The current thread will continue to use this snapshot.
2011 ** Other threads might append new content to the WAL and wal-index but
2012 ** that extra content is ignored by the current thread.
2013 **
2014 ** If the database contents have changes since the previous read
2015 ** transaction, then *pChanged is set to 1 before returning.  The
2016 ** Pager layer will use this to know that is cache is stale and
2017 ** needs to be flushed.
2018 */
2019 int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
2020   int rc;                         /* Return code */
2021   int cnt = 0;                    /* Number of TryBeginRead attempts */
2022 
2023   do{
2024     rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
2025   }while( rc==WAL_RETRY );
2026   return rc;
2027 }
2028 
2029 /*
2030 ** Finish with a read transaction.  All this does is release the
2031 ** read-lock.
2032 */
2033 void sqlite3WalEndReadTransaction(Wal *pWal){
2034   if( pWal->readLock>=0 ){
2035     walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
2036     pWal->readLock = -1;
2037   }
2038 }
2039 
2040 /*
2041 ** Read a page from the WAL, if it is present in the WAL and if the
2042 ** current read transaction is configured to use the WAL.
2043 **
2044 ** The *pInWal is set to 1 if the requested page is in the WAL and
2045 ** has been loaded.  Or *pInWal is set to 0 if the page was not in
2046 ** the WAL and needs to be read out of the database.
2047 */
2048 int sqlite3WalRead(
2049   Wal *pWal,                      /* WAL handle */
2050   Pgno pgno,                      /* Database page number to read data for */
2051   int *pInWal,                    /* OUT: True if data is read from WAL */
2052   int nOut,                       /* Size of buffer pOut in bytes */
2053   u8 *pOut                        /* Buffer to write page data to */
2054 ){
2055   u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
2056   u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
2057   int iHash;                      /* Used to loop through N hash tables */
2058 
2059   /* This routine is only be called from within a read transaction. */
2060   assert( pWal->readLock>=0 || pWal->lockError );
2061 
2062   /* If the "last page" field of the wal-index header snapshot is 0, then
2063   ** no data will be read from the wal under any circumstances. Return early
2064   ** in this case as an optimization.  Likewise, if pWal->readLock==0,
2065   ** then the WAL is ignored by the reader so return early, as if the
2066   ** WAL were empty.
2067   */
2068   if( iLast==0 || pWal->readLock==0 ){
2069     *pInWal = 0;
2070     return SQLITE_OK;
2071   }
2072 
2073   /* Search the hash table or tables for an entry matching page number
2074   ** pgno. Each iteration of the following for() loop searches one
2075   ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
2076   **
2077   ** This code might run concurrently to the code in walIndexAppend()
2078   ** that adds entries to the wal-index (and possibly to this hash
2079   ** table). This means the value just read from the hash
2080   ** slot (aHash[iKey]) may have been added before or after the
2081   ** current read transaction was opened. Values added after the
2082   ** read transaction was opened may have been written incorrectly -
2083   ** i.e. these slots may contain garbage data. However, we assume
2084   ** that any slots written before the current read transaction was
2085   ** opened remain unmodified.
2086   **
2087   ** For the reasons above, the if(...) condition featured in the inner
2088   ** loop of the following block is more stringent that would be required
2089   ** if we had exclusive access to the hash-table:
2090   **
2091   **   (aPgno[iFrame]==pgno):
2092   **     This condition filters out normal hash-table collisions.
2093   **
2094   **   (iFrame<=iLast):
2095   **     This condition filters out entries that were added to the hash
2096   **     table after the current read-transaction had started.
2097   */
2098   for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
2099     volatile ht_slot *aHash;      /* Pointer to hash table */
2100     volatile u32 *aPgno;          /* Pointer to array of page numbers */
2101     u32 iZero;                    /* Frame number corresponding to aPgno[0] */
2102     int iKey;                     /* Hash slot index */
2103     int nCollide;                 /* Number of hash collisions remaining */
2104     int rc;                       /* Error code */
2105 
2106     rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);
2107     if( rc!=SQLITE_OK ){
2108       return rc;
2109     }
2110     nCollide = HASHTABLE_NSLOT;
2111     for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
2112       u32 iFrame = aHash[iKey] + iZero;
2113       if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
2114         assert( iFrame>iRead );
2115         iRead = iFrame;
2116       }
2117       if( (nCollide--)==0 ){
2118         return SQLITE_CORRUPT_BKPT;
2119       }
2120     }
2121   }
2122 
2123 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
2124   /* If expensive assert() statements are available, do a linear search
2125   ** of the wal-index file content. Make sure the results agree with the
2126   ** result obtained using the hash indexes above.  */
2127   {
2128     u32 iRead2 = 0;
2129     u32 iTest;
2130     for(iTest=iLast; iTest>0; iTest--){
2131       if( walFramePgno(pWal, iTest)==pgno ){
2132         iRead2 = iTest;
2133         break;
2134       }
2135     }
2136     assert( iRead==iRead2 );
2137   }
2138 #endif
2139 
2140   /* If iRead is non-zero, then it is the log frame number that contains the
2141   ** required page. Read and return data from the log file.
2142   */
2143   if( iRead ){
2144     i64 iOffset = walFrameOffset(iRead, pWal->hdr.szPage) + WAL_FRAME_HDRSIZE;
2145     *pInWal = 1;
2146     /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
2147     return sqlite3OsRead(pWal->pWalFd, pOut, nOut, iOffset);
2148   }
2149 
2150   *pInWal = 0;
2151   return SQLITE_OK;
2152 }
2153 
2154 
2155 /*
2156 ** Set *pPgno to the size of the database file (or zero, if unknown).
2157 */
2158 void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
2159   assert( pWal->readLock>=0 || pWal->lockError );
2160   *pPgno = pWal->hdr.nPage;
2161 }
2162 
2163 
2164 /*
2165 ** This function starts a write transaction on the WAL.
2166 **
2167 ** A read transaction must have already been started by a prior call
2168 ** to sqlite3WalBeginReadTransaction().
2169 **
2170 ** If another thread or process has written into the database since
2171 ** the read transaction was started, then it is not possible for this
2172 ** thread to write as doing so would cause a fork.  So this routine
2173 ** returns SQLITE_BUSY in that case and no write transaction is started.
2174 **
2175 ** There can only be a single writer active at a time.
2176 */
2177 int sqlite3WalBeginWriteTransaction(Wal *pWal){
2178   int rc;
2179 
2180   /* Cannot start a write transaction without first holding a read
2181   ** transaction. */
2182   assert( pWal->readLock>=0 );
2183 
2184   if( pWal->readOnly ){
2185     return SQLITE_READONLY;
2186   }
2187 
2188   /* Only one writer allowed at a time.  Get the write lock.  Return
2189   ** SQLITE_BUSY if unable.
2190   */
2191   rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
2192   if( rc ){
2193     return rc;
2194   }
2195   pWal->writeLock = 1;
2196 
2197   /* If another connection has written to the database file since the
2198   ** time the read transaction on this connection was started, then
2199   ** the write is disallowed.
2200   */
2201   if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
2202     walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
2203     pWal->writeLock = 0;
2204     rc = SQLITE_BUSY;
2205   }
2206 
2207   return rc;
2208 }
2209 
2210 /*
2211 ** End a write transaction.  The commit has already been done.  This
2212 ** routine merely releases the lock.
2213 */
2214 int sqlite3WalEndWriteTransaction(Wal *pWal){
2215   if( pWal->writeLock ){
2216     walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
2217     pWal->writeLock = 0;
2218   }
2219   return SQLITE_OK;
2220 }
2221 
2222 /*
2223 ** If any data has been written (but not committed) to the log file, this
2224 ** function moves the write-pointer back to the start of the transaction.
2225 **
2226 ** Additionally, the callback function is invoked for each frame written
2227 ** to the WAL since the start of the transaction. If the callback returns
2228 ** other than SQLITE_OK, it is not invoked again and the error code is
2229 ** returned to the caller.
2230 **
2231 ** Otherwise, if the callback function does not return an error, this
2232 ** function returns SQLITE_OK.
2233 */
2234 int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
2235   int rc = SQLITE_OK;
2236   if( pWal->writeLock ){
2237     Pgno iMax = pWal->hdr.mxFrame;
2238     Pgno iFrame;
2239 
2240     /* Restore the clients cache of the wal-index header to the state it
2241     ** was in before the client began writing to the database.
2242     */
2243     memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));
2244 
2245     for(iFrame=pWal->hdr.mxFrame+1;
2246         ALWAYS(rc==SQLITE_OK) && iFrame<=iMax;
2247         iFrame++
2248     ){
2249       /* This call cannot fail. Unless the page for which the page number
2250       ** is passed as the second argument is (a) in the cache and
2251       ** (b) has an outstanding reference, then xUndo is either a no-op
2252       ** (if (a) is false) or simply expels the page from the cache (if (b)
2253       ** is false).
2254       **
2255       ** If the upper layer is doing a rollback, it is guaranteed that there
2256       ** are no outstanding references to any page other than page 1. And
2257       ** page 1 is never written to the log until the transaction is
2258       ** committed. As a result, the call to xUndo may not fail.
2259       */
2260       assert( walFramePgno(pWal, iFrame)!=1 );
2261       rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
2262     }
2263     walCleanupHash(pWal);
2264   }
2265   assert( rc==SQLITE_OK );
2266   return rc;
2267 }
2268 
2269 /*
2270 ** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32
2271 ** values. This function populates the array with values required to
2272 ** "rollback" the write position of the WAL handle back to the current
2273 ** point in the event of a savepoint rollback (via WalSavepointUndo()).
2274 */
2275 void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
2276   assert( pWal->writeLock );
2277   aWalData[0] = pWal->hdr.mxFrame;
2278   aWalData[1] = pWal->hdr.aFrameCksum[0];
2279   aWalData[2] = pWal->hdr.aFrameCksum[1];
2280   aWalData[3] = pWal->nCkpt;
2281 }
2282 
2283 /*
2284 ** Move the write position of the WAL back to the point identified by
2285 ** the values in the aWalData[] array. aWalData must point to an array
2286 ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
2287 ** by a call to WalSavepoint().
2288 */
2289 int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
2290   int rc = SQLITE_OK;
2291 
2292   assert( pWal->writeLock );
2293   assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame );
2294 
2295   if( aWalData[3]!=pWal->nCkpt ){
2296     /* This savepoint was opened immediately after the write-transaction
2297     ** was started. Right after that, the writer decided to wrap around
2298     ** to the start of the log. Update the savepoint values to match.
2299     */
2300     aWalData[0] = 0;
2301     aWalData[3] = pWal->nCkpt;
2302   }
2303 
2304   if( aWalData[0]<pWal->hdr.mxFrame ){
2305     pWal->hdr.mxFrame = aWalData[0];
2306     pWal->hdr.aFrameCksum[0] = aWalData[1];
2307     pWal->hdr.aFrameCksum[1] = aWalData[2];
2308     walCleanupHash(pWal);
2309   }
2310 
2311   return rc;
2312 }
2313 
2314 /*
2315 ** This function is called just before writing a set of frames to the log
2316 ** file (see sqlite3WalFrames()). It checks to see if, instead of appending
2317 ** to the current log file, it is possible to overwrite the start of the
2318 ** existing log file with the new frames (i.e. "reset" the log). If so,
2319 ** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left
2320 ** unchanged.
2321 **
2322 ** SQLITE_OK is returned if no error is encountered (regardless of whether
2323 ** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned
2324 ** if some error
2325 */
2326 static int walRestartLog(Wal *pWal){
2327   int rc = SQLITE_OK;
2328   int cnt;
2329 
2330   if( pWal->readLock==0 ){
2331     volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
2332     assert( pInfo->nBackfill==pWal->hdr.mxFrame );
2333     if( pInfo->nBackfill>0 ){
2334       rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
2335       if( rc==SQLITE_OK ){
2336         /* If all readers are using WAL_READ_LOCK(0) (in other words if no
2337         ** readers are currently using the WAL), then the transactions
2338         ** frames will overwrite the start of the existing log. Update the
2339         ** wal-index header to reflect this.
2340         **
2341         ** In theory it would be Ok to update the cache of the header only
2342         ** at this point. But updating the actual wal-index header is also
2343         ** safe and means there is no special case for sqlite3WalUndo()
2344         ** to handle if this transaction is rolled back.
2345         */
2346         int i;                    /* Loop counter */
2347         u32 *aSalt = pWal->hdr.aSalt;       /* Big-endian salt values */
2348         pWal->nCkpt++;
2349         pWal->hdr.mxFrame = 0;
2350         sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
2351         sqlite3_randomness(4, &aSalt[1]);
2352         walIndexWriteHdr(pWal);
2353         pInfo->nBackfill = 0;
2354         for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
2355         assert( pInfo->aReadMark[0]==0 );
2356         walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
2357       }
2358     }
2359     walUnlockShared(pWal, WAL_READ_LOCK(0));
2360     pWal->readLock = -1;
2361     cnt = 0;
2362     do{
2363       int notUsed;
2364       rc = walTryBeginRead(pWal, &notUsed, 1, ++cnt);
2365     }while( rc==WAL_RETRY );
2366   }
2367   return rc;
2368 }
2369 
2370 /*
2371 ** Write a set of frames to the log. The caller must hold the write-lock
2372 ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
2373 */
2374 int sqlite3WalFrames(
2375   Wal *pWal,                      /* Wal handle to write to */
2376   int szPage,                     /* Database page-size in bytes */
2377   PgHdr *pList,                   /* List of dirty pages to write */
2378   Pgno nTruncate,                 /* Database size after this commit */
2379   int isCommit,                   /* True if this is a commit */
2380   int sync_flags                  /* Flags to pass to OsSync() (or 0) */
2381 ){
2382   int rc;                         /* Used to catch return codes */
2383   u32 iFrame;                     /* Next frame address */
2384   u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
2385   PgHdr *p;                       /* Iterator to run through pList with. */
2386   PgHdr *pLast = 0;               /* Last frame in list */
2387   int nLast = 0;                  /* Number of extra copies of last page */
2388 
2389   assert( pList );
2390   assert( pWal->writeLock );
2391 
2392 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
2393   { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
2394     WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
2395               pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
2396   }
2397 #endif
2398 
2399   /* See if it is possible to write these frames into the start of the
2400   ** log file, instead of appending to it at pWal->hdr.mxFrame.
2401   */
2402   if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){
2403     return rc;
2404   }
2405 
2406   /* If this is the first frame written into the log, write the WAL
2407   ** header to the start of the WAL file. See comments at the top of
2408   ** this source file for a description of the WAL header format.
2409   */
2410   iFrame = pWal->hdr.mxFrame;
2411   if( iFrame==0 ){
2412     u8 aWalHdr[WAL_HDRSIZE];      /* Buffer to assemble wal-header in */
2413     u32 aCksum[2];                /* Checksum for wal-header */
2414 
2415     sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN));
2416     sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);
2417     sqlite3Put4byte(&aWalHdr[8], szPage);
2418     sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
2419     sqlite3_randomness(8, pWal->hdr.aSalt);
2420     memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
2421     walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);
2422     sqlite3Put4byte(&aWalHdr[24], aCksum[0]);
2423     sqlite3Put4byte(&aWalHdr[28], aCksum[1]);
2424 
2425     pWal->szPage = (u16)szPage;
2426     pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
2427     pWal->hdr.aFrameCksum[0] = aCksum[0];
2428     pWal->hdr.aFrameCksum[1] = aCksum[1];
2429 
2430     rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
2431     WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
2432     if( rc!=SQLITE_OK ){
2433       return rc;
2434     }
2435   }
2436   assert( pWal->szPage==szPage );
2437 
2438   /* Write the log file. */
2439   for(p=pList; p; p=p->pDirty){
2440     u32 nDbsize;                  /* Db-size field for frame header */
2441     i64 iOffset;                  /* Write offset in log file */
2442     void *pData;
2443 
2444     iOffset = walFrameOffset(++iFrame, szPage);
2445     /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
2446 
2447     /* Populate and write the frame header */
2448     nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
2449 #if defined(SQLITE_HAS_CODEC)
2450     if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM;
2451 #else
2452     pData = p->pData;
2453 #endif
2454     walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame);
2455     rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
2456     if( rc!=SQLITE_OK ){
2457       return rc;
2458     }
2459 
2460     /* Write the page data */
2461     rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset+sizeof(aFrame));
2462     if( rc!=SQLITE_OK ){
2463       return rc;
2464     }
2465     pLast = p;
2466   }
2467 
2468   /* Sync the log file if the 'isSync' flag was specified. */
2469   if( sync_flags ){
2470     i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd);
2471     i64 iOffset = walFrameOffset(iFrame+1, szPage);
2472 
2473     assert( isCommit );
2474     assert( iSegment>0 );
2475 
2476     iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
2477     while( iOffset<iSegment ){
2478       void *pData;
2479 #if defined(SQLITE_HAS_CODEC)
2480       if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM;
2481 #else
2482       pData = pLast->pData;
2483 #endif
2484       walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame);
2485       /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
2486       rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
2487       if( rc!=SQLITE_OK ){
2488         return rc;
2489       }
2490       iOffset += WAL_FRAME_HDRSIZE;
2491       rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset);
2492       if( rc!=SQLITE_OK ){
2493         return rc;
2494       }
2495       nLast++;
2496       iOffset += szPage;
2497     }
2498 
2499     rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
2500   }
2501 
2502   /* Append data to the wal-index. It is not necessary to lock the
2503   ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
2504   ** guarantees that there are no other writers, and no data that may
2505   ** be in use by existing readers is being overwritten.
2506   */
2507   iFrame = pWal->hdr.mxFrame;
2508   for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
2509     iFrame++;
2510     rc = walIndexAppend(pWal, iFrame, p->pgno);
2511   }
2512   while( nLast>0 && rc==SQLITE_OK ){
2513     iFrame++;
2514     nLast--;
2515     rc = walIndexAppend(pWal, iFrame, pLast->pgno);
2516   }
2517 
2518   if( rc==SQLITE_OK ){
2519     /* Update the private copy of the header. */
2520     pWal->hdr.szPage = (u16)szPage;
2521     pWal->hdr.mxFrame = iFrame;
2522     if( isCommit ){
2523       pWal->hdr.iChange++;
2524       pWal->hdr.nPage = nTruncate;
2525     }
2526     /* If this is a commit, update the wal-index header too. */
2527     if( isCommit ){
2528       walIndexWriteHdr(pWal);
2529       pWal->iCallback = iFrame;
2530     }
2531   }
2532 
2533   WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
2534   return rc;
2535 }
2536 
2537 /*
2538 ** This routine is called to implement sqlite3_wal_checkpoint() and
2539 ** related interfaces.
2540 **
2541 ** Obtain a CHECKPOINT lock and then backfill as much information as
2542 ** we can from WAL into the database.
2543 */
2544 int sqlite3WalCheckpoint(
2545   Wal *pWal,                      /* Wal connection */
2546   int sync_flags,                 /* Flags to sync db file with (or 0) */
2547   int nBuf,                       /* Size of temporary buffer */
2548   u8 *zBuf                        /* Temporary buffer to use */
2549 ){
2550   int rc;                         /* Return code */
2551   int isChanged = 0;              /* True if a new wal-index header is loaded */
2552 
2553   assert( pWal->ckptLock==0 );
2554 
2555   WALTRACE(("WAL%p: checkpoint begins\n", pWal));
2556   rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
2557   if( rc ){
2558     /* Usually this is SQLITE_BUSY meaning that another thread or process
2559     ** is already running a checkpoint, or maybe a recovery.  But it might
2560     ** also be SQLITE_IOERR. */
2561     return rc;
2562   }
2563   pWal->ckptLock = 1;
2564 
2565   /* Copy data from the log to the database file. */
2566   rc = walIndexReadHdr(pWal, &isChanged);
2567   if( rc==SQLITE_OK ){
2568     rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
2569   }
2570   if( isChanged ){
2571     /* If a new wal-index header was loaded before the checkpoint was
2572     ** performed, then the pager-cache associated with pWal is now
2573     ** out of date. So zero the cached wal-index header to ensure that
2574     ** next time the pager opens a snapshot on this database it knows that
2575     ** the cache needs to be reset.
2576     */
2577     memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
2578   }
2579 
2580   /* Release the locks. */
2581   walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
2582   pWal->ckptLock = 0;
2583   WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
2584   return rc;
2585 }
2586 
2587 /* Return the value to pass to a sqlite3_wal_hook callback, the
2588 ** number of frames in the WAL at the point of the last commit since
2589 ** sqlite3WalCallback() was called.  If no commits have occurred since
2590 ** the last call, then return 0.
2591 */
2592 int sqlite3WalCallback(Wal *pWal){
2593   u32 ret = 0;
2594   if( pWal ){
2595     ret = pWal->iCallback;
2596     pWal->iCallback = 0;
2597   }
2598   return (int)ret;
2599 }
2600 
2601 /*
2602 ** This function is called to change the WAL subsystem into or out
2603 ** of locking_mode=EXCLUSIVE.
2604 **
2605 ** If op is zero, then attempt to change from locking_mode=EXCLUSIVE
2606 ** into locking_mode=NORMAL.  This means that we must acquire a lock
2607 ** on the pWal->readLock byte.  If the WAL is already in locking_mode=NORMAL
2608 ** or if the acquisition of the lock fails, then return 0.  If the
2609 ** transition out of exclusive-mode is successful, return 1.  This
2610 ** operation must occur while the pager is still holding the exclusive
2611 ** lock on the main database file.
2612 **
2613 ** If op is one, then change from locking_mode=NORMAL into
2614 ** locking_mode=EXCLUSIVE.  This means that the pWal->readLock must
2615 ** be released.  Return 1 if the transition is made and 0 if the
2616 ** WAL is already in exclusive-locking mode - meaning that this
2617 ** routine is a no-op.  The pager must already hold the exclusive lock
2618 ** on the main database file before invoking this operation.
2619 **
2620 ** If op is negative, then do a dry-run of the op==1 case but do
2621 ** not actually change anything.  The pager uses this to see if it
2622 ** should acquire the database exclusive lock prior to invoking
2623 ** the op==1 case.
2624 */
2625 int sqlite3WalExclusiveMode(Wal *pWal, int op){
2626   int rc;
2627   assert( pWal->writeLock==0 );
2628 
2629   /* pWal->readLock is usually set, but might be -1 if there was a
2630   ** prior error while attempting to acquire are read-lock. This cannot
2631   ** happen if the connection is actually in exclusive mode (as no xShmLock
2632   ** locks are taken in this case). Nor should the pager attempt to
2633   ** upgrade to exclusive-mode following such an error.
2634   */
2635   assert( pWal->readLock>=0 || pWal->lockError );
2636   assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) );
2637 
2638   if( op==0 ){
2639     if( pWal->exclusiveMode ){
2640       pWal->exclusiveMode = 0;
2641       if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){
2642         pWal->exclusiveMode = 1;
2643       }
2644       rc = pWal->exclusiveMode==0;
2645     }else{
2646       /* Already in locking_mode=NORMAL */
2647       rc = 0;
2648     }
2649   }else if( op>0 ){
2650     assert( pWal->exclusiveMode==0 );
2651     assert( pWal->readLock>=0 );
2652     walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
2653     pWal->exclusiveMode = 1;
2654     rc = 1;
2655   }else{
2656     rc = pWal->exclusiveMode==0;
2657   }
2658   return rc;
2659 }
2660 
2661 #endif /* #ifndef SQLITE_OMIT_WAL */
2662