xref: /sqlite-3.40.0/src/test_async.c (revision 94dfe476)
1 /*
2 ** 2005 December 14
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** $Id: test_async.c,v 1.54 2009/03/28 15:04:24 drh Exp $
14 **
15 ** This file contains an example implementation of an asynchronous IO
16 ** backend for SQLite.
17 **
18 ** WHAT IS ASYNCHRONOUS I/O?
19 **
20 ** With asynchronous I/O, write requests are handled by a separate thread
21 ** running in the background.  This means that the thread that initiates
22 ** a database write does not have to wait for (sometimes slow) disk I/O
23 ** to occur.  The write seems to happen very quickly, though in reality
24 ** it is happening at its usual slow pace in the background.
25 **
26 ** Asynchronous I/O appears to give better responsiveness, but at a price.
27 ** You lose the Durable property.  With the default I/O backend of SQLite,
28 ** once a write completes, you know that the information you wrote is
29 ** safely on disk.  With the asynchronous I/O, this is not the case.  If
30 ** your program crashes or if a power loss occurs after the database
31 ** write but before the asynchronous write thread has completed, then the
32 ** database change might never make it to disk and the next user of the
33 ** database might not see your change.
34 **
35 ** You lose Durability with asynchronous I/O, but you still retain the
36 ** other parts of ACID:  Atomic,  Consistent, and Isolated.  Many
37 ** applications get along fine without the Durability.
38 **
39 ** HOW IT WORKS
40 **
41 ** Asynchronous I/O works by creating a special SQLite "vfs" structure
42 ** and registering it with sqlite3_vfs_register(). When files opened via
43 ** this vfs are written to (using sqlite3OsWrite()), the data is not
44 ** written directly to disk, but is placed in the "write-queue" to be
45 ** handled by the background thread.
46 **
47 ** When files opened with the asynchronous vfs are read from
48 ** (using sqlite3OsRead()), the data is read from the file on
49 ** disk and the write-queue, so that from the point of view of
50 ** the vfs reader the OsWrite() appears to have already completed.
51 **
52 ** The special vfs is registered (and unregistered) by calls to
53 ** function asyncEnable() (see below).
54 **
55 ** LIMITATIONS
56 **
57 ** This demonstration code is deliberately kept simple in order to keep
58 ** the main ideas clear and easy to understand.  Real applications that
59 ** want to do asynchronous I/O might want to add additional capabilities.
60 ** For example, in this demonstration if writes are happening at a steady
61 ** stream that exceeds the I/O capability of the background writer thread,
62 ** the queue of pending write operations will grow without bound until we
63 ** run out of memory.  Users of this technique may want to keep track of
64 ** the quantity of pending writes and stop accepting new write requests
65 ** when the buffer gets to be too big.
66 **
67 ** LOCKING + CONCURRENCY
68 **
69 ** Multiple connections from within a single process that use this
70 ** implementation of asynchronous IO may access a single database
71 ** file concurrently. From the point of view of the user, if all
72 ** connections are from within a single process, there is no difference
73 ** between the concurrency offered by "normal" SQLite and SQLite
74 ** using the asynchronous backend.
75 **
76 ** If connections from within multiple processes may access the
77 ** database file, the ENABLE_FILE_LOCKING symbol (see below) must be
78 ** defined. If it is not defined, then no locks are established on
79 ** the database file. In this case, if multiple processes access
80 ** the database file, corruption will quickly result.
81 **
82 ** If ENABLE_FILE_LOCKING is defined (the default), then connections
83 ** from within multiple processes may access a single database file
84 ** without risking corruption. However concurrency is reduced as
85 ** follows:
86 **
87 **   * When a connection using asynchronous IO begins a database
88 **     transaction, the database is locked immediately. However the
89 **     lock is not released until after all relevant operations
90 **     in the write-queue have been flushed to disk. This means
91 **     (for example) that the database may remain locked for some
92 **     time after a "COMMIT" or "ROLLBACK" is issued.
93 **
94 **   * If an application using asynchronous IO executes transactions
95 **     in quick succession, other database users may be effectively
96 **     locked out of the database. This is because when a BEGIN
97 **     is executed, a database lock is established immediately. But
98 **     when the corresponding COMMIT or ROLLBACK occurs, the lock
99 **     is not released until the relevant part of the write-queue
100 **     has been flushed through. As a result, if a COMMIT is followed
101 **     by a BEGIN before the write-queue is flushed through, the database
102 **     is never unlocked, preventing other processes from accessing
103 **     the database.
104 **
105 ** Defining ENABLE_FILE_LOCKING when using an NFS or other remote
106 ** file-system may slow things down, as synchronous round-trips to the
107 ** server may be required to establish database file locks.
108 */
/* Compile-time switch described under "LOCKING + CONCURRENCY" in the
** header comment of this file.  It is defined by default. */
#define ENABLE_FILE_LOCKING
110 
111 #ifndef SQLITE_AMALGAMATION
112 # include "sqliteInt.h"
113 # include <assert.h>
114 # include <string.h>
115 #endif
116 #include <tcl.h>
117 
118 /*
119 ** This test uses pthreads and hence only works on unix and with
120 ** a threadsafe build of SQLite.
121 */
122 #if SQLITE_OS_UNIX && SQLITE_THREADSAFE
123 
124 /*
125 ** This demo uses pthreads.  If you do not have a pthreads implementation
126 ** for your operating system, you will need to recode the threading
127 ** logic.
128 */
129 #include <pthread.h>
130 #include <sched.h>
131 
/* Useful macros used in several places.
**
** Both macros evaluate the selected argument twice, so arguments must not
** have side effects.  The result has the usual arithmetic-converted type of
** the two operands (e.g. a 64-bit result when one operand is 64-bit).
*/
#define MIN(x,y) ((x)<(y)?(x):(y))
#define MAX(x,y) ((x)>(y)?(x):(y))
135 
/* Forward references to the structures defined later in this file */
typedef struct AsyncWrite AsyncWrite;         /* One entry on the write-op queue */
typedef struct AsyncFile AsyncFile;           /* sqlite3_file subclass handed to SQLite */
typedef struct AsyncFileData AsyncFileData;   /* Per-handle data pointed to by AsyncFile */
typedef struct AsyncFileLock AsyncFileLock;   /* Lock state of a single file handle */
typedef struct AsyncLock AsyncLock;           /* One per distinct open file (by name) */
142 
/* Enable for debugging: set to non-zero to have ASYNC_TRACE() emit output */
static int sqlite3async_trace = 0;

/*
** ASYNC_TRACE(X) takes a parenthesized printf-style argument list, e.g.
** ASYNC_TRACE(("PUSH %p\n", p)).  It is wrapped in do{...}while(0) so that
** it behaves as a single statement and cannot capture a following "else".
*/
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)

/*
** Format a message using sqlite3_vmprintf() and write it to stderr,
** prefixed with the calling thread's id.
**
** If the message cannot be formatted (sqlite3_vmprintf() returns NULL on
** allocation failure) nothing is printed; passing NULL to "%s" would be
** undefined behavior.
*/
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  if( z ){
    fprintf(stderr, "[%d] %s", (int)pthread_self(), z);
    sqlite3_free(z);
  }
}
155 
156 /*
157 ** THREAD SAFETY NOTES
158 **
159 ** Basic rules:
160 **
161 **     * Both read and write access to the global write-op queue must be
162 **       protected by the async.queueMutex. As are the async.ioError and
163 **       async.nFile variables.
164 **
165 **     * The async.pLock list and all AsyncLock and AsyncFileLock
166 **       structures must be protected by the async.lockMutex mutex.
167 **
168 **     * The file handles from the underlying system are not assumed to
169 **       be thread safe.
170 **
171 **     * See the last two paragraphs under "The Writer Thread" for
172 **       an assumption to do with file-handle synchronization by the Os.
173 **
174 ** Deadlock prevention:
175 **
176 **     There are three mutexes used by the system: the "writer" mutex,
177 **     the "queue" mutex and the "lock" mutex. Rules are:
178 **
179 **     * It is illegal to block on the writer mutex when any other mutex
180 **       is held, and
181 **
182 **     * It is illegal to block on the queue mutex when the lock mutex
183 **       is held.
184 **
185 **     i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
186 **
187 ** File system operations (invoked by SQLite thread):
188 **
189 **     xOpen
190 **     xDelete
191 **     xFileExists
192 **
193 ** File handle operations (invoked by SQLite thread):
194 **
195 **         asyncWrite, asyncClose, asyncTruncate, asyncSync
196 **
197 **     The operations above add an entry to the global write-op list. They
198 **     prepare the entry, acquire the async.queueMutex momentarily while
199 **     list pointers are  manipulated to insert the new entry, then release
200 **     the mutex and signal the writer thread to wake up in case it happens
201 **     to be asleep.
202 **
203 **
204 **         asyncRead, asyncFileSize.
205 **
206 **     Read operations. Both of these read from the underlying file
207 **     first, then adjust their result based on pending writes in the
208 **     write-op queue.   So async.queueMutex is held for the duration
209 **     of these operations to prevent other threads from changing the
210 **     queue in mid operation.
211 **
212 **
213 **         asyncLock, asyncUnlock, asyncCheckReservedLock
214 **
215 **     These primitives implement in-process locking using a hash table
216 **     on the file name.  Files are locked correctly for connections coming
217 **     from the same process.  But other processes cannot see these locks
218 **     and will therefore not honor them.
219 **
220 **
221 ** The writer thread:
222 **
223 **     The async.writerMutex is used to make sure there is only
224 **     a single writer thread running at a time.
225 **
226 **     Inside the writer thread is a loop that works like this:
227 **
228 **         WHILE (write-op list is not empty)
229 **             Do IO operation at head of write-op list
230 **             Remove entry from head of write-op list
231 **         END WHILE
232 **
233 **     The async.queueMutex is always held during the <write-op list is
234 **     not empty> test, and when the entry is removed from the head
235 **     of the write-op list. Sometimes it is held for the interim
236 **     period (while the IO is performed), and sometimes it is
237 **     relinquished. It is relinquished if (a) the IO op is an
238 **     ASYNC_CLOSE or (b) when the file handle was opened, two of
239 **     the underlying systems handles were opened on the same
240 **     file-system entry.
241 **
242 **     If condition (b) above is true, then one file-handle
243 **     (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
244 **     file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
245 **     threads to perform write() operations. This means that read
246 **     operations are not blocked by asynchronous writes (although
247 **     asynchronous writes may still be blocked by reads).
248 **
249 **     This assumes that the OS keeps two handles open on the same file
250 **     properly in sync. That is, any read operation that starts after a
251 **     write operation on the same file system entry has completed returns
252 **     data consistent with the write. We also assume that if one thread
253 **     reads a file while another is writing it all bytes other than the
254 **     ones actually being written contain valid data.
255 **
256 **     If the above assumptions are not true, set the preprocessor symbol
257 **     SQLITE_ASYNC_TWO_FILEHANDLES to 0.
258 */
259 
/*
** Defaults to 1 unless the build already defines it.  Per the notes under
** "The writer thread" above, define it to 0 if the OS cannot be relied on
** to keep two handles on the same file properly in sync.
*/
#ifndef SQLITE_ASYNC_TWO_FILEHANDLES
/* #define SQLITE_ASYNC_TWO_FILEHANDLES 0 */
#define SQLITE_ASYNC_TWO_FILEHANDLES 1
#endif
264 
/*
** State information is held in the static variable "async" defined
** as the following structure.
**
** Both async.ioError and async.nFile are protected by async.queueMutex.
**
** The three mutexes must stay the first three members, in the order
** lockMutex, queueMutex, writerMutex: the debugging wrappers
** (async_mutex_lock() and friends) treat the start of this structure as
** an array of three pthread_mutex_t and assert() that layout.
*/
static struct TestAsyncStaticData {
  pthread_mutex_t lockMutex;   /* For access to the async.pLock list */
  pthread_mutex_t queueMutex;  /* Mutex for access to write operation queue */
  pthread_mutex_t writerMutex; /* Prevents multiple writer threads */
  pthread_cond_t queueSignal;  /* For waking up sleeping writer thread */
  pthread_cond_t emptySignal;  /* Notify when the write queue is empty */
  AsyncWrite *pQueueFirst;     /* Next write operation to be processed */
  AsyncWrite *pQueueLast;      /* Last write operation on the list */
  AsyncLock *pLock;            /* Linked list of all AsyncLock structures */
  volatile int ioDelay;             /* Extra delay between write operations */
  volatile int writerHaltWhenIdle;  /* Writer thread halts when queue empty */
  volatile int writerHaltNow;       /* Writer thread halts after next op */
  int ioError;                 /* True if an IO error has occurred */
  int nFile;                   /* Number of open files (from sqlite pov) */
} async = {
  PTHREAD_MUTEX_INITIALIZER,   /* lockMutex */
  PTHREAD_MUTEX_INITIALIZER,   /* queueMutex */
  PTHREAD_MUTEX_INITIALIZER,   /* writerMutex */
  PTHREAD_COND_INITIALIZER,    /* queueSignal */
  PTHREAD_COND_INITIALIZER,    /* emptySignal */
  /* All remaining members are implicitly zero-initialized */
};
292 
/* Possible values of AsyncWrite.op.  The values double as indexes into
** azOpcodeName[] below, so they must remain small consecutive integers
** starting at 0. */
#define ASYNC_NOOP          0
#define ASYNC_WRITE         1
#define ASYNC_SYNC          2
#define ASYNC_TRUNCATE      3
#define ASYNC_CLOSE         4
#define ASYNC_DELETE        5
#define ASYNC_OPENEXCLUSIVE 6
#define ASYNC_UNLOCK        7

/* Names of opcodes.  Used for debugging only (indexed by AsyncWrite.op in
** trace messages).
** Make sure these stay in sync with the macros above!
*/
static const char *azOpcodeName[] = {
  "NOOP", "WRITE", "SYNC", "TRUNCATE", "CLOSE", "DELETE", "OPENEX", "UNLOCK"
};
309 
310 /*
311 ** Entries on the write-op queue are instances of the AsyncWrite
312 ** structure, defined here.
313 **
314 ** The interpretation of the iOffset and nByte variables varies depending
315 ** on the value of AsyncWrite.op:
316 **
317 ** ASYNC_NOOP:
318 **     No values used.
319 **
320 ** ASYNC_WRITE:
321 **     iOffset -> Offset in file to write to.
322 **     nByte   -> Number of bytes of data to write (pointed to by zBuf).
323 **
324 ** ASYNC_SYNC:
325 **     nByte   -> flags to pass to sqlite3OsSync().
326 **
327 ** ASYNC_TRUNCATE:
328 **     iOffset -> Size to truncate file to.
329 **     nByte   -> Unused.
330 **
331 ** ASYNC_CLOSE:
332 **     iOffset -> Unused.
333 **     nByte   -> Unused.
334 **
335 ** ASYNC_DELETE:
336 **     iOffset -> Contains the "syncDir" flag.
337 **     nByte   -> Number of bytes of zBuf points to (file name).
338 **
339 ** ASYNC_OPENEXCLUSIVE:
340 **     iOffset -> Value of "delflag".
341 **     nByte   -> Number of bytes of zBuf points to (file name).
342 **
343 ** ASYNC_UNLOCK:
344 **     nByte   -> Argument to sqlite3OsUnlock().
345 **
346 **
347 ** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
348 ** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
349 ** single blob, so is deleted when sqlite3_free() is called on the parent
350 ** structure.
351 */
struct AsyncWrite {
  AsyncFileData *pFileData;    /* File to write data to or sync.  May be NULL:
                               ** readers of the queue test for that */
  int op;                      /* One of ASYNC_xxx etc. */
  sqlite_int64 iOffset;        /* See above */
  int nByte;          /* See above */
  char *zBuf;         /* Payload: data for ASYNC_WRITE, file name for
                      ** ASYNC_DELETE/ASYNC_OPENEXCLUSIVE (see above),
                      ** NULL when the operation carries no payload */
  AsyncWrite *pNext;  /* Next write operation (to any file) */
};
360 
/*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and linked into the list headed by async.pLock.
** Entries are looked up by file name (zFile/nFile); see findLock().
** NOTE(review): the original notes describe the names as the full pathnames
** of the opened files - confirm against the code that creates the entries.
**
** AsyncLock.pList points to the head of a linked list of AsyncFileLock
** structures, one for each handle currently open on the file.
**
** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
** not passed to the sqlite3OsOpen() call), or if ENABLE_FILE_LOCKING is
** not defined at compile time, variables AsyncLock.pFile and
** AsyncLock.eLock are never used. Otherwise, pFile is a file handle
** opened on the file in question and used to obtain the file-system
** locks required by database connections within this process.
**
** See comments above the asyncLock() function for more details on
** the implementation of database locking used by this backend.
*/
struct AsyncLock {
  char *zFile;                /* File name identifying this entry */
  int nFile;                  /* Number of bytes in zFile */
  sqlite3_file *pFile;        /* Handle used for file-system locks, or NULL */
  int eLock;                  /* Lock level currently held on pFile */
  AsyncFileLock *pList;       /* Lock state of every handle open on the file */
  AsyncLock *pNext;           /* Next in linked list headed by async.pLock */
};
388 
/*
** An instance of the following structure is allocated along with each
** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
** file was opened with the SQLITE_OPEN_MAIN_DB.
**
** getFileLock() asserts that eAsyncLock>=eLock for every entry and uses
** the largest eAsyncLock on the list to decide which file-system lock
** is required.
*/
struct AsyncFileLock {
  int eLock;                /* Internally visible lock state (sqlite pov) */
  int eAsyncLock;           /* Lock-state with write-queue unlock */
  AsyncFileLock *pNext;     /* Next entry on the owning AsyncLock.pList */
};
399 
/*
** The AsyncFile structure is a subclass of sqlite3_file used for
** asynchronous IO.
**
** All of the actual data for the structure is stored in the structure
** pointed to by AsyncFile.pData, which is allocated as part of the
** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
** lifetime of the AsyncFile structure is ended by the caller after OsClose()
** is called, but the data in AsyncFileData may be required by the
** writer thread after that point.
*/
struct AsyncFile {
  sqlite3_io_methods *pMethod;   /* I/O method table (first member; the file
                                 ** methods cast sqlite3_file* to AsyncFile*) */
  AsyncFileData *pData;          /* Everything else; may outlive this object */
};
/*
** Per-handle state referenced from AsyncFile.pData and from entries on the
** write-op queue (AsyncWrite.pFileData).
*/
struct AsyncFileData {
  char *zName;               /* Underlying OS filename.  Used in trace output,
                             ** and compared (by pointer) in asyncRead() and
                             ** asyncFileSize() to find queued operations made
                             ** through other handles on the same file */
  int nName;                 /* Number of characters in zName */
  sqlite3_file *pBaseRead;   /* Read handle to the underlying Os file */
  sqlite3_file *pBaseWrite;  /* Write handle to the underlying Os file */
  AsyncFileLock lock;        /* Lock state for this handle */
  AsyncLock *pLock;          /* AsyncLock object for this file system entry */
  AsyncWrite closeOp;        /* Preallocated close operation; queued by
                             ** asyncClose() without allocating memory */
};
424 
425 /*
426 ** The following async_XXX functions are debugging wrappers around the
427 ** corresponding pthread_XXX functions:
428 **
429 **     pthread_mutex_lock();
430 **     pthread_mutex_unlock();
431 **     pthread_mutex_trylock();
432 **     pthread_cond_wait();
433 **
434 ** It is illegal to pass any mutex other than those stored in the
435 ** following global variables to these functions.
436 **
437 **     async.queueMutex
438 **     async.writerMutex
439 **     async.lockMutex
440 **
441 ** If NDEBUG is defined, these wrappers do nothing except call the
442 ** corresponding pthreads function. If NDEBUG is not defined, then the
443 ** following variables are used to store the thread-id (as returned
444 ** by pthread_self()) currently holding the mutex, or 0 otherwise:
445 **
446 **     asyncdebug.queueMutexHolder
447 **     asyncdebug.writerMutexHolder
448 **     asyncdebug.lockMutexHolder
449 **
450 ** These variables are used by some assert() statements that verify
451 ** the statements made in the "Deadlock Prevention" notes earlier
452 ** in this file.
453 */
454 #ifndef NDEBUG
455 
/*
** Thread-id of the current holder of each mutex, or 0 if it is not held.
** The member order mirrors the order of the three mutexes at the start of
** the "async" structure; the wrappers below index both structures as
** parallel arrays and assert() that correspondence.
*/
static struct TestAsyncDebugData {
  pthread_t lockMutexHolder;     /* Holder of async.lockMutex */
  pthread_t queueMutexHolder;    /* Holder of async.queueMutex */
  pthread_t writerMutexHolder;   /* Holder of async.writerMutex */
} asyncdebug = {0, 0, 0};
461 
462 /*
463 ** Wrapper around pthread_mutex_lock(). Checks that we have not violated
464 ** the anti-deadlock rules (see "Deadlock prevention" above).
465 */
466 static int async_mutex_lock(pthread_mutex_t *pMutex){
467   int iIdx;
468   int rc;
469   pthread_mutex_t *aMutex = (pthread_mutex_t *)(&async);
470   pthread_t *aHolder = (pthread_t *)(&asyncdebug);
471 
472   /* The code in this 'ifndef NDEBUG' block depends on a certain alignment
473    * of the variables in TestAsyncStaticData and TestAsyncDebugData. The
474    * following assert() statements check that this has not been changed.
475    *
476    * Really, these only need to be run once at startup time.
477    */
478   assert(&(aMutex[0])==&async.lockMutex);
479   assert(&(aMutex[1])==&async.queueMutex);
480   assert(&(aMutex[2])==&async.writerMutex);
481   assert(&(aHolder[0])==&asyncdebug.lockMutexHolder);
482   assert(&(aHolder[1])==&asyncdebug.queueMutexHolder);
483   assert(&(aHolder[2])==&asyncdebug.writerMutexHolder);
484 
485   assert( pthread_self()!=0 );
486 
487   for(iIdx=0; iIdx<3; iIdx++){
488     if( pMutex==&aMutex[iIdx] ) break;
489 
490     /* This is the key assert(). Here we are checking that if the caller
491      * is trying to block on async.writerMutex, neither of the other two
492      * mutex are held. If the caller is trying to block on async.queueMutex,
493      * lockMutex is not held.
494      */
495     assert(!pthread_equal(aHolder[iIdx], pthread_self()));
496   }
497   assert(iIdx<3);
498 
499   rc = pthread_mutex_lock(pMutex);
500   if( rc==0 ){
501     assert(aHolder[iIdx]==0);
502     aHolder[iIdx] = pthread_self();
503   }
504   return rc;
505 }
506 
507 /*
508 ** Wrapper around pthread_mutex_unlock().
509 */
510 static int async_mutex_unlock(pthread_mutex_t *pMutex){
511   int iIdx;
512   int rc;
513   pthread_mutex_t *aMutex = (pthread_mutex_t *)(&async);
514   pthread_t *aHolder = (pthread_t *)(&asyncdebug);
515 
516   for(iIdx=0; iIdx<3; iIdx++){
517     if( pMutex==&aMutex[iIdx] ) break;
518   }
519   assert(iIdx<3);
520 
521   assert(pthread_equal(aHolder[iIdx], pthread_self()));
522   aHolder[iIdx] = 0;
523   rc = pthread_mutex_unlock(pMutex);
524   assert(rc==0);
525 
526   return 0;
527 }
528 
529 /*
530 ** Wrapper around pthread_mutex_trylock().
531 */
532 static int async_mutex_trylock(pthread_mutex_t *pMutex){
533   int iIdx;
534   int rc;
535   pthread_mutex_t *aMutex = (pthread_mutex_t *)(&async);
536   pthread_t *aHolder = (pthread_t *)(&asyncdebug);
537 
538   for(iIdx=0; iIdx<3; iIdx++){
539     if( pMutex==&aMutex[iIdx] ) break;
540   }
541   assert(iIdx<3);
542 
543   rc = pthread_mutex_trylock(pMutex);
544   if( rc==0 ){
545     assert(aHolder[iIdx]==0);
546     aHolder[iIdx] = pthread_self();
547   }
548   return rc;
549 }
550 
551 /*
552 ** Wrapper around pthread_cond_wait().
553 */
554 static int async_cond_wait(pthread_cond_t *pCond, pthread_mutex_t *pMutex){
555   int iIdx;
556   int rc;
557   pthread_mutex_t *aMutex = (pthread_mutex_t *)(&async);
558   pthread_t *aHolder = (pthread_t *)(&asyncdebug);
559 
560   for(iIdx=0; iIdx<3; iIdx++){
561     if( pMutex==&aMutex[iIdx] ) break;
562   }
563   assert(iIdx<3);
564 
565   assert(pthread_equal(aHolder[iIdx],pthread_self()));
566   aHolder[iIdx] = 0;
567   rc = pthread_cond_wait(pCond, pMutex);
568   if( rc==0 ){
569     aHolder[iIdx] = pthread_self();
570   }
571   return rc;
572 }
573 
574 /*
575 ** Assert that the mutex is held by the current thread.
576 */
577 static void assert_mutex_is_held(pthread_mutex_t *pMutex){
578   int iIdx;
579   pthread_mutex_t *aMutex = (pthread_mutex_t *)(&async);
580   pthread_t *aHolder = (pthread_t *)(&asyncdebug);
581 
582   for(iIdx=0; iIdx<3; iIdx++){
583     if( pMutex==&aMutex[iIdx] ) break;
584   }
585   assert(iIdx<3);
586   assert( aHolder[iIdx]==pthread_self() );
587 }
588 
/* Call our async_XX wrappers instead of selected pthread_XX functions.
** These redirections take effect for all code that follows; the wrappers
** themselves are defined above this point and so still reach the real
** pthread functions. */
#define pthread_mutex_lock    async_mutex_lock
#define pthread_mutex_unlock  async_mutex_unlock
#define pthread_mutex_trylock async_mutex_trylock
#define pthread_cond_wait     async_cond_wait

#else    /* if defined(NDEBUG) */

/* Without the holder bookkeeping there is nothing to check */
#define assert_mutex_is_held(X)    /* A no-op when not debugging */
598 
599 #endif   /* !defined(NDEBUG) */
600 
601 /*
602 ** Add an entry to the end of the global write-op list. pWrite should point
603 ** to an AsyncWrite structure allocated using sqlite3_malloc().  The writer
604 ** thread will call sqlite3_free() to free the structure after the specified
605 ** operation has been completed.
606 **
607 ** Once an AsyncWrite structure has been added to the list, it becomes the
608 ** property of the writer thread and must not be read or modified by the
609 ** caller.
610 */
611 static void addAsyncWrite(AsyncWrite *pWrite){
612   /* We must hold the queue mutex in order to modify the queue pointers */
613   pthread_mutex_lock(&async.queueMutex);
614 
615   /* Add the record to the end of the write-op queue */
616   assert( !pWrite->pNext );
617   if( async.pQueueLast ){
618     assert( async.pQueueFirst );
619     async.pQueueLast->pNext = pWrite;
620   }else{
621     async.pQueueFirst = pWrite;
622   }
623   async.pQueueLast = pWrite;
624   ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
625          pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
626 
627   if( pWrite->op==ASYNC_CLOSE ){
628     async.nFile--;
629   }
630 
631   /* Drop the queue mutex */
632   pthread_mutex_unlock(&async.queueMutex);
633 
634   /* The writer thread might have been idle because there was nothing
635   ** on the write-op queue for it to do.  So wake it up. */
636   pthread_cond_signal(&async.queueSignal);
637 }
638 
639 /*
640 ** Increment async.nFile in a thread-safe manner.
641 */
642 static void incrOpenFileCount(){
643   /* We must hold the queue mutex in order to modify async.nFile */
644   pthread_mutex_lock(&async.queueMutex);
645   if( async.nFile==0 ){
646     async.ioError = SQLITE_OK;
647   }
648   async.nFile++;
649   pthread_mutex_unlock(&async.queueMutex);
650 }
651 
652 /*
653 ** This is a utility function to allocate and populate a new AsyncWrite
654 ** structure and insert it (via addAsyncWrite() ) into the global list.
655 */
656 static int addNewAsyncWrite(
657   AsyncFileData *pFileData,
658   int op,
659   sqlite3_int64 iOffset,
660   int nByte,
661   const char *zByte
662 ){
663   AsyncWrite *p;
664   if( op!=ASYNC_CLOSE && async.ioError ){
665     return async.ioError;
666   }
667   p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
668   if( !p ){
669     /* The upper layer does not expect operations like OsWrite() to
670     ** return SQLITE_NOMEM. This is partly because under normal conditions
671     ** SQLite is required to do rollback without calling malloc(). So
672     ** if malloc() fails here, treat it as an I/O error. The above
673     ** layer knows how to handle that.
674     */
675     return SQLITE_IOERR;
676   }
677   p->op = op;
678   p->iOffset = iOffset;
679   p->nByte = nByte;
680   p->pFileData = pFileData;
681   p->pNext = 0;
682   if( zByte ){
683     p->zBuf = (char *)&p[1];
684     memcpy(p->zBuf, zByte, nByte);
685   }else{
686     p->zBuf = 0;
687   }
688   addAsyncWrite(p);
689   return SQLITE_OK;
690 }
691 
692 /*
693 ** Close the file. This just adds an entry to the write-op list, the file is
694 ** not actually closed.
695 */
696 static int asyncClose(sqlite3_file *pFile){
697   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
698 
699   /* Unlock the file, if it is locked */
700   pthread_mutex_lock(&async.lockMutex);
701   p->lock.eLock = 0;
702   pthread_mutex_unlock(&async.lockMutex);
703 
704   addAsyncWrite(&p->closeOp);
705   return SQLITE_OK;
706 }
707 
708 /*
709 ** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
710 ** writing to the underlying file, this function adds an entry to the end of
711 ** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
712 ** returned.
713 */
714 static int asyncWrite(
715   sqlite3_file *pFile,
716   const void *pBuf,
717   int amt,
718   sqlite3_int64 iOff
719 ){
720   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
721   return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
722 }
723 
724 /*
725 ** Read data from the file. First we read from the filesystem, then adjust
726 ** the contents of the buffer based on ASYNC_WRITE operations in the
727 ** write-op queue.
728 **
729 ** This method holds the mutex from start to finish.
730 */
731 static int asyncRead(
732   sqlite3_file *pFile,
733   void *zOut,
734   int iAmt,
735   sqlite3_int64 iOffset
736 ){
737   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
738   int rc = SQLITE_OK;
739   sqlite3_int64 filesize;
740   int nRead;
741   sqlite3_file *pBase = p->pBaseRead;
742 
743   /* Grab the write queue mutex for the duration of the call */
744   pthread_mutex_lock(&async.queueMutex);
745 
746   /* If an I/O error has previously occurred in this virtual file
747   ** system, then all subsequent operations fail.
748   */
749   if( async.ioError!=SQLITE_OK ){
750     rc = async.ioError;
751     goto asyncread_out;
752   }
753 
754   if( pBase->pMethods ){
755     rc = pBase->pMethods->xFileSize(pBase, &filesize);
756     if( rc!=SQLITE_OK ){
757       goto asyncread_out;
758     }
759     nRead = MIN(filesize - iOffset, iAmt);
760     if( nRead>0 ){
761       rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
762       ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
763     }
764   }
765 
766   if( rc==SQLITE_OK ){
767     AsyncWrite *pWrite;
768     char *zName = p->zName;
769 
770     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
771       if( pWrite->op==ASYNC_WRITE && (
772         (pWrite->pFileData==p) ||
773         (zName && pWrite->pFileData->zName==zName)
774       )){
775         int iBeginOut = (pWrite->iOffset-iOffset);
776         int iBeginIn = -iBeginOut;
777         int nCopy;
778 
779         if( iBeginIn<0 ) iBeginIn = 0;
780         if( iBeginOut<0 ) iBeginOut = 0;
781         nCopy = MIN(pWrite->nByte-iBeginIn, iAmt-iBeginOut);
782 
783         if( nCopy>0 ){
784           memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
785           ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
786         }
787       }
788     }
789   }
790 
791 asyncread_out:
792   pthread_mutex_unlock(&async.queueMutex);
793   return rc;
794 }
795 
796 /*
797 ** Truncate the file to nByte bytes in length. This just adds an entry to
798 ** the write-op list, no IO actually takes place.
799 */
800 static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
801   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
802   return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
803 }
804 
805 /*
806 ** Sync the file. This just adds an entry to the write-op list, the
807 ** sync() is done later by sqlite3_async_flush().
808 */
809 static int asyncSync(sqlite3_file *pFile, int flags){
810   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
811   return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
812 }
813 
814 /*
815 ** Read the size of the file. First we read the size of the file system
816 ** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
817 ** currently in the write-op list.
818 **
819 ** This method holds the mutex from start to finish.
820 */
821 int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
822   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
823   int rc = SQLITE_OK;
824   sqlite3_int64 s = 0;
825   sqlite3_file *pBase;
826 
827   pthread_mutex_lock(&async.queueMutex);
828 
829   /* Read the filesystem size from the base file. If pBaseRead is NULL, this
830   ** means the file hasn't been opened yet. In this case all relevant data
831   ** must be in the write-op queue anyway, so we can omit reading from the
832   ** file-system.
833   */
834   pBase = p->pBaseRead;
835   if( pBase->pMethods ){
836     rc = pBase->pMethods->xFileSize(pBase, &s);
837   }
838 
839   if( rc==SQLITE_OK ){
840     AsyncWrite *pWrite;
841     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
842       if( pWrite->op==ASYNC_DELETE
843        && p->zName
844        && strcmp(p->zName, pWrite->zBuf)==0
845       ){
846         s = 0;
847       }else if( pWrite->pFileData && (
848           (pWrite->pFileData==p)
849        || (p->zName && pWrite->pFileData->zName==p->zName)
850       )){
851         switch( pWrite->op ){
852           case ASYNC_WRITE:
853             s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
854             break;
855           case ASYNC_TRUNCATE:
856             s = MIN(s, pWrite->iOffset);
857             break;
858         }
859       }
860     }
861     *piSize = s;
862   }
863   pthread_mutex_unlock(&async.queueMutex);
864   return rc;
865 }
866 
867 /*
868 ** Lock or unlock the actual file-system entry.
869 */
870 static int getFileLock(AsyncLock *pLock){
871   int rc = SQLITE_OK;
872   AsyncFileLock *pIter;
873   int eRequired = 0;
874 
875   if( pLock->pFile ){
876     for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
877       assert(pIter->eAsyncLock>=pIter->eLock);
878       if( pIter->eAsyncLock>eRequired ){
879         eRequired = pIter->eAsyncLock;
880         assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
881       }
882     }
883 
884     if( eRequired>pLock->eLock ){
885       rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
886       if( rc==SQLITE_OK ){
887         pLock->eLock = eRequired;
888       }
889     }
890     else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
891       rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
892       if( rc==SQLITE_OK ){
893         pLock->eLock = eRequired;
894       }
895     }
896   }
897 
898   return rc;
899 }
900 
901 /*
902 ** Return the AsyncLock structure from the global async.pLock list
903 ** associated with the file-system entry identified by path zName
904 ** (a string of nName bytes). If no such structure exists, return 0.
905 */
906 static AsyncLock *findLock(const char *zName, int nName){
907   AsyncLock *p = async.pLock;
908   while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
909     p = p->pNext;
910   }
911   return p;
912 }
913 
914 /*
915 ** The following two methods - asyncLock() and asyncUnlock() - are used
916 ** to obtain and release locks on database files opened with the
917 ** asynchronous backend.
918 */
919 static int asyncLock(sqlite3_file *pFile, int eLock){
920   int rc = SQLITE_OK;
921   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
922 
923   if( p->zName ){
924     pthread_mutex_lock(&async.lockMutex);
925     if( p->lock.eLock<eLock ){
926       AsyncLock *pLock = p->pLock;
927       AsyncFileLock *pIter;
928       assert(pLock && pLock->pList);
929       for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
930         if( pIter!=&p->lock && (
931           (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
932           (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
933           (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
934           (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
935         )){
936           rc = SQLITE_BUSY;
937         }
938       }
939       if( rc==SQLITE_OK ){
940         p->lock.eLock = eLock;
941         p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
942       }
943       assert(p->lock.eAsyncLock>=p->lock.eLock);
944       if( rc==SQLITE_OK ){
945         rc = getFileLock(pLock);
946       }
947     }
948     pthread_mutex_unlock(&async.lockMutex);
949   }
950 
951   ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
952   return rc;
953 }
954 static int asyncUnlock(sqlite3_file *pFile, int eLock){
955   int rc = SQLITE_OK;
956   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
957   if( p->zName ){
958     AsyncFileLock *pLock = &p->lock;
959     pthread_mutex_lock(&async.lockMutex);
960     pLock->eLock = MIN(pLock->eLock, eLock);
961     pthread_mutex_unlock(&async.lockMutex);
962     rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
963   }
964   return rc;
965 }
966 
967 /*
968 ** This function is called when the pager layer first opens a database file
969 ** and is checking for a hot-journal.
970 */
971 static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
972   int ret = 0;
973   AsyncFileLock *pIter;
974   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
975 
976   pthread_mutex_lock(&async.lockMutex);
977   for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
978     if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
979       ret = 1;
980     }
981   }
982   pthread_mutex_unlock(&async.lockMutex);
983 
984   ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
985   *pResOut = ret;
986   return SQLITE_OK;
987 }
988 
989 /*
990 ** sqlite3_file_control() implementation.
991 */
992 static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
993   switch( op ){
994     case SQLITE_FCNTL_LOCKSTATE: {
995       pthread_mutex_lock(&async.lockMutex);
996       *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
997       pthread_mutex_unlock(&async.lockMutex);
998       return SQLITE_OK;
999     }
1000   }
1001   return SQLITE_ERROR;
1002 }
1003 
1004 /*
1005 ** Return the device characteristics and sector-size of the device. It
1006 ** is not tricky to implement these correctly, as this backend might
1007 ** not have an open file handle at this point.
1008 */
1009 static int asyncSectorSize(sqlite3_file *pFile){
1010   return 512;
1011 }
1012 static int asyncDeviceCharacteristics(sqlite3_file *pFile){
1013   return 0;
1014 }
1015 
1016 static int unlinkAsyncFile(AsyncFileData *pData){
1017   AsyncFileLock **ppIter;
1018   int rc = SQLITE_OK;
1019 
1020   if( pData->zName ){
1021     AsyncLock *pLock = pData->pLock;
1022     for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
1023       if( (*ppIter)==&pData->lock ){
1024         *ppIter = pData->lock.pNext;
1025         break;
1026       }
1027     }
1028     if( !pLock->pList ){
1029       AsyncLock **pp;
1030       if( pLock->pFile ){
1031         pLock->pFile->pMethods->xClose(pLock->pFile);
1032       }
1033       for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
1034       *pp = pLock->pNext;
1035       sqlite3_free(pLock);
1036     }else{
1037       rc = getFileLock(pLock);
1038     }
1039   }
1040 
1041   return rc;
1042 }
1043 
1044 /*
1045 ** The parameter passed to this function is a copy of a 'flags' parameter
1046 ** passed to this modules xOpen() method. This function returns true
1047 ** if the file should be opened asynchronously, or false if it should
1048 ** be opened immediately.
1049 **
1050 ** If the file is to be opened asynchronously, then asyncOpen() will add
1051 ** an entry to the event queue and the file will not actually be opened
1052 ** until the event is processed. Otherwise, the file is opened directly
1053 ** by the caller.
1054 */
1055 static int doAsynchronousOpen(int flags){
1056   return (flags&SQLITE_OPEN_CREATE) && (
1057       (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1058       (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1059       (flags&SQLITE_OPEN_DELETEONCLOSE)
1060   );
1061 }
1062 
1063 /*
1064 ** Open a file.
1065 */
1066 static int asyncOpen(
1067   sqlite3_vfs *pAsyncVfs,
1068   const char *zName,
1069   sqlite3_file *pFile,
1070   int flags,
1071   int *pOutFlags
1072 ){
1073   static sqlite3_io_methods async_methods = {
1074     1,                               /* iVersion */
1075     asyncClose,                      /* xClose */
1076     asyncRead,                       /* xRead */
1077     asyncWrite,                      /* xWrite */
1078     asyncTruncate,                   /* xTruncate */
1079     asyncSync,                       /* xSync */
1080     asyncFileSize,                   /* xFileSize */
1081     asyncLock,                       /* xLock */
1082     asyncUnlock,                     /* xUnlock */
1083     asyncCheckReservedLock,          /* xCheckReservedLock */
1084     asyncFileControl,                /* xFileControl */
1085     asyncSectorSize,                 /* xSectorSize */
1086     asyncDeviceCharacteristics       /* xDeviceCharacteristics */
1087   };
1088 
1089   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1090   AsyncFile *p = (AsyncFile *)pFile;
1091   int nName = 0;
1092   int rc = SQLITE_OK;
1093   int nByte;
1094   AsyncFileData *pData;
1095   AsyncLock *pLock = 0;
1096   char *z;
1097   int isAsyncOpen = doAsynchronousOpen(flags);
1098 
1099   /* If zName is NULL, then the upper layer is requesting an anonymous file */
1100   if( zName ){
1101     nName = strlen(zName)+1;
1102   }
1103 
1104   nByte = (
1105     sizeof(AsyncFileData) +        /* AsyncFileData structure */
1106     2 * pVfs->szOsFile +           /* AsyncFileData.pBaseRead and pBaseWrite */
1107     nName                          /* AsyncFileData.zName */
1108   );
1109   z = sqlite3_malloc(nByte);
1110   if( !z ){
1111     return SQLITE_NOMEM;
1112   }
1113   memset(z, 0, nByte);
1114   pData = (AsyncFileData*)z;
1115   z += sizeof(pData[0]);
1116   pData->pBaseRead = (sqlite3_file*)z;
1117   z += pVfs->szOsFile;
1118   pData->pBaseWrite = (sqlite3_file*)z;
1119   pData->closeOp.pFileData = pData;
1120   pData->closeOp.op = ASYNC_CLOSE;
1121 
1122   if( zName ){
1123     z += pVfs->szOsFile;
1124     pData->zName = z;
1125     pData->nName = nName;
1126     memcpy(pData->zName, zName, nName);
1127   }
1128 
1129   if( !isAsyncOpen ){
1130     int flagsout;
1131     rc = pVfs->xOpen(pVfs, zName, pData->pBaseRead, flags, &flagsout);
1132     if( rc==SQLITE_OK && (flagsout&SQLITE_OPEN_READWRITE) ){
1133       rc = pVfs->xOpen(pVfs, zName, pData->pBaseWrite, flags, 0);
1134     }
1135     if( pOutFlags ){
1136       *pOutFlags = flagsout;
1137     }
1138   }
1139 
1140   pthread_mutex_lock(&async.lockMutex);
1141 
1142   if( zName && rc==SQLITE_OK ){
1143     pLock = findLock(pData->zName, pData->nName);
1144     if( !pLock ){
1145       int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1146       pLock = (AsyncLock *)sqlite3_malloc(nByte);
1147       if( pLock ){
1148         memset(pLock, 0, nByte);
1149 #ifdef ENABLE_FILE_LOCKING
1150         if( flags&SQLITE_OPEN_MAIN_DB ){
1151           pLock->pFile = (sqlite3_file *)&pLock[1];
1152           rc = pVfs->xOpen(pVfs, zName, pLock->pFile, flags, 0);
1153           if( rc!=SQLITE_OK ){
1154             sqlite3_free(pLock);
1155             pLock = 0;
1156           }
1157         }
1158 #endif
1159         if( pLock ){
1160           pLock->nFile = pData->nName;
1161           pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1162           memcpy(pLock->zFile, pData->zName, pLock->nFile);
1163           pLock->pNext = async.pLock;
1164           async.pLock = pLock;
1165         }
1166       }else{
1167         rc = SQLITE_NOMEM;
1168       }
1169     }
1170   }
1171 
1172   if( rc==SQLITE_OK ){
1173     p->pMethod = &async_methods;
1174     p->pData = pData;
1175 
1176     /* Link AsyncFileData.lock into the linked list of
1177     ** AsyncFileLock structures for this file.
1178     */
1179     if( zName ){
1180       pData->lock.pNext = pLock->pList;
1181       pLock->pList = &pData->lock;
1182       pData->zName = pLock->zFile;
1183     }
1184   }else{
1185     if( pData->pBaseRead->pMethods ){
1186       pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1187     }
1188     if( pData->pBaseWrite->pMethods ){
1189       pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1190     }
1191     sqlite3_free(pData);
1192   }
1193 
1194   pthread_mutex_unlock(&async.lockMutex);
1195 
1196   if( rc==SQLITE_OK ){
1197     incrOpenFileCount();
1198     pData->pLock = pLock;
1199   }
1200 
1201   if( rc==SQLITE_OK && isAsyncOpen ){
1202     rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1203     if( rc==SQLITE_OK ){
1204       if( pOutFlags ) *pOutFlags = flags;
1205     }else{
1206       pthread_mutex_lock(&async.lockMutex);
1207       unlinkAsyncFile(pData);
1208       pthread_mutex_unlock(&async.lockMutex);
1209       sqlite3_free(pData);
1210     }
1211   }
1212   if( rc!=SQLITE_OK ){
1213     p->pMethod = 0;
1214   }
1215   return rc;
1216 }
1217 
1218 /*
1219 ** Implementation of sqlite3OsDelete. Add an entry to the end of the
1220 ** write-op queue to perform the delete.
1221 */
1222 static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1223   return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, strlen(z)+1, z);
1224 }
1225 
1226 /*
1227 ** Implementation of sqlite3OsAccess. This method holds the mutex from
1228 ** start to finish.
1229 */
1230 static int asyncAccess(
1231   sqlite3_vfs *pAsyncVfs,
1232   const char *zName,
1233   int flags,
1234   int *pResOut
1235 ){
1236   int rc;
1237   int ret;
1238   AsyncWrite *p;
1239   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1240 
1241   assert(flags==SQLITE_ACCESS_READWRITE
1242       || flags==SQLITE_ACCESS_READ
1243       || flags==SQLITE_ACCESS_EXISTS
1244   );
1245 
1246   pthread_mutex_lock(&async.queueMutex);
1247   rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1248   if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1249     for(p=async.pQueueFirst; p; p = p->pNext){
1250       if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1251         ret = 0;
1252       }else if( p->op==ASYNC_OPENEXCLUSIVE
1253              && p->pFileData->zName
1254              && 0==strcmp(p->pFileData->zName, zName)
1255       ){
1256         ret = 1;
1257       }
1258     }
1259   }
1260   ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1261     flags==SQLITE_ACCESS_READWRITE?"read-write":
1262     flags==SQLITE_ACCESS_READ?"read":"exists"
1263     , zName, ret)
1264   );
1265   pthread_mutex_unlock(&async.queueMutex);
1266   *pResOut = ret;
1267   return rc;
1268 }
1269 
1270 /*
1271 ** Fill in zPathOut with the full path to the file identified by zPath.
1272 */
1273 static int asyncFullPathname(
1274   sqlite3_vfs *pAsyncVfs,
1275   const char *zPath,
1276   int nPathOut,
1277   char *zPathOut
1278 ){
1279   int rc;
1280   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1281   rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1282 
1283   /* Because of the way intra-process file locking works, this backend
1284   ** needs to return a canonical path. The following block assumes the
1285   ** file-system uses unix style paths.
1286   */
1287   if( rc==SQLITE_OK ){
1288     int i, j;
1289     int n = nPathOut;
1290     char *z = zPathOut;
1291     while( n>1 && z[n-1]=='/' ){ n--; }
1292     for(i=j=0; i<n; i++){
1293       if( z[i]=='/' ){
1294         if( z[i+1]=='/' ) continue;
1295         if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1296           i += 1;
1297           continue;
1298         }
1299         if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1300           while( j>0 && z[j-1]!='/' ){ j--; }
1301           if( j>0 ){ j--; }
1302           i += 2;
1303           continue;
1304         }
1305       }
1306       z[j++] = z[i];
1307     }
1308     z[j] = 0;
1309   }
1310 
1311   return rc;
1312 }
1313 static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1314   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1315   return pVfs->xDlOpen(pVfs, zPath);
1316 }
1317 static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1318   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1319   pVfs->xDlError(pVfs, nByte, zErrMsg);
1320 }
1321 static void (*asyncDlSym(
1322   sqlite3_vfs *pAsyncVfs,
1323   void *pHandle,
1324   const char *zSymbol
1325 ))(void){
1326   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1327   return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1328 }
1329 static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1330   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1331   pVfs->xDlClose(pVfs, pHandle);
1332 }
1333 static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1334   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1335   return pVfs->xRandomness(pVfs, nByte, zBufOut);
1336 }
1337 static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1338   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1339   return pVfs->xSleep(pVfs, nMicro);
1340 }
1341 static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1342   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1343   return pVfs->xCurrentTime(pVfs, pTimeOut);
1344 }
1345 
/*
** The sqlite3_vfs object for the "async" VFS.
**
** pAppData and mxPathname start out as zero. asyncEnable() fills them
** in from the VFS returned by sqlite3_vfs_find(0) before registering
** this object; the methods below use pAppData to reach that underlying
** VFS. A non-zero pAppData is also how this file tells whether the
** async backend is currently enabled.
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  "async",              /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1365 
1366 /*
1367 ** Call this routine to enable or disable the
1368 ** asynchronous IO features implemented in this file.
1369 **
1370 ** This routine is not even remotely threadsafe.  Do not call
1371 ** this routine while any SQLite database connections are open.
1372 */
1373 static void asyncEnable(int enable){
1374   if( enable ){
1375     if( !async_vfs.pAppData ){
1376       async_vfs.pAppData = (void *)sqlite3_vfs_find(0);
1377       async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1378       sqlite3_vfs_register(&async_vfs, 1);
1379     }
1380   }else{
1381     if( async_vfs.pAppData ){
1382       sqlite3_vfs_unregister(&async_vfs);
1383       async_vfs.pAppData = 0;
1384     }
1385   }
1386 }
1387 
1388 /*
1389 ** This procedure runs in a separate thread, reading messages off of the
1390 ** write queue and processing them one by one.
1391 **
1392 ** If async.writerHaltNow is true, then this procedure exits
1393 ** after processing a single message.
1394 **
1395 ** If async.writerHaltWhenIdle is true, then this procedure exits when
1396 ** the write queue is empty.
1397 **
1398 ** If both of the above variables are false, this procedure runs
1399 ** indefinately, waiting for operations to be added to the write queue
1400 ** and processing them in the order in which they arrive.
1401 **
1402 ** An artifical delay of async.ioDelay milliseconds is inserted before
1403 ** each write operation in order to simulate the effect of a slow disk.
1404 **
1405 ** Only one instance of this procedure may be running at a time.
1406 */
1407 static void *asyncWriterThread(void *pIsStarted){
1408   sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1409   AsyncWrite *p = 0;
1410   int rc = SQLITE_OK;
1411   int holdingMutex = 0;
1412 
1413   if( pthread_mutex_trylock(&async.writerMutex) ){
1414     return 0;
1415   }
1416   (*(int *)pIsStarted) = 1;
1417   while( async.writerHaltNow==0 ){
1418     int doNotFree = 0;
1419     sqlite3_file *pBase = 0;
1420 
1421     if( !holdingMutex ){
1422       pthread_mutex_lock(&async.queueMutex);
1423     }
1424     while( (p = async.pQueueFirst)==0 ){
1425       pthread_cond_broadcast(&async.emptySignal);
1426       if( async.writerHaltWhenIdle ){
1427         pthread_mutex_unlock(&async.queueMutex);
1428         break;
1429       }else{
1430         ASYNC_TRACE(("IDLE\n"));
1431         pthread_cond_wait(&async.queueSignal, &async.queueMutex);
1432         ASYNC_TRACE(("WAKEUP\n"));
1433       }
1434     }
1435     if( p==0 ) break;
1436     holdingMutex = 1;
1437 
1438     /* Right now this thread is holding the mutex on the write-op queue.
1439     ** Variable 'p' points to the first entry in the write-op queue. In
1440     ** the general case, we hold on to the mutex for the entire body of
1441     ** the loop.
1442     **
1443     ** However in the cases enumerated below, we relinquish the mutex,
1444     ** perform the IO, and then re-request the mutex before removing 'p' from
1445     ** the head of the write-op queue. The idea is to increase concurrency with
1446     ** sqlite threads.
1447     **
1448     **     * An ASYNC_CLOSE operation.
1449     **     * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1450     **       the mutex, call the underlying xOpenExclusive() function, then
1451     **       re-aquire the mutex before seting the AsyncFile.pBaseRead
1452     **       variable.
1453     **     * ASYNC_SYNC and ASYNC_WRITE operations, if
1454     **       SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1455     **       file-handles are open for the particular file being "synced".
1456     */
1457     if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1458       p->op = ASYNC_NOOP;
1459     }
1460     if( p->pFileData ){
1461       pBase = p->pFileData->pBaseWrite;
1462       if(
1463         p->op==ASYNC_CLOSE ||
1464         p->op==ASYNC_OPENEXCLUSIVE ||
1465         (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1466       ){
1467         pthread_mutex_unlock(&async.queueMutex);
1468         holdingMutex = 0;
1469       }
1470       if( !pBase->pMethods ){
1471         pBase = p->pFileData->pBaseRead;
1472       }
1473     }
1474 
1475     switch( p->op ){
1476       case ASYNC_NOOP:
1477         break;
1478 
1479       case ASYNC_WRITE:
1480         assert( pBase );
1481         ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1482                 p->pFileData->zName, p->nByte, p->iOffset));
1483         rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1484         break;
1485 
1486       case ASYNC_SYNC:
1487         assert( pBase );
1488         ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1489         rc = pBase->pMethods->xSync(pBase, p->nByte);
1490         break;
1491 
1492       case ASYNC_TRUNCATE:
1493         assert( pBase );
1494         ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1495                 p->pFileData->zName, p->iOffset));
1496         rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1497         break;
1498 
1499       case ASYNC_CLOSE: {
1500         AsyncFileData *pData = p->pFileData;
1501         ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1502         if( pData->pBaseWrite->pMethods ){
1503           pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1504         }
1505         if( pData->pBaseRead->pMethods ){
1506           pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1507         }
1508 
1509         /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1510         ** structures for this file. Obtain the async.lockMutex mutex
1511         ** before doing so.
1512         */
1513         pthread_mutex_lock(&async.lockMutex);
1514         rc = unlinkAsyncFile(pData);
1515         pthread_mutex_unlock(&async.lockMutex);
1516 
1517         if( !holdingMutex ){
1518           pthread_mutex_lock(&async.queueMutex);
1519           holdingMutex = 1;
1520         }
1521         assert_mutex_is_held(&async.queueMutex);
1522         async.pQueueFirst = p->pNext;
1523         sqlite3_free(pData);
1524         doNotFree = 1;
1525         break;
1526       }
1527 
1528       case ASYNC_UNLOCK: {
1529         AsyncFileData *pData = p->pFileData;
1530         int eLock = p->nByte;
1531         pthread_mutex_lock(&async.lockMutex);
1532         pData->lock.eAsyncLock = MIN(
1533             pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1534         );
1535         assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1536         rc = getFileLock(pData->pLock);
1537         pthread_mutex_unlock(&async.lockMutex);
1538         break;
1539       }
1540 
1541       case ASYNC_DELETE:
1542         ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1543         rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1544         break;
1545 
1546       case ASYNC_OPENEXCLUSIVE: {
1547         int flags = (int)p->iOffset;
1548         AsyncFileData *pData = p->pFileData;
1549         ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1550         assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1551         rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1552         assert( holdingMutex==0 );
1553         pthread_mutex_lock(&async.queueMutex);
1554         holdingMutex = 1;
1555         break;
1556       }
1557 
1558       default: assert(!"Illegal value for AsyncWrite.op");
1559     }
1560 
1561     /* If we didn't hang on to the mutex during the IO op, obtain it now
1562     ** so that the AsyncWrite structure can be safely removed from the
1563     ** global write-op queue.
1564     */
1565     if( !holdingMutex ){
1566       pthread_mutex_lock(&async.queueMutex);
1567       holdingMutex = 1;
1568     }
1569     /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1570     if( p==async.pQueueLast ){
1571       async.pQueueLast = 0;
1572     }
1573     if( !doNotFree ){
1574       assert_mutex_is_held(&async.queueMutex);
1575       async.pQueueFirst = p->pNext;
1576       sqlite3_free(p);
1577     }
1578     assert( holdingMutex );
1579 
1580     /* An IO error has occurred. We cannot report the error back to the
1581     ** connection that requested the I/O since the error happened
1582     ** asynchronously.  The connection has already moved on.  There
1583     ** really is nobody to report the error to.
1584     **
1585     ** The file for which the error occurred may have been a database or
1586     ** journal file. Regardless, none of the currently queued operations
1587     ** associated with the same database should now be performed. Nor should
1588     ** any subsequently requested IO on either a database or journal file
1589     ** handle for the same database be accepted until the main database
1590     ** file handle has been closed and reopened.
1591     **
1592     ** Furthermore, no further IO should be queued or performed on any file
1593     ** handle associated with a database that may have been part of a
1594     ** multi-file transaction that included the database associated with
1595     ** the IO error (i.e. a database ATTACHed to the same handle at some
1596     ** point in time).
1597     */
1598     if( rc!=SQLITE_OK ){
1599       async.ioError = rc;
1600     }
1601 
1602     if( async.ioError && !async.pQueueFirst ){
1603       pthread_mutex_lock(&async.lockMutex);
1604       if( 0==async.pLock ){
1605         async.ioError = SQLITE_OK;
1606       }
1607       pthread_mutex_unlock(&async.lockMutex);
1608     }
1609 
1610     /* Drop the queue mutex before continuing to the next write operation
1611     ** in order to give other threads a chance to work with the write queue.
1612     */
1613     if( !async.pQueueFirst || !async.ioError ){
1614       pthread_mutex_unlock(&async.queueMutex);
1615       holdingMutex = 0;
1616       if( async.ioDelay>0 ){
1617         pVfs->xSleep(pVfs, async.ioDelay);
1618       }else{
1619         sched_yield();
1620       }
1621     }
1622   }
1623 
1624   pthread_mutex_unlock(&async.writerMutex);
1625   return 0;
1626 }
1627 
1628 /**************************************************************************
1629 ** The remaining code defines a Tcl interface for testing the asynchronous
1630 ** IO implementation in this file.
1631 **
1632 ** To adapt the code to a non-TCL environment, delete or comment out
1633 ** the code that follows.
1634 */
1635 
1636 /*
1637 ** sqlite3async_enable ?YES/NO?
1638 **
1639 ** Enable or disable the asynchronous I/O backend.  This command is
1640 ** not thread-safe.  Do not call it while any database connections
1641 ** are open.
1642 */
1643 static int testAsyncEnable(
1644   void * clientData,
1645   Tcl_Interp *interp,
1646   int objc,
1647   Tcl_Obj *CONST objv[]
1648 ){
1649   if( objc!=1 && objc!=2 ){
1650     Tcl_WrongNumArgs(interp, 1, objv, "?YES/NO?");
1651     return TCL_ERROR;
1652   }
1653   if( objc==1 ){
1654     Tcl_SetObjResult(interp, Tcl_NewBooleanObj(async_vfs.pAppData!=0));
1655   }else{
1656     int en;
1657     if( Tcl_GetBooleanFromObj(interp, objv[1], &en) ) return TCL_ERROR;
1658     asyncEnable(en);
1659   }
1660   return TCL_OK;
1661 }
1662 
1663 /*
1664 ** sqlite3async_halt  "now"|"idle"|"never"
1665 **
1666 ** Set the conditions at which the writer thread will halt.
1667 */
1668 static int testAsyncHalt(
1669   void * clientData,
1670   Tcl_Interp *interp,
1671   int objc,
1672   Tcl_Obj *CONST objv[]
1673 ){
1674   const char *zCond;
1675   if( objc!=2 ){
1676     Tcl_WrongNumArgs(interp, 1, objv, "\"now\"|\"idle\"|\"never\"");
1677     return TCL_ERROR;
1678   }
1679   zCond = Tcl_GetString(objv[1]);
1680   if( strcmp(zCond, "now")==0 ){
1681     async.writerHaltNow = 1;
1682     pthread_cond_broadcast(&async.queueSignal);
1683   }else if( strcmp(zCond, "idle")==0 ){
1684     async.writerHaltWhenIdle = 1;
1685     async.writerHaltNow = 0;
1686     pthread_cond_broadcast(&async.queueSignal);
1687   }else if( strcmp(zCond, "never")==0 ){
1688     async.writerHaltWhenIdle = 0;
1689     async.writerHaltNow = 0;
1690   }else{
1691     Tcl_AppendResult(interp,
1692       "should be one of: \"now\", \"idle\", or \"never\"", (char*)0);
1693     return TCL_ERROR;
1694   }
1695   return TCL_OK;
1696 }
1697 
1698 /*
1699 ** sqlite3async_delay ?MS?
1700 **
1701 ** Query or set the number of milliseconds of delay in the writer
1702 ** thread after each write operation.  The default is 0.  By increasing
1703 ** the memory delay we can simulate the effect of slow disk I/O.
1704 */
1705 static int testAsyncDelay(
1706   void * clientData,
1707   Tcl_Interp *interp,
1708   int objc,
1709   Tcl_Obj *CONST objv[]
1710 ){
1711   if( objc!=1 && objc!=2 ){
1712     Tcl_WrongNumArgs(interp, 1, objv, "?MS?");
1713     return TCL_ERROR;
1714   }
1715   if( objc==1 ){
1716     Tcl_SetObjResult(interp, Tcl_NewIntObj(async.ioDelay));
1717   }else{
1718     int ioDelay;
1719     if( Tcl_GetIntFromObj(interp, objv[1], &ioDelay) ) return TCL_ERROR;
1720     async.ioDelay = ioDelay;
1721   }
1722   return TCL_OK;
1723 }
1724 
1725 /*
1726 ** sqlite3async_start
1727 **
1728 ** Start a new writer thread.
1729 */
1730 static int testAsyncStart(
1731   void * clientData,
1732   Tcl_Interp *interp,
1733   int objc,
1734   Tcl_Obj *CONST objv[]
1735 ){
1736   pthread_t x;
1737   int rc;
1738   volatile int isStarted = 0;
1739   rc = pthread_create(&x, 0, asyncWriterThread, (void *)&isStarted);
1740   if( rc ){
1741     Tcl_AppendResult(interp, "failed to create the thread", 0);
1742     return TCL_ERROR;
1743   }
1744   pthread_detach(x);
1745   while( isStarted==0 ){
1746     sched_yield();
1747   }
1748   return TCL_OK;
1749 }
1750 
1751 /*
1752 ** sqlite3async_wait
1753 **
1754 ** Wait for the current writer thread to terminate.
1755 **
1756 ** If the current writer thread is set to run forever then this
1757 ** command would block forever.  To prevent that, an error is returned.
1758 */
1759 static int testAsyncWait(
1760   void * clientData,
1761   Tcl_Interp *interp,
1762   int objc,
1763   Tcl_Obj *CONST objv[]
1764 ){
1765   int cnt = 10;
1766   if( async.writerHaltNow==0 && async.writerHaltWhenIdle==0 ){
1767     Tcl_AppendResult(interp, "would block forever", (char*)0);
1768     return TCL_ERROR;
1769   }
1770 
1771   while( cnt-- && !pthread_mutex_trylock(&async.writerMutex) ){
1772     pthread_mutex_unlock(&async.writerMutex);
1773     sched_yield();
1774   }
1775   if( cnt>=0 ){
1776     ASYNC_TRACE(("WAIT\n"));
1777     pthread_mutex_lock(&async.queueMutex);
1778     pthread_cond_broadcast(&async.queueSignal);
1779     pthread_mutex_unlock(&async.queueMutex);
1780     pthread_mutex_lock(&async.writerMutex);
1781     pthread_mutex_unlock(&async.writerMutex);
1782   }else{
1783     ASYNC_TRACE(("NO-WAIT\n"));
1784   }
1785   return TCL_OK;
1786 }
1787 
1788 
1789 #endif  /* SQLITE_OS_UNIX and SQLITE_THREADSAFE */
1790 
1791 /*
1792 ** This routine registers the custom TCL commands defined in this
1793 ** module.  This should be the only procedure visible from outside
1794 ** of this module.
1795 */
1796 int Sqlitetestasync_Init(Tcl_Interp *interp){
1797 #if SQLITE_OS_UNIX && SQLITE_THREADSAFE
1798   Tcl_CreateObjCommand(interp,"sqlite3async_enable",testAsyncEnable,0,0);
1799   Tcl_CreateObjCommand(interp,"sqlite3async_halt",testAsyncHalt,0,0);
1800   Tcl_CreateObjCommand(interp,"sqlite3async_delay",testAsyncDelay,0,0);
1801   Tcl_CreateObjCommand(interp,"sqlite3async_start",testAsyncStart,0,0);
1802   Tcl_CreateObjCommand(interp,"sqlite3async_wait",testAsyncWait,0,0);
1803   Tcl_LinkVar(interp, "sqlite3async_trace",
1804       (char*)&sqlite3async_trace, TCL_LINK_INT);
1805 #endif  /* SQLITE_OS_UNIX and SQLITE_THREADSAFE */
1806   return TCL_OK;
1807 }
1808