xref: /sqlite-3.40.0/ext/async/sqlite3async.c (revision 78f1e538)
1 /*
2 ** 2005 December 14
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** $Id: sqlite3async.c,v 1.7 2009/07/18 11:52:04 danielk1977 Exp $
14 **
15 ** This file contains the implementation of an asynchronous IO backend
16 ** for SQLite.
17 */
18 
19 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
20 
21 #include "sqlite3async.h"
22 #include "sqlite3.h"
23 #include <stdarg.h>
24 #include <string.h>
25 #include <assert.h>
26 
/*
** Two-argument minimum and maximum, used in several places below.
** Each argument may be evaluated more than once, so the arguments
** must not have side effects.
*/
#define MIN(x,y) (((x)<(y)) ? (x) : (y))
#define MAX(x,y) (((x)>(y)) ? (x) : (y))
30 
#if !defined(SQLITE_AMALGAMATION)
/* Evaluate x and discard the result, so that a parameter that is not
** otherwise referenced does not produce a compiler warning. */
# define UNUSED_PARAMETER(x) (void)(x)
#endif
35 
/*
** Forward declarations of the structure types used in this file. The
** structure bodies are defined further down.
*/
typedef struct AsyncFile AsyncFile;
typedef struct AsyncFileData AsyncFileData;
typedef struct AsyncFileLock AsyncFileLock;
typedef struct AsyncLock AsyncLock;
typedef struct AsyncWrite AsyncWrite;
42 
/*
** Debugging trace support. Compiled in only when NDEBUG is not defined.
**
** ASYNC_TRACE(X) takes a parenthesized printf-style argument list, for
** example ASYNC_TRACE(("PUSH %p\n", p)), and passes it to asyncTrace()
** when the sqlite3async_trace flag is non-zero. The macro is wrapped in
** do{...}while(0) so that it behaves as a single statement, even when
** used as the body of an if/else. When NDEBUG is defined the macro
** expands to nothing.
*/
#ifndef NDEBUG
#include <stdio.h>
static int sqlite3async_trace = 0;
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)

/*
** Format the message using sqlite3_vmprintf() and write it to stderr.
** zFormat and the variable arguments are interpreted by SQLite's own
** printf implementation.
*/
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  if( z==0 ){
    /* sqlite3_vmprintf() could not allocate the message. Passing a NULL
    ** pointer to fprintf("%s") is undefined, so silently drop the trace. */
    return;
  }
  fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
  sqlite3_free(z);
}
#else
# define ASYNC_TRACE(X)
#endif
60 
61 /*
62 ** THREAD SAFETY NOTES
63 **
64 ** Basic rules:
65 **
66 **     * Both read and write access to the global write-op queue must be
67 **       protected by the async.queueMutex. As are the async.ioError and
68 **       async.nFile variables.
69 **
70 **     * The async.pLock list and all AsyncLock and AsyncFileLock
71 **       structures must be protected by the async.lockMutex mutex.
72 **
73 **     * The file handles from the underlying system are not assumed to
74 **       be thread safe.
75 **
76 **     * See the last two paragraphs under "The Writer Thread" for
77 **       an assumption to do with file-handle synchronization by the Os.
78 **
79 ** Deadlock prevention:
80 **
**     There are three mutexes used by the system: the "writer" mutex,
**     the "queue" mutex and the "lock" mutex. Rules are:
**
**     * It is illegal to block on the writer mutex when any other mutex
**       is held, and
**
**     * It is illegal to block on the queue mutex when the lock mutex
**       is held.
**
**     i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
91 **
92 ** File system operations (invoked by SQLite thread):
93 **
94 **     xOpen
95 **     xDelete
96 **     xFileExists
97 **
98 ** File handle operations (invoked by SQLite thread):
99 **
100 **         asyncWrite, asyncClose, asyncTruncate, asyncSync
101 **
102 **     The operations above add an entry to the global write-op list. They
103 **     prepare the entry, acquire the async.queueMutex momentarily while
104 **     list pointers are  manipulated to insert the new entry, then release
105 **     the mutex and signal the writer thread to wake up in case it happens
106 **     to be asleep.
107 **
108 **
109 **         asyncRead, asyncFileSize.
110 **
**     Read operations. Both of these read from the underlying file
**     first, then adjust their result based on pending writes in the
**     write-op queue. So async.queueMutex is held for the duration
**     of these operations to prevent other threads from changing the
**     queue in mid operation.
116 **
117 **
118 **         asyncLock, asyncUnlock, asyncCheckReservedLock
119 **
120 **     These primitives implement in-process locking using a hash table
121 **     on the file name.  Files are locked correctly for connections coming
122 **     from the same process.  But other processes cannot see these locks
123 **     and will therefore not honor them.
124 **
125 **
126 ** The writer thread:
127 **
**     The async.writerMutex is used to make sure there is only
**     a single writer thread running at a time.
130 **
131 **     Inside the writer thread is a loop that works like this:
132 **
133 **         WHILE (write-op list is not empty)
134 **             Do IO operation at head of write-op list
135 **             Remove entry from head of write-op list
136 **         END WHILE
137 **
138 **     The async.queueMutex is always held during the <write-op list is
139 **     not empty> test, and when the entry is removed from the head
140 **     of the write-op list. Sometimes it is held for the interim
141 **     period (while the IO is performed), and sometimes it is
142 **     relinquished. It is relinquished if (a) the IO op is an
143 **     ASYNC_CLOSE or (b) when the file handle was opened, two of
144 **     the underlying systems handles were opened on the same
145 **     file-system entry.
146 **
**     If condition (b) above is true, then one file-handle
**     (AsyncFileData.pBaseRead) is used exclusively by sqlite threads to
**     read the file, the other (AsyncFileData.pBaseWrite) by
**     sqlite3_async_flush() threads to perform write() operations. This
**     means that read operations are not blocked by asynchronous writes
**     (although asynchronous writes may still be blocked by reads).
153 **
154 **     This assumes that the OS keeps two handles open on the same file
155 **     properly in sync. That is, any read operation that starts after a
156 **     write operation on the same file system entry has completed returns
157 **     data consistent with the write. We also assume that if one thread
158 **     reads a file while another is writing it all bytes other than the
159 **     ones actually being written contain valid data.
160 **
161 **     If the above assumptions are not true, set the preprocessor symbol
162 **     SQLITE_ASYNC_TWO_FILEHANDLES to 0.
163 */
164 
165 
/*
** TESTONLY(X) expands to X in builds where NDEBUG is not defined (that
** is, where assert() is active) and to nothing otherwise. It is used to
** maintain state that only assert() statements look at.
*/
#ifdef NDEBUG
# define TESTONLY( X )
#else
# define TESTONLY( X ) X
#endif
171 
172 /*
173 ** PORTING FUNCTIONS
174 **
175 ** There are two definitions of the following functions. One for pthreads
176 ** compatible systems and one for Win32. These functions isolate the OS
177 ** specific code required by each platform.
178 **
179 ** The system uses three mutexes and a single condition variable. To
180 ** block on a mutex, async_mutex_enter() is called. The parameter passed
181 ** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
182 ** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
183 ** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
184 ** called with a parameter identifying the mutex being unlocked. Mutexes
185 ** are not recursive - it is an error to call async_mutex_enter() to
186 ** lock a mutex that is already locked, or to call async_mutex_leave()
187 ** to unlock a mutex that is not currently locked.
188 **
189 ** The async_cond_wait() and async_cond_signal() functions are modelled
190 ** on the pthreads functions with similar names. The first parameter to
191 ** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
192 ** is called the mutex identified by the second parameter must be held.
193 ** The mutex is unlocked, and the calling thread simultaneously begins
194 ** waiting for the condition variable to be signalled by another thread.
195 ** After another thread signals the condition variable, the calling
196 ** thread stops waiting, locks mutex eMutex and returns. The
197 ** async_cond_signal() function is used to signal the condition variable.
198 ** It is assumed that the mutex used by the thread calling async_cond_wait()
199 ** is held by the caller of async_cond_signal() (otherwise there would be
200 ** a race condition).
201 **
202 ** It is guaranteed that no other thread will call async_cond_wait() when
203 ** there is already a thread waiting on the condition variable.
204 **
** The async_sched_yield() function is called to suggest to the operating
** system that it would be a good time to shift the current thread off the
** CPU. The win32 version calls Sleep(0) and the pthreads version calls
** sched_yield(). The system will still work if this function is
** implemented as a no-op, but it might be marginally less efficient.
210 */
/* Mutex primitives. eMutex is one of the ASYNC_MUTEX_XXX values. */
static void async_mutex_enter(int eMutex);
static void async_mutex_leave(int eMutex);

/* Condition variable primitives. eCond is always ASYNC_COND_QUEUE. */
static void async_cond_wait(int eCond, int eMutex);
static void async_cond_signal(int eCond);

/* Hint to the OS that the calling thread may be descheduled. */
static void async_sched_yield(void);
216 
217 /*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
221 **
222 ** For pthreads builds, both of these functions are no-ops. For win32,
223 ** they provide an opportunity to initialize and finalize the required
224 ** mutex and condition variables.
225 **
226 ** If async_os_initialize() returns other than zero, then the initialization
227 ** fails and SQLITE_ERROR is returned to the user.
228 */
static int async_os_initialize(void);   /* Returns 0 on success */
static void async_os_shutdown(void);    /* Undo async_os_initialize() */
231 
232 /* Values for use as the 'eMutex' argument of the above functions. The
233 ** integer values assigned to these constants are important for assert()
234 ** statements that verify that mutexes are locked in the correct order.
235 ** Specifically, it is unsafe to try to lock mutex N while holding a lock
236 ** on mutex M if (M<=N).
237 */
#define ASYNC_MUTEX_LOCK    0     /* The "lock" mutex */
#define ASYNC_MUTEX_QUEUE   1     /* The "queue" mutex */
#define ASYNC_MUTEX_WRITER  2     /* The "writer" mutex */
241 
/* The only legal value for the 'eCond' argument of async_cond_wait()
** and async_cond_signal(). */
#define ASYNC_COND_QUEUE    0
244 
245 /*************************************************************************
246 ** Start of OS specific code.
247 */
248 #if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
249 
250 #include <windows.h>
251 
252 /* The following block contains the win32 specific code. */
253 
/* True if the calling thread is the recorded holder of mutex X. */
#define mutex_held(X) (primitives.aHolder[X]==GetCurrentThreadId())
255 
256 static struct AsyncPrimitives {
257   int isInit;
258   DWORD aHolder[3];
259   CRITICAL_SECTION aMutex[3];
260   HANDLE aCond[1];
261 } primitives = { 0 };
262 
263 static int async_os_initialize(void){
264   if( !primitives.isInit ){
265     primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
266     if( primitives.aCond[0]==NULL ){
267       return 1;
268     }
269     InitializeCriticalSection(&primitives.aMutex[0]);
270     InitializeCriticalSection(&primitives.aMutex[1]);
271     InitializeCriticalSection(&primitives.aMutex[2]);
272     primitives.isInit = 1;
273   }
274   return 0;
275 }
276 static void async_os_shutdown(void){
277   if( primitives.isInit ){
278     DeleteCriticalSection(&primitives.aMutex[0]);
279     DeleteCriticalSection(&primitives.aMutex[1]);
280     DeleteCriticalSection(&primitives.aMutex[2]);
281     CloseHandle(primitives.aCond[0]);
282     primitives.isInit = 0;
283   }
284 }
285 
286 /* The following block contains the Win32 specific code. */
287 static void async_mutex_enter(int eMutex){
288   assert( eMutex==0 || eMutex==1 || eMutex==2 );
289   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
290   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
291   assert( eMutex!=0 || (!mutex_held(0)) );
292   EnterCriticalSection(&primitives.aMutex[eMutex]);
293   TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
294 }
295 static void async_mutex_leave(int eMutex){
296   assert( eMutex==0 || eMutex==1 || eMutex==2 );
297   assert( mutex_held(eMutex) );
298   TESTONLY( primitives.aHolder[eMutex] = 0; )
299   LeaveCriticalSection(&primitives.aMutex[eMutex]);
300 }
301 static void async_cond_wait(int eCond, int eMutex){
302   ResetEvent(primitives.aCond[eCond]);
303   async_mutex_leave(eMutex);
304   WaitForSingleObject(primitives.aCond[eCond], INFINITE);
305   async_mutex_enter(eMutex);
306 }
307 static void async_cond_signal(int eCond){
308   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
309   SetEvent(primitives.aCond[eCond]);
310 }
/* Win32 implementation of async_sched_yield(). */
static void async_sched_yield(void){
  Sleep(0);
}
314 #else
315 
316 /* The following block contains the pthreads specific code. */
317 #include <pthread.h>
318 #include <sched.h>
319 
/* True if the calling thread is the recorded holder of mutex X. */
#define mutex_held(X) pthread_equal(pthread_self(), primitives.aHolder[X])
321 
/* Nothing to set up for pthreads builds; always reports success (0). */
static int async_os_initialize(void){
  return 0;
}

/* Nothing to tear down for pthreads builds. */
static void async_os_shutdown(void){
}
324 
/* Statically initialized pthreads objects used by the functions below. */
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];    /* The three mutexes */
  pthread_cond_t aCond[1];      /* The single condition variable */
  pthread_t aHolder[3];         /* Thread recorded as holding each mutex */
} primitives = {
  { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER },
  { PTHREAD_COND_INITIALIZER },
  { 0, 0, 0 }
};
337 
338 static void async_mutex_enter(int eMutex){
339   assert( eMutex==0 || eMutex==1 || eMutex==2 );
340   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
341   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
342   assert( eMutex!=0 || (!mutex_held(0)) );
343   pthread_mutex_lock(&primitives.aMutex[eMutex]);
344   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
345 }
346 static void async_mutex_leave(int eMutex){
347   assert( eMutex==0 || eMutex==1 || eMutex==2 );
348   assert( mutex_held(eMutex) );
349   TESTONLY( primitives.aHolder[eMutex] = 0; )
350   pthread_mutex_unlock(&primitives.aMutex[eMutex]);
351 }
352 static void async_cond_wait(int eCond, int eMutex){
353   assert( eMutex==0 || eMutex==1 || eMutex==2 );
354   assert( mutex_held(eMutex) );
355   TESTONLY( primitives.aHolder[eMutex] = 0; )
356   pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
357   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
358 }
359 static void async_cond_signal(int eCond){
360   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
361   pthread_cond_signal(&primitives.aCond[eCond]);
362 }
/* pthreads implementation of async_sched_yield(). */
static void async_sched_yield(void){
  sched_yield();
}
366 #endif
367 /*
368 ** End of OS specific code.
369 *************************************************************************/
370 
/* Assert that the calling thread holds mutex X (an ASYNC_MUTEX_XXX value). */
#define assert_mutex_is_held(X) assert( mutex_held(X) )
372 
373 
/* Default SQLITE_ASYNC_TWO_FILEHANDLES to 1 unless the build has already
** defined it (for example to 0). */
#if !defined(SQLITE_ASYNC_TWO_FILEHANDLES)
# define SQLITE_ASYNC_TWO_FILEHANDLES 1
#endif
378 
379 /*
380 ** State information is held in the static variable "async" defined
381 ** as the following structure.
382 **
383 ** Both async.ioError and async.nFile are protected by async.queueMutex.
384 */
385 static struct TestAsyncStaticData {
386   AsyncWrite *pQueueFirst;     /* Next write operation to be processed */
387   AsyncWrite *pQueueLast;      /* Last write operation on the list */
388   AsyncLock *pLock;            /* Linked list of all AsyncLock structures */
389   volatile int ioDelay;        /* Extra delay between write operations */
390   volatile int eHalt;          /* One of the SQLITEASYNC_HALT_XXX values */
391   volatile int bLockFiles;     /* Current value of "lockfiles" parameter */
392   int ioError;                 /* True if an IO error has occurred */
393   int nFile;                   /* Number of open files (from sqlite pov) */
394 } async = { 0,0,0,0,0,1,0,0 };
395 
/*
** Legal values for AsyncWrite.op. The values double as indexes into the
** azOpcodeName[] array, so the two must be kept in sync.
*/
#define ASYNC_NOOP           0
#define ASYNC_WRITE          1
#define ASYNC_SYNC           2
#define ASYNC_TRUNCATE       3
#define ASYNC_CLOSE          4
#define ASYNC_DELETE         5
#define ASYNC_OPENEXCLUSIVE  6
#define ASYNC_UNLOCK         7
405 
/*
** Human-readable opcode names, indexed by ASYNC_XXX value. Used for
** debugging output only. The order of the entries must match the
** numeric values of the ASYNC_XXX macros.
*/
static const char *azOpcodeName[] = {
  "NOOP",       /* 0 */
  "WRITE",      /* 1 */
  "SYNC",       /* 2 */
  "TRUNCATE",   /* 3 */
  "CLOSE",      /* 4 */
  "DELETE",     /* 5 */
  "OPENEX",     /* 6 */
  "UNLOCK"      /* 7 */
};
412 
413 /*
414 ** Entries on the write-op queue are instances of the AsyncWrite
415 ** structure, defined here.
416 **
417 ** The interpretation of the iOffset and nByte variables varies depending
418 ** on the value of AsyncWrite.op:
419 **
420 ** ASYNC_NOOP:
421 **     No values used.
422 **
423 ** ASYNC_WRITE:
424 **     iOffset -> Offset in file to write to.
425 **     nByte   -> Number of bytes of data to write (pointed to by zBuf).
426 **
427 ** ASYNC_SYNC:
428 **     nByte   -> flags to pass to sqlite3OsSync().
429 **
430 ** ASYNC_TRUNCATE:
431 **     iOffset -> Size to truncate file to.
432 **     nByte   -> Unused.
433 **
434 ** ASYNC_CLOSE:
435 **     iOffset -> Unused.
436 **     nByte   -> Unused.
437 **
438 ** ASYNC_DELETE:
439 **     iOffset -> Contains the "syncDir" flag.
440 **     nByte   -> Number of bytes of zBuf points to (file name).
441 **
442 ** ASYNC_OPENEXCLUSIVE:
443 **     iOffset -> Value of "delflag".
444 **     nByte   -> Number of bytes of zBuf points to (file name).
445 **
446 ** ASYNC_UNLOCK:
447 **     nByte   -> Argument to sqlite3OsUnlock().
448 **
449 **
450 ** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
451 ** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
452 ** single blob, so is deleted when sqlite3_free() is called on the parent
453 ** structure.
454 */
455 struct AsyncWrite {
456   AsyncFileData *pFileData;    /* File to write data to or sync */
457   int op;                      /* One of ASYNC_xxx etc. */
458   sqlite_int64 iOffset;        /* See above */
459   int nByte;          /* See above */
460   char *zBuf;         /* Data to write to file (or NULL if op!=ASYNC_WRITE) */
461   AsyncWrite *pNext;  /* Next write operation (to any file) */
462 };
463 
464 /*
465 ** An instance of this structure is created for each distinct open file
466 ** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and stored in the linked list headed by
** async.pLock. Entries are identified by the full pathnames of the
** opened files.
469 **
470 ** AsyncLock.pList points to the head of a linked list of AsyncFileLock
471 ** structures, one for each handle currently open on the file.
472 **
473 ** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
474 ** not passed to the sqlite3OsOpen() call), or if async.bLockFiles is
475 ** false, variables AsyncLock.pFile and AsyncLock.eLock are never used.
476 ** Otherwise, pFile is a file handle opened on the file in question and
477 ** used to obtain the file-system locks required by database connections
478 ** within this process.
479 **
480 ** See comments above the asyncLock() function for more details on
481 ** the implementation of database locking used by this backend.
482 */
483 struct AsyncLock {
484   char *zFile;
485   int nFile;
486   sqlite3_file *pFile;
487   int eLock;
488   AsyncFileLock *pList;
489   AsyncLock *pNext;           /* Next in linked list headed by async.pLock */
490 };
491 
492 /*
493 ** An instance of the following structure is allocated along with each
494 ** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
495 ** file was opened with the SQLITE_OPEN_MAIN_DB.
496 */
497 struct AsyncFileLock {
498   int eLock;                /* Internally visible lock state (sqlite pov) */
499   int eAsyncLock;           /* Lock-state with write-queue unlock */
500   AsyncFileLock *pNext;
501 };
502 
503 /*
504 ** The AsyncFile structure is a subclass of sqlite3_file used for
505 ** asynchronous IO.
506 **
507 ** All of the actual data for the structure is stored in the structure
508 ** pointed to by AsyncFile.pData, which is allocated as part of the
509 ** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
510 ** lifetime of the AsyncFile structure is ended by the caller after OsClose()
511 ** is called, but the data in AsyncFileData may be required by the
512 ** writer thread after that point.
513 */
514 struct AsyncFile {
515   sqlite3_io_methods *pMethod;
516   AsyncFileData *pData;
517 };
518 struct AsyncFileData {
519   char *zName;               /* Underlying OS filename - used for debugging */
520   int nName;                 /* Number of characters in zName */
521   sqlite3_file *pBaseRead;   /* Read handle to the underlying Os file */
522   sqlite3_file *pBaseWrite;  /* Write handle to the underlying Os file */
523   AsyncFileLock lock;        /* Lock state for this handle */
524   AsyncLock *pLock;          /* AsyncLock object for this file system entry */
525   AsyncWrite closeOp;        /* Preallocated close operation */
526 };
527 
528 /*
529 ** Add an entry to the end of the global write-op list. pWrite should point
530 ** to an AsyncWrite structure allocated using sqlite3_malloc().  The writer
531 ** thread will call sqlite3_free() to free the structure after the specified
532 ** operation has been completed.
533 **
534 ** Once an AsyncWrite structure has been added to the list, it becomes the
535 ** property of the writer thread and must not be read or modified by the
536 ** caller.
537 */
538 static void addAsyncWrite(AsyncWrite *pWrite){
539   /* We must hold the queue mutex in order to modify the queue pointers */
540   if( pWrite->op!=ASYNC_UNLOCK ){
541     async_mutex_enter(ASYNC_MUTEX_QUEUE);
542   }
543 
544   /* Add the record to the end of the write-op queue */
545   assert( !pWrite->pNext );
546   if( async.pQueueLast ){
547     assert( async.pQueueFirst );
548     async.pQueueLast->pNext = pWrite;
549   }else{
550     async.pQueueFirst = pWrite;
551   }
552   async.pQueueLast = pWrite;
553   ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
554          pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
555 
556   if( pWrite->op==ASYNC_CLOSE ){
557     async.nFile--;
558   }
559 
560   /* The writer thread might have been idle because there was nothing
561   ** on the write-op queue for it to do.  So wake it up. */
562   async_cond_signal(ASYNC_COND_QUEUE);
563 
564   /* Drop the queue mutex */
565   if( pWrite->op!=ASYNC_UNLOCK ){
566     async_mutex_leave(ASYNC_MUTEX_QUEUE);
567   }
568 }
569 
570 /*
571 ** Increment async.nFile in a thread-safe manner.
572 */
573 static void incrOpenFileCount(void){
574   /* We must hold the queue mutex in order to modify async.nFile */
575   async_mutex_enter(ASYNC_MUTEX_QUEUE);
576   if( async.nFile==0 ){
577     async.ioError = SQLITE_OK;
578   }
579   async.nFile++;
580   async_mutex_leave(ASYNC_MUTEX_QUEUE);
581 }
582 
583 /*
584 ** This is a utility function to allocate and populate a new AsyncWrite
585 ** structure and insert it (via addAsyncWrite() ) into the global list.
586 */
587 static int addNewAsyncWrite(
588   AsyncFileData *pFileData,
589   int op,
590   sqlite3_int64 iOffset,
591   int nByte,
592   const char *zByte
593 ){
594   AsyncWrite *p;
595   if( op!=ASYNC_CLOSE && async.ioError ){
596     return async.ioError;
597   }
598   p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
599   if( !p ){
600     /* The upper layer does not expect operations like OsWrite() to
601     ** return SQLITE_NOMEM. This is partly because under normal conditions
602     ** SQLite is required to do rollback without calling malloc(). So
603     ** if malloc() fails here, treat it as an I/O error. The above
604     ** layer knows how to handle that.
605     */
606     return SQLITE_IOERR;
607   }
608   p->op = op;
609   p->iOffset = iOffset;
610   p->nByte = nByte;
611   p->pFileData = pFileData;
612   p->pNext = 0;
613   if( zByte ){
614     p->zBuf = (char *)&p[1];
615     memcpy(p->zBuf, zByte, nByte);
616   }else{
617     p->zBuf = 0;
618   }
619   addAsyncWrite(p);
620   return SQLITE_OK;
621 }
622 
623 /*
624 ** Close the file. This just adds an entry to the write-op list, the file is
625 ** not actually closed.
626 */
627 static int asyncClose(sqlite3_file *pFile){
628   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
629 
630   /* Unlock the file, if it is locked */
631   async_mutex_enter(ASYNC_MUTEX_LOCK);
632   p->lock.eLock = 0;
633   async_mutex_leave(ASYNC_MUTEX_LOCK);
634 
635   addAsyncWrite(&p->closeOp);
636   return SQLITE_OK;
637 }
638 
639 /*
640 ** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
641 ** writing to the underlying file, this function adds an entry to the end of
642 ** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
643 ** returned.
644 */
645 static int asyncWrite(
646   sqlite3_file *pFile,
647   const void *pBuf,
648   int amt,
649   sqlite3_int64 iOff
650 ){
651   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
652   return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
653 }
654 
655 /*
656 ** Read data from the file. First we read from the filesystem, then adjust
657 ** the contents of the buffer based on ASYNC_WRITE operations in the
658 ** write-op queue.
659 **
660 ** This method holds the mutex from start to finish.
661 */
662 static int asyncRead(
663   sqlite3_file *pFile,
664   void *zOut,
665   int iAmt,
666   sqlite3_int64 iOffset
667 ){
668   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
669   int rc = SQLITE_OK;
670   sqlite3_int64 filesize = 0;
671   sqlite3_file *pBase = p->pBaseRead;
672   sqlite3_int64 iAmt64 = (sqlite3_int64)iAmt;
673 
674   /* Grab the write queue mutex for the duration of the call */
675   async_mutex_enter(ASYNC_MUTEX_QUEUE);
676 
677   /* If an I/O error has previously occurred in this virtual file
678   ** system, then all subsequent operations fail.
679   */
680   if( async.ioError!=SQLITE_OK ){
681     rc = async.ioError;
682     goto asyncread_out;
683   }
684 
685   if( pBase->pMethods ){
686     sqlite3_int64 nRead;
687     rc = pBase->pMethods->xFileSize(pBase, &filesize);
688     if( rc!=SQLITE_OK ){
689       goto asyncread_out;
690     }
691     nRead = MIN(filesize - iOffset, iAmt64);
692     if( nRead>0 ){
693       rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
694       ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
695     }
696   }
697 
698   if( rc==SQLITE_OK ){
699     AsyncWrite *pWrite;
700     char *zName = p->zName;
701 
702     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
703       if( pWrite->op==ASYNC_WRITE && (
704         (pWrite->pFileData==p) ||
705         (zName && pWrite->pFileData->zName==zName)
706       )){
707         sqlite3_int64 nCopy;
708         sqlite3_int64 nByte64 = (sqlite3_int64)pWrite->nByte;
709         filesize = MAX(filesize, pWrite->iOffset+nByte64);
710 
711         /* Set variable iBeginIn to the offset in buffer pWrite->zBuf[] from
712         ** which data should be copied. Set iBeginOut to the offset within
713         ** the output buffer to which data should be copied. If either of
714         ** these offsets is a negative number, set them to 0.
715         */
716         sqlite3_int64 iBeginOut = (pWrite->iOffset-iOffset);
717         sqlite3_int64 iBeginIn = -iBeginOut;
718         if( iBeginIn<0 ) iBeginIn = 0;
719         if( iBeginOut<0 ) iBeginOut = 0;
720 
721         nCopy = MIN(nByte64-iBeginIn, iAmt64-iBeginOut);
722         if( nCopy>0 ){
723           memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
724           ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
725         }
726       }
727     }
728   }
729 
730 asyncread_out:
731   async_mutex_leave(ASYNC_MUTEX_QUEUE);
732   if( rc==SQLITE_OK && filesize<(iOffset+iAmt) ){
733     rc = SQLITE_IOERR_SHORT_READ;
734   }
735   return rc;
736 }
737 
738 /*
739 ** Truncate the file to nByte bytes in length. This just adds an entry to
740 ** the write-op list, no IO actually takes place.
741 */
742 static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
743   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
744   return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
745 }
746 
747 /*
748 ** Sync the file. This just adds an entry to the write-op list, the
749 ** sync() is done later by sqlite3_async_flush().
750 */
751 static int asyncSync(sqlite3_file *pFile, int flags){
752   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
753   return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
754 }
755 
756 /*
757 ** Read the size of the file. First we read the size of the file system
758 ** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
759 ** currently in the write-op list.
760 **
761 ** This method holds the mutex from start to finish.
762 */
763 int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
764   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
765   int rc = SQLITE_OK;
766   sqlite3_int64 s = 0;
767   sqlite3_file *pBase;
768 
769   async_mutex_enter(ASYNC_MUTEX_QUEUE);
770 
771   /* Read the filesystem size from the base file. If pMethods is NULL, this
772   ** means the file hasn't been opened yet. In this case all relevant data
773   ** must be in the write-op queue anyway, so we can omit reading from the
774   ** file-system.
775   */
776   pBase = p->pBaseRead;
777   if( pBase->pMethods ){
778     rc = pBase->pMethods->xFileSize(pBase, &s);
779   }
780 
781   if( rc==SQLITE_OK ){
782     AsyncWrite *pWrite;
783     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
784       if( pWrite->op==ASYNC_DELETE
785        && p->zName
786        && strcmp(p->zName, pWrite->zBuf)==0
787       ){
788         s = 0;
789       }else if( pWrite->pFileData && (
790           (pWrite->pFileData==p)
791        || (p->zName && pWrite->pFileData->zName==p->zName)
792       )){
793         switch( pWrite->op ){
794           case ASYNC_WRITE:
795             s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
796             break;
797           case ASYNC_TRUNCATE:
798             s = MIN(s, pWrite->iOffset);
799             break;
800         }
801       }
802     }
803     *piSize = s;
804   }
805   async_mutex_leave(ASYNC_MUTEX_QUEUE);
806   return rc;
807 }
808 
809 /*
810 ** Lock or unlock the actual file-system entry.
811 */
812 static int getFileLock(AsyncLock *pLock){
813   int rc = SQLITE_OK;
814   AsyncFileLock *pIter;
815   int eRequired = 0;
816 
817   if( pLock->pFile ){
818     for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
819       assert(pIter->eAsyncLock>=pIter->eLock);
820       if( pIter->eAsyncLock>eRequired ){
821         eRequired = pIter->eAsyncLock;
822         assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
823       }
824     }
825 
826     if( eRequired>pLock->eLock ){
827       rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
828       if( rc==SQLITE_OK ){
829         pLock->eLock = eRequired;
830       }
831     }
832     else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
833       rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
834       if( rc==SQLITE_OK ){
835         pLock->eLock = eRequired;
836       }
837     }
838   }
839 
840   return rc;
841 }
842 
843 /*
844 ** Return the AsyncLock structure from the global async.pLock list
845 ** associated with the file-system entry identified by path zName
846 ** (a string of nName bytes). If no such structure exists, return 0.
847 */
848 static AsyncLock *findLock(const char *zName, int nName){
849   AsyncLock *p = async.pLock;
850   while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
851     p = p->pNext;
852   }
853   return p;
854 }
855 
856 /*
857 ** The following two methods - asyncLock() and asyncUnlock() - are used
858 ** to obtain and release locks on database files opened with the
859 ** asynchronous backend.
860 */
861 static int asyncLock(sqlite3_file *pFile, int eLock){
862   int rc = SQLITE_OK;
863   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
864 
865   if( p->zName ){
866     async_mutex_enter(ASYNC_MUTEX_LOCK);
867     if( p->lock.eLock<eLock ){
868       AsyncLock *pLock = p->pLock;
869       AsyncFileLock *pIter;
870       assert(pLock && pLock->pList);
871       for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
872         if( pIter!=&p->lock && (
873           (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
874           (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
875           (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
876           (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
877         )){
878           rc = SQLITE_BUSY;
879         }
880       }
881       if( rc==SQLITE_OK ){
882         p->lock.eLock = eLock;
883         p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
884       }
885       assert(p->lock.eAsyncLock>=p->lock.eLock);
886       if( rc==SQLITE_OK ){
887         rc = getFileLock(pLock);
888       }
889     }
890     async_mutex_leave(ASYNC_MUTEX_LOCK);
891   }
892 
893   ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
894   return rc;
895 }
896 static int asyncUnlock(sqlite3_file *pFile, int eLock){
897   int rc = SQLITE_OK;
898   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
899   if( p->zName ){
900     AsyncFileLock *pLock = &p->lock;
901     async_mutex_enter(ASYNC_MUTEX_QUEUE);
902     async_mutex_enter(ASYNC_MUTEX_LOCK);
903     pLock->eLock = MIN(pLock->eLock, eLock);
904     rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
905     async_mutex_leave(ASYNC_MUTEX_LOCK);
906     async_mutex_leave(ASYNC_MUTEX_QUEUE);
907   }
908   return rc;
909 }
910 
911 /*
912 ** This function is called when the pager layer first opens a database file
913 ** and is checking for a hot-journal.
914 */
915 static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
916   int ret = 0;
917   AsyncFileLock *pIter;
918   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
919 
920   async_mutex_enter(ASYNC_MUTEX_LOCK);
921   for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
922     if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
923       ret = 1;
924       break;
925     }
926   }
927   async_mutex_leave(ASYNC_MUTEX_LOCK);
928 
929   ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
930   *pResOut = ret;
931   return SQLITE_OK;
932 }
933 
934 /*
935 ** sqlite3_file_control() implementation.
936 */
937 static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
938   switch( op ){
939     case SQLITE_FCNTL_LOCKSTATE: {
940       async_mutex_enter(ASYNC_MUTEX_LOCK);
941       *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
942       async_mutex_leave(ASYNC_MUTEX_LOCK);
943       return SQLITE_OK;
944     }
945   }
946   return SQLITE_ERROR;
947 }
948 
949 /*
950 ** Return the device characteristics and sector-size of the device. It
951 ** is tricky to implement these correctly, as this backend might
952 ** not have an open file handle at this point.
953 */
954 static int asyncSectorSize(sqlite3_file *pFile){
955   UNUSED_PARAMETER(pFile);
956   return 512;
957 }
958 static int asyncDeviceCharacteristics(sqlite3_file *pFile){
959   UNUSED_PARAMETER(pFile);
960   return 0;
961 }
962 
963 static int unlinkAsyncFile(AsyncFileData *pData){
964   AsyncFileLock **ppIter;
965   int rc = SQLITE_OK;
966 
967   if( pData->zName ){
968     AsyncLock *pLock = pData->pLock;
969     for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
970       if( (*ppIter)==&pData->lock ){
971         *ppIter = pData->lock.pNext;
972         break;
973       }
974     }
975     if( !pLock->pList ){
976       AsyncLock **pp;
977       if( pLock->pFile ){
978         pLock->pFile->pMethods->xClose(pLock->pFile);
979       }
980       for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
981       *pp = pLock->pNext;
982       sqlite3_free(pLock);
983     }else{
984       rc = getFileLock(pLock);
985     }
986   }
987 
988   return rc;
989 }
990 
991 /*
992 ** The parameter passed to this function is a copy of a 'flags' parameter
993 ** passed to this modules xOpen() method. This function returns true
994 ** if the file should be opened asynchronously, or false if it should
995 ** be opened immediately.
996 **
997 ** If the file is to be opened asynchronously, then asyncOpen() will add
998 ** an entry to the event queue and the file will not actually be opened
999 ** until the event is processed. Otherwise, the file is opened directly
1000 ** by the caller.
1001 */
1002 static int doAsynchronousOpen(int flags){
1003   return (flags&SQLITE_OPEN_CREATE) && (
1004       (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1005       (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1006       (flags&SQLITE_OPEN_DELETEONCLOSE)
1007   );
1008 }
1009 
1010 /*
1011 ** Open a file.
1012 */
1013 static int asyncOpen(
1014   sqlite3_vfs *pAsyncVfs,
1015   const char *zName,
1016   sqlite3_file *pFile,
1017   int flags,
1018   int *pOutFlags
1019 ){
1020   static sqlite3_io_methods async_methods = {
1021     1,                               /* iVersion */
1022     asyncClose,                      /* xClose */
1023     asyncRead,                       /* xRead */
1024     asyncWrite,                      /* xWrite */
1025     asyncTruncate,                   /* xTruncate */
1026     asyncSync,                       /* xSync */
1027     asyncFileSize,                   /* xFileSize */
1028     asyncLock,                       /* xLock */
1029     asyncUnlock,                     /* xUnlock */
1030     asyncCheckReservedLock,          /* xCheckReservedLock */
1031     asyncFileControl,                /* xFileControl */
1032     asyncSectorSize,                 /* xSectorSize */
1033     asyncDeviceCharacteristics       /* xDeviceCharacteristics */
1034   };
1035 
1036   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1037   AsyncFile *p = (AsyncFile *)pFile;
1038   int nName = 0;
1039   int rc = SQLITE_OK;
1040   int nByte;
1041   AsyncFileData *pData;
1042   AsyncLock *pLock = 0;
1043   char *z;
1044   int isAsyncOpen = doAsynchronousOpen(flags);
1045 
1046   /* If zName is NULL, then the upper layer is requesting an anonymous file */
1047   if( zName ){
1048     nName = (int)strlen(zName)+1;
1049   }
1050 
1051   nByte = (
1052     sizeof(AsyncFileData) +        /* AsyncFileData structure */
1053     2 * pVfs->szOsFile +           /* AsyncFileData.pBaseRead and pBaseWrite */
1054     nName                          /* AsyncFileData.zName */
1055   );
1056   z = sqlite3_malloc(nByte);
1057   if( !z ){
1058     return SQLITE_NOMEM;
1059   }
1060   memset(z, 0, nByte);
1061   pData = (AsyncFileData*)z;
1062   z += sizeof(pData[0]);
1063   pData->pBaseRead = (sqlite3_file*)z;
1064   z += pVfs->szOsFile;
1065   pData->pBaseWrite = (sqlite3_file*)z;
1066   pData->closeOp.pFileData = pData;
1067   pData->closeOp.op = ASYNC_CLOSE;
1068 
1069   if( zName ){
1070     z += pVfs->szOsFile;
1071     pData->zName = z;
1072     pData->nName = nName;
1073     memcpy(pData->zName, zName, nName);
1074   }
1075 
1076   if( !isAsyncOpen ){
1077     int flagsout;
1078     rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
1079     if( rc==SQLITE_OK
1080      && (flagsout&SQLITE_OPEN_READWRITE)
1081      && (flags&SQLITE_OPEN_EXCLUSIVE)==0
1082     ){
1083       rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1084     }
1085     if( pOutFlags ){
1086       *pOutFlags = flagsout;
1087     }
1088   }
1089 
1090   async_mutex_enter(ASYNC_MUTEX_LOCK);
1091 
1092   if( zName && rc==SQLITE_OK ){
1093     pLock = findLock(pData->zName, pData->nName);
1094     if( !pLock ){
1095       int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1096       pLock = (AsyncLock *)sqlite3_malloc(nByte);
1097       if( pLock ){
1098         memset(pLock, 0, nByte);
1099         if( async.bLockFiles && (flags&SQLITE_OPEN_MAIN_DB) ){
1100           pLock->pFile = (sqlite3_file *)&pLock[1];
1101           rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1102           if( rc!=SQLITE_OK ){
1103             sqlite3_free(pLock);
1104             pLock = 0;
1105           }
1106         }
1107         if( pLock ){
1108           pLock->nFile = pData->nName;
1109           pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1110           memcpy(pLock->zFile, pData->zName, pLock->nFile);
1111           pLock->pNext = async.pLock;
1112           async.pLock = pLock;
1113         }
1114       }else{
1115         rc = SQLITE_NOMEM;
1116       }
1117     }
1118   }
1119 
1120   if( rc==SQLITE_OK ){
1121     p->pMethod = &async_methods;
1122     p->pData = pData;
1123 
1124     /* Link AsyncFileData.lock into the linked list of
1125     ** AsyncFileLock structures for this file.
1126     */
1127     if( zName ){
1128       pData->lock.pNext = pLock->pList;
1129       pLock->pList = &pData->lock;
1130       pData->zName = pLock->zFile;
1131     }
1132   }else{
1133     if( pData->pBaseRead->pMethods ){
1134       pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1135     }
1136     if( pData->pBaseWrite->pMethods ){
1137       pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1138     }
1139     sqlite3_free(pData);
1140   }
1141 
1142   async_mutex_leave(ASYNC_MUTEX_LOCK);
1143 
1144   if( rc==SQLITE_OK ){
1145     incrOpenFileCount();
1146     pData->pLock = pLock;
1147   }
1148 
1149   if( rc==SQLITE_OK && isAsyncOpen ){
1150     rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1151     if( rc==SQLITE_OK ){
1152       if( pOutFlags ) *pOutFlags = flags;
1153     }else{
1154       async_mutex_enter(ASYNC_MUTEX_LOCK);
1155       unlinkAsyncFile(pData);
1156       async_mutex_leave(ASYNC_MUTEX_LOCK);
1157       sqlite3_free(pData);
1158     }
1159   }
1160   if( rc!=SQLITE_OK ){
1161     p->pMethod = 0;
1162   }
1163   return rc;
1164 }
1165 
1166 /*
1167 ** Implementation of sqlite3OsDelete. Add an entry to the end of the
1168 ** write-op queue to perform the delete.
1169 */
1170 static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1171   UNUSED_PARAMETER(pAsyncVfs);
1172   return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, (int)strlen(z)+1, z);
1173 }
1174 
1175 /*
1176 ** Implementation of sqlite3OsAccess. This method holds the mutex from
1177 ** start to finish.
1178 */
1179 static int asyncAccess(
1180   sqlite3_vfs *pAsyncVfs,
1181   const char *zName,
1182   int flags,
1183   int *pResOut
1184 ){
1185   int rc;
1186   int ret;
1187   AsyncWrite *p;
1188   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1189 
1190   assert(flags==SQLITE_ACCESS_READWRITE
1191       || flags==SQLITE_ACCESS_READ
1192       || flags==SQLITE_ACCESS_EXISTS
1193   );
1194 
1195   async_mutex_enter(ASYNC_MUTEX_QUEUE);
1196   rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1197   if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1198     for(p=async.pQueueFirst; p; p = p->pNext){
1199       if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1200         ret = 0;
1201       }else if( p->op==ASYNC_OPENEXCLUSIVE
1202              && p->pFileData->zName
1203              && 0==strcmp(p->pFileData->zName, zName)
1204       ){
1205         ret = 1;
1206       }
1207     }
1208   }
1209   ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1210     flags==SQLITE_ACCESS_READWRITE?"read-write":
1211     flags==SQLITE_ACCESS_READ?"read":"exists"
1212     , zName, ret)
1213   );
1214   async_mutex_leave(ASYNC_MUTEX_QUEUE);
1215   *pResOut = ret;
1216   return rc;
1217 }
1218 
1219 /*
1220 ** Fill in zPathOut with the full path to the file identified by zPath.
1221 */
1222 static int asyncFullPathname(
1223   sqlite3_vfs *pAsyncVfs,
1224   const char *zPath,
1225   int nPathOut,
1226   char *zPathOut
1227 ){
1228   int rc;
1229   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1230   rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1231 
1232   /* Because of the way intra-process file locking works, this backend
1233   ** needs to return a canonical path. The following block assumes the
1234   ** file-system uses unix style paths.
1235   */
1236   if( rc==SQLITE_OK ){
1237     int i, j;
1238     char *z = zPathOut;
1239     int n = strlen(z);
1240     while( n>1 && z[n-1]=='/' ){ n--; }
1241     for(i=j=0; i<n; i++){
1242       if( z[i]=='/' ){
1243         if( z[i+1]=='/' ) continue;
1244         if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1245           i += 1;
1246           continue;
1247         }
1248         if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1249           while( j>0 && z[j-1]!='/' ){ j--; }
1250           if( j>0 ){ j--; }
1251           i += 2;
1252           continue;
1253         }
1254       }
1255       z[j++] = z[i];
1256     }
1257     z[j] = 0;
1258   }
1259 
1260   return rc;
1261 }
1262 static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1263   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1264   return pVfs->xDlOpen(pVfs, zPath);
1265 }
1266 static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1267   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1268   pVfs->xDlError(pVfs, nByte, zErrMsg);
1269 }
1270 static void (*asyncDlSym(
1271   sqlite3_vfs *pAsyncVfs,
1272   void *pHandle,
1273   const char *zSymbol
1274 ))(void){
1275   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1276   return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1277 }
1278 static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1279   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1280   pVfs->xDlClose(pVfs, pHandle);
1281 }
1282 static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1283   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1284   return pVfs->xRandomness(pVfs, nByte, zBufOut);
1285 }
1286 static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1287   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1288   return pVfs->xSleep(pVfs, nMicro);
1289 }
1290 static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1291   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1292   return pVfs->xCurrentTime(pVfs, pTimeOut);
1293 }
1294 
/*
** The sqlite3_vfs object for the asynchronous backend. The pAppData and
** mxPathname fields are zero here; sqlite3async_initialize() sets them
** from the parent VFS when the backend is installed.
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  SQLITEASYNC_VFSNAME,  /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1314 
1315 /*
1316 ** This procedure runs in a separate thread, reading messages off of the
1317 ** write queue and processing them one by one.
1318 **
1319 ** If async.writerHaltNow is true, then this procedure exits
1320 ** after processing a single message.
1321 **
1322 ** If async.writerHaltWhenIdle is true, then this procedure exits when
1323 ** the write queue is empty.
1324 **
1325 ** If both of the above variables are false, this procedure runs
1326 ** indefinately, waiting for operations to be added to the write queue
1327 ** and processing them in the order in which they arrive.
1328 **
1329 ** An artifical delay of async.ioDelay milliseconds is inserted before
1330 ** each write operation in order to simulate the effect of a slow disk.
1331 **
1332 ** Only one instance of this procedure may be running at a time.
1333 */
1334 static void asyncWriterThread(void){
1335   sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1336   AsyncWrite *p = 0;
1337   int rc = SQLITE_OK;
1338   int holdingMutex = 0;
1339 
1340   async_mutex_enter(ASYNC_MUTEX_WRITER);
1341 
1342   while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1343     int doNotFree = 0;
1344     sqlite3_file *pBase = 0;
1345 
1346     if( !holdingMutex ){
1347       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1348     }
1349     while( (p = async.pQueueFirst)==0 ){
1350       if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1351         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1352         break;
1353       }else{
1354         ASYNC_TRACE(("IDLE\n"));
1355         async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1356         ASYNC_TRACE(("WAKEUP\n"));
1357       }
1358     }
1359     if( p==0 ) break;
1360     holdingMutex = 1;
1361 
1362     /* Right now this thread is holding the mutex on the write-op queue.
1363     ** Variable 'p' points to the first entry in the write-op queue. In
1364     ** the general case, we hold on to the mutex for the entire body of
1365     ** the loop.
1366     **
1367     ** However in the cases enumerated below, we relinquish the mutex,
1368     ** perform the IO, and then re-request the mutex before removing 'p' from
1369     ** the head of the write-op queue. The idea is to increase concurrency with
1370     ** sqlite threads.
1371     **
1372     **     * An ASYNC_CLOSE operation.
1373     **     * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1374     **       the mutex, call the underlying xOpenExclusive() function, then
1375     **       re-aquire the mutex before seting the AsyncFile.pBaseRead
1376     **       variable.
1377     **     * ASYNC_SYNC and ASYNC_WRITE operations, if
1378     **       SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1379     **       file-handles are open for the particular file being "synced".
1380     */
1381     if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1382       p->op = ASYNC_NOOP;
1383     }
1384     if( p->pFileData ){
1385       pBase = p->pFileData->pBaseWrite;
1386       if(
1387         p->op==ASYNC_CLOSE ||
1388         p->op==ASYNC_OPENEXCLUSIVE ||
1389         (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1390       ){
1391         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1392         holdingMutex = 0;
1393       }
1394       if( !pBase->pMethods ){
1395         pBase = p->pFileData->pBaseRead;
1396       }
1397     }
1398 
1399     switch( p->op ){
1400       case ASYNC_NOOP:
1401         break;
1402 
1403       case ASYNC_WRITE:
1404         assert( pBase );
1405         ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1406                 p->pFileData->zName, p->nByte, p->iOffset));
1407         rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1408         break;
1409 
1410       case ASYNC_SYNC:
1411         assert( pBase );
1412         ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1413         rc = pBase->pMethods->xSync(pBase, p->nByte);
1414         break;
1415 
1416       case ASYNC_TRUNCATE:
1417         assert( pBase );
1418         ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1419                 p->pFileData->zName, p->iOffset));
1420         rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1421         break;
1422 
1423       case ASYNC_CLOSE: {
1424         AsyncFileData *pData = p->pFileData;
1425         ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1426         if( pData->pBaseWrite->pMethods ){
1427           pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1428         }
1429         if( pData->pBaseRead->pMethods ){
1430           pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1431         }
1432 
1433         /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1434         ** structures for this file. Obtain the async.lockMutex mutex
1435         ** before doing so.
1436         */
1437         async_mutex_enter(ASYNC_MUTEX_LOCK);
1438         rc = unlinkAsyncFile(pData);
1439         async_mutex_leave(ASYNC_MUTEX_LOCK);
1440 
1441         if( !holdingMutex ){
1442           async_mutex_enter(ASYNC_MUTEX_QUEUE);
1443           holdingMutex = 1;
1444         }
1445         assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1446         async.pQueueFirst = p->pNext;
1447         sqlite3_free(pData);
1448         doNotFree = 1;
1449         break;
1450       }
1451 
1452       case ASYNC_UNLOCK: {
1453         AsyncWrite *pIter;
1454         AsyncFileData *pData = p->pFileData;
1455         int eLock = p->nByte;
1456 
1457         /* When a file is locked by SQLite using the async backend, it is
1458         ** locked within the 'real' file-system synchronously. When it is
1459         ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1460         ** unlock the file asynchronously. The design of the async backend
1461         ** requires that the 'real' file-system file be locked from the
1462         ** time that SQLite first locks it (and probably reads from it)
1463         ** until all asynchronous write events that were scheduled before
1464         ** SQLite unlocked the file have been processed.
1465         **
1466         ** This is more complex if SQLite locks and unlocks the file multiple
1467         ** times in quick succession. For example, if SQLite does:
1468         **
1469         **   lock, write, unlock, lock, write, unlock
1470         **
1471         ** Each "lock" operation locks the file immediately. Each "write"
1472         ** and "unlock" operation adds an event to the event queue. If the
1473         ** second "lock" operation is performed before the first "unlock"
1474         ** operation has been processed asynchronously, then the first
1475         ** "unlock" cannot be safely processed as is, since this would mean
1476         ** the file was unlocked when the second "write" operation is
1477         ** processed. To work around this, when processing an ASYNC_UNLOCK
1478         ** operation, SQLite:
1479         **
1480         **   1) Unlocks the file to the minimum of the argument passed to
1481         **      the xUnlock() call and the current lock from SQLite's point
1482         **      of view, and
1483         **
1484         **   2) Only unlocks the file at all if this event is the last
1485         **      ASYNC_UNLOCK event on this file in the write-queue.
1486         */
1487         assert( holdingMutex==1 );
1488         assert( async.pQueueFirst==p );
1489         for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1490           if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1491         }
1492         if( !pIter ){
1493           async_mutex_enter(ASYNC_MUTEX_LOCK);
1494           pData->lock.eAsyncLock = MIN(
1495               pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1496           );
1497           assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1498           rc = getFileLock(pData->pLock);
1499           async_mutex_leave(ASYNC_MUTEX_LOCK);
1500         }
1501         break;
1502       }
1503 
1504       case ASYNC_DELETE:
1505         ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1506         rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1507         break;
1508 
1509       case ASYNC_OPENEXCLUSIVE: {
1510         int flags = (int)p->iOffset;
1511         AsyncFileData *pData = p->pFileData;
1512         ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1513         assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1514         rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1515         assert( holdingMutex==0 );
1516         async_mutex_enter(ASYNC_MUTEX_QUEUE);
1517         holdingMutex = 1;
1518         break;
1519       }
1520 
1521       default: assert(!"Illegal value for AsyncWrite.op");
1522     }
1523 
1524     /* If we didn't hang on to the mutex during the IO op, obtain it now
1525     ** so that the AsyncWrite structure can be safely removed from the
1526     ** global write-op queue.
1527     */
1528     if( !holdingMutex ){
1529       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1530       holdingMutex = 1;
1531     }
1532     /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1533     if( p==async.pQueueLast ){
1534       async.pQueueLast = 0;
1535     }
1536     if( !doNotFree ){
1537       assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1538       async.pQueueFirst = p->pNext;
1539       sqlite3_free(p);
1540     }
1541     assert( holdingMutex );
1542 
1543     /* An IO error has occurred. We cannot report the error back to the
1544     ** connection that requested the I/O since the error happened
1545     ** asynchronously.  The connection has already moved on.  There
1546     ** really is nobody to report the error to.
1547     **
1548     ** The file for which the error occurred may have been a database or
1549     ** journal file. Regardless, none of the currently queued operations
1550     ** associated with the same database should now be performed. Nor should
1551     ** any subsequently requested IO on either a database or journal file
1552     ** handle for the same database be accepted until the main database
1553     ** file handle has been closed and reopened.
1554     **
1555     ** Furthermore, no further IO should be queued or performed on any file
1556     ** handle associated with a database that may have been part of a
1557     ** multi-file transaction that included the database associated with
1558     ** the IO error (i.e. a database ATTACHed to the same handle at some
1559     ** point in time).
1560     */
1561     if( rc!=SQLITE_OK ){
1562       async.ioError = rc;
1563     }
1564 
1565     if( async.ioError && !async.pQueueFirst ){
1566       async_mutex_enter(ASYNC_MUTEX_LOCK);
1567       if( 0==async.pLock ){
1568         async.ioError = SQLITE_OK;
1569       }
1570       async_mutex_leave(ASYNC_MUTEX_LOCK);
1571     }
1572 
1573     /* Drop the queue mutex before continuing to the next write operation
1574     ** in order to give other threads a chance to work with the write queue.
1575     */
1576     if( !async.pQueueFirst || !async.ioError ){
1577       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1578       holdingMutex = 0;
1579       if( async.ioDelay>0 ){
1580         pVfs->xSleep(pVfs, async.ioDelay*1000);
1581       }else{
1582         async_sched_yield();
1583       }
1584     }
1585   }
1586 
1587   async_mutex_leave(ASYNC_MUTEX_WRITER);
1588   return;
1589 }
1590 
1591 /*
1592 ** Install the asynchronous VFS.
1593 */
1594 int sqlite3async_initialize(const char *zParent, int isDefault){
1595   int rc = SQLITE_OK;
1596   if( async_vfs.pAppData==0 ){
1597     sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1598     if( !pParent || async_os_initialize() ){
1599       rc = SQLITE_ERROR;
1600     }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1601       async_os_shutdown();
1602     }else{
1603       async_vfs.pAppData = (void *)pParent;
1604       async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1605     }
1606   }
1607   return rc;
1608 }
1609 
1610 /*
1611 ** Uninstall the asynchronous VFS.
1612 */
1613 void sqlite3async_shutdown(void){
1614   if( async_vfs.pAppData ){
1615     async_os_shutdown();
1616     sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1617     async_vfs.pAppData = 0;
1618   }
1619 }
1620 
/*
** Process events on the write-queue. This is a public wrapper that runs
** asyncWriterThread() in the calling thread and returns when it returns.
*/
void sqlite3async_run(void){
  asyncWriterThread();
  return;
}
1627 
1628 /*
1629 ** Control/configure the asynchronous IO system.
1630 */
1631 int sqlite3async_control(int op, ...){
1632   va_list ap;
1633   va_start(ap, op);
1634   switch( op ){
1635     case SQLITEASYNC_HALT: {
1636       int eWhen = va_arg(ap, int);
1637       if( eWhen!=SQLITEASYNC_HALT_NEVER
1638        && eWhen!=SQLITEASYNC_HALT_NOW
1639        && eWhen!=SQLITEASYNC_HALT_IDLE
1640       ){
1641         return SQLITE_MISUSE;
1642       }
1643       async.eHalt = eWhen;
1644       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1645       async_cond_signal(ASYNC_COND_QUEUE);
1646       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1647       break;
1648     }
1649 
1650     case SQLITEASYNC_DELAY: {
1651       int iDelay = va_arg(ap, int);
1652       if( iDelay<0 ){
1653         return SQLITE_MISUSE;
1654       }
1655       async.ioDelay = iDelay;
1656       break;
1657     }
1658 
1659     case SQLITEASYNC_LOCKFILES: {
1660       int bLock = va_arg(ap, int);
1661       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1662       if( async.nFile || async.pQueueFirst ){
1663         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1664         return SQLITE_MISUSE;
1665       }
1666       async.bLockFiles = bLock;
1667       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1668       break;
1669     }
1670 
1671     case SQLITEASYNC_GET_HALT: {
1672       int *peWhen = va_arg(ap, int *);
1673       *peWhen = async.eHalt;
1674       break;
1675     }
1676     case SQLITEASYNC_GET_DELAY: {
1677       int *piDelay = va_arg(ap, int *);
1678       *piDelay = async.ioDelay;
1679       break;
1680     }
1681     case SQLITEASYNC_GET_LOCKFILES: {
1682       int *piDelay = va_arg(ap, int *);
1683       *piDelay = async.bLockFiles;
1684       break;
1685     }
1686 
1687     default:
1688       return SQLITE_ERROR;
1689   }
1690   return SQLITE_OK;
1691 }
1692 
1693 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1694 
1695