xref: /sqlite-3.40.0/ext/async/sqlite3async.c (revision a3628d14)
1 /*
2 ** 2005 December 14
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** $Id: sqlite3async.c,v 1.5 2009/04/29 18:12:00 shane Exp $
14 **
15 ** This file contains the implementation of an asynchronous IO backend
16 ** for SQLite.
17 */
18 
19 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
20 
21 #include "sqlite3async.h"
22 #include "sqliteInt.h"
23 #include <stdarg.h>
24 #include <string.h>
25 #include <assert.h>
26 
/* Useful macros used in several places.
**
** Each definition is guarded with #ifndef: a header included above may
** already supply MIN/MAX (possibly with differently spelled parameters),
** and redefining a function-like macro with a different parameter list
** is a constraint violation in C.  Note that both macros evaluate their
** arguments more than once, so arguments must be free of side effects.
*/
#ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
#endif
30 
/* Forward references: one typedef for each structure defined further
** down in this file, so the structures can point at one another. */
typedef struct AsyncFile      AsyncFile;
typedef struct AsyncFileData  AsyncFileData;
typedef struct AsyncFileLock  AsyncFileLock;
typedef struct AsyncLock      AsyncLock;
typedef struct AsyncWrite     AsyncWrite;
37 
/* Debug tracing.  Only compiled in when NDEBUG is not defined.
**
** Set sqlite3async_trace to a non-zero value to have ASYNC_TRACE() write
** messages to stderr.  ASYNC_TRACE() takes one parenthesized printf-style
** argument list and is used as a statement, e.g.:
**
**     ASYNC_TRACE(("READ %d bytes\n", n));
**
** When NDEBUG is defined ASYNC_TRACE() expands to nothing.
*/
#ifndef NDEBUG
#include <stdio.h>
static int sqlite3async_trace = 0;
/* The do{}while(0) wrapper makes the macro a single statement so that it
** cannot capture the "else" of an enclosing if-statement. */
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  /* sqlite3_vmprintf() returns NULL if the message cannot be allocated.
  ** Handing NULL to "%s" is undefined, so drop the message instead. */
  if( z ){
    fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
    sqlite3_free(z);
  }
}
#else
# define ASYNC_TRACE(X)
#endif
55 
56 /*
57 ** THREAD SAFETY NOTES
58 **
59 ** Basic rules:
60 **
61 **     * Both read and write access to the global write-op queue must be
62 **       protected by the async.queueMutex. As are the async.ioError and
63 **       async.nFile variables.
64 **
65 **     * The async.pLock list and all AsyncLock and AsyncFileLock
66 **       structures must be protected by the async.lockMutex mutex.
67 **
68 **     * The file handles from the underlying system are not assumed to
69 **       be thread safe.
70 **
71 **     * See the last two paragraphs under "The Writer Thread" for
72 **       an assumption to do with file-handle synchronization by the Os.
73 **
74 ** Deadlock prevention:
75 **
**     There are three mutexes used by the system: the "writer" mutex,
**     the "queue" mutex and the "lock" mutex. Rules are:
**
**     * It is illegal to block on the writer mutex when any other mutex
**       is held, and
**
**     * It is illegal to block on the queue mutex when the lock mutex
**       is held.
**
**     i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
86 **
87 ** File system operations (invoked by SQLite thread):
88 **
89 **     xOpen
90 **     xDelete
91 **     xFileExists
92 **
93 ** File handle operations (invoked by SQLite thread):
94 **
95 **         asyncWrite, asyncClose, asyncTruncate, asyncSync
96 **
97 **     The operations above add an entry to the global write-op list. They
98 **     prepare the entry, acquire the async.queueMutex momentarily while
99 **     list pointers are  manipulated to insert the new entry, then release
100 **     the mutex and signal the writer thread to wake up in case it happens
101 **     to be asleep.
102 **
103 **
104 **         asyncRead, asyncFileSize.
105 **
**     Read operations. Both of these first read from the underlying
**     file, then adjust their result based on pending writes in the
**     write-op queue.   So async.queueMutex is held for the duration
**     of these operations to prevent other threads from changing the
**     queue in mid operation.
111 **
112 **
113 **         asyncLock, asyncUnlock, asyncCheckReservedLock
114 **
**     These primitives implement in-process locking using a list of
**     AsyncLock structures (async.pLock) keyed on the file name.  Files
**     are locked correctly for connections coming from the same process.
**     But other processes cannot see these in-process locks and will
**     therefore not honor them.
119 **
120 **
121 ** The writer thread:
122 **
**     The async.writerMutex is used to make sure that there is only
**     a single writer thread running at a time.
125 **
126 **     Inside the writer thread is a loop that works like this:
127 **
128 **         WHILE (write-op list is not empty)
129 **             Do IO operation at head of write-op list
130 **             Remove entry from head of write-op list
131 **         END WHILE
132 **
133 **     The async.queueMutex is always held during the <write-op list is
134 **     not empty> test, and when the entry is removed from the head
135 **     of the write-op list. Sometimes it is held for the interim
136 **     period (while the IO is performed), and sometimes it is
137 **     relinquished. It is relinquished if (a) the IO op is an
138 **     ASYNC_CLOSE or (b) when the file handle was opened, two of
139 **     the underlying systems handles were opened on the same
140 **     file-system entry.
141 **
142 **     If condition (b) above is true, then one file-handle
143 **     (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
144 **     file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
145 **     threads to perform write() operations. This means that read
146 **     operations are not blocked by asynchronous writes (although
147 **     asynchronous writes may still be blocked by reads).
148 **
149 **     This assumes that the OS keeps two handles open on the same file
150 **     properly in sync. That is, any read operation that starts after a
151 **     write operation on the same file system entry has completed returns
152 **     data consistent with the write. We also assume that if one thread
153 **     reads a file while another is writing it all bytes other than the
154 **     ones actually being written contain valid data.
155 **
156 **     If the above assumptions are not true, set the preprocessor symbol
157 **     SQLITE_ASYNC_TWO_FILEHANDLES to 0.
158 */
159 
160 
/* TESTONLY(X) expands to X while assert() is enabled and to nothing when
** NDEBUG is defined.  It wraps the bookkeeping statements that exist only
** to support assert() checks. */
#ifdef NDEBUG
# define TESTONLY( X )
#else
# define TESTONLY( X ) X
#endif
166 
167 /*
168 ** PORTING FUNCTIONS
169 **
170 ** There are two definitions of the following functions. One for pthreads
171 ** compatible systems and one for Win32. These functions isolate the OS
172 ** specific code required by each platform.
173 **
174 ** The system uses three mutexes and a single condition variable. To
175 ** block on a mutex, async_mutex_enter() is called. The parameter passed
176 ** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
177 ** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
178 ** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
179 ** called with a parameter identifying the mutex being unlocked. Mutexes
180 ** are not recursive - it is an error to call async_mutex_enter() to
181 ** lock a mutex that is already locked, or to call async_mutex_leave()
182 ** to unlock a mutex that is not currently locked.
183 **
184 ** The async_cond_wait() and async_cond_signal() functions are modelled
185 ** on the pthreads functions with similar names. The first parameter to
186 ** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
187 ** is called the mutex identified by the second parameter must be held.
188 ** The mutex is unlocked, and the calling thread simultaneously begins
189 ** waiting for the condition variable to be signalled by another thread.
190 ** After another thread signals the condition variable, the calling
191 ** thread stops waiting, locks mutex eMutex and returns. The
192 ** async_cond_signal() function is used to signal the condition variable.
193 ** It is assumed that the mutex used by the thread calling async_cond_wait()
194 ** is held by the caller of async_cond_signal() (otherwise there would be
195 ** a race condition).
196 **
197 ** It is guaranteed that no other thread will call async_cond_wait() when
198 ** there is already a thread waiting on the condition variable.
199 **
** The async_sched_yield() function is called to suggest to the operating
** system that it would be a good time to shift the current thread off the
** CPU. The system will still work if this function does nothing, but it
** might be marginally more efficient if it is implemented (sched_yield()
** for pthreads, Sleep(0) for win32).
205 */
206 static void async_mutex_enter(int eMutex);
207 static void async_mutex_leave(int eMutex);
208 static void async_cond_wait(int eCond, int eMutex);
209 static void async_cond_signal(int eCond);
210 static void async_sched_yield(void);
211 
212 /*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
216 **
217 ** For pthreads builds, both of these functions are no-ops. For win32,
218 ** they provide an opportunity to initialize and finalize the required
219 ** mutex and condition variables.
220 **
221 ** If async_os_initialize() returns other than zero, then the initialization
222 ** fails and SQLITE_ERROR is returned to the user.
223 */
224 static int async_os_initialize(void);
225 static void async_os_shutdown(void);
226 
227 /* Values for use as the 'eMutex' argument of the above functions. The
228 ** integer values assigned to these constants are important for assert()
229 ** statements that verify that mutexes are locked in the correct order.
230 ** Specifically, it is unsafe to try to lock mutex N while holding a lock
231 ** on mutex M if (M<=N).
232 */
#define ASYNC_MUTEX_LOCK    0   /* Guards async.pLock, AsyncLock, AsyncFileLock */
#define ASYNC_MUTEX_QUEUE   1   /* Guards the write-op queue, ioError, nFile */
#define ASYNC_MUTEX_WRITER  2   /* Allows only one writer thread at a time */

/* Values for use as the 'eCond' argument of the above functions.  There
** is only one condition variable. */
#define ASYNC_COND_QUEUE    0
239 
240 /*************************************************************************
241 ** Start of OS specific code.
242 */
243 #if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
244 
245 /* The following block contains the win32 specific code. */
246 
247 #define mutex_held(X) (GetCurrentThreadId()==primitives.aHolder[X])
248 
249 static struct AsyncPrimitives {
250   int isInit;
251   DWORD aHolder[3];
252   CRITICAL_SECTION aMutex[3];
253   HANDLE aCond[1];
254 } primitives = { 0 };
255 
256 static int async_os_initialize(void){
257   if( !primitives.isInit ){
258     primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
259     if( primitives.aCond[0]==NULL ){
260       return 1;
261     }
262     InitializeCriticalSection(&primitives.aMutex[0]);
263     InitializeCriticalSection(&primitives.aMutex[1]);
264     InitializeCriticalSection(&primitives.aMutex[2]);
265     primitives.isInit = 1;
266   }
267   return 0;
268 }
269 static void async_os_shutdown(void){
270   if( primitives.isInit ){
271     DeleteCriticalSection(&primitives.aMutex[0]);
272     DeleteCriticalSection(&primitives.aMutex[1]);
273     DeleteCriticalSection(&primitives.aMutex[2]);
274     CloseHandle(primitives.aCond[0]);
275     primitives.isInit = 0;
276   }
277 }
278 
279 /* The following block contains the Win32 specific code. */
280 static void async_mutex_enter(int eMutex){
281   assert( eMutex==0 || eMutex==1 || eMutex==2 );
282   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
283   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
284   assert( eMutex!=0 || (!mutex_held(0)) );
285   EnterCriticalSection(&primitives.aMutex[eMutex]);
286   TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
287 }
288 static void async_mutex_leave(int eMutex){
289   assert( eMutex==0 || eMutex==1 || eMutex==2 );
290   assert( mutex_held(eMutex) );
291   TESTONLY( primitives.aHolder[eMutex] = 0; )
292   LeaveCriticalSection(&primitives.aMutex[eMutex]);
293 }
294 static void async_cond_wait(int eCond, int eMutex){
295   ResetEvent(primitives.aCond[eCond]);
296   async_mutex_leave(eMutex);
297   WaitForSingleObject(primitives.aCond[eCond], INFINITE);
298   async_mutex_enter(eMutex);
299 }
300 static void async_cond_signal(int eCond){
301   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
302   SetEvent(primitives.aCond[eCond]);
303 }
/*
** Win32: hint to the scheduler that this is a good moment to run some
** other thread.  Implemented with a zero-length Sleep().
*/
static void async_sched_yield(void){
  Sleep(0);
}
308 #else
309 
310 /* The following block contains the pthreads specific code. */
311 #include <pthread.h>
312 #include <sched.h>
313 
/* True if the calling thread is recorded as the holder of mutex eMutex.
** aHolder[] is only updated inside TESTONLY(), so use this macro only
** within assert() statements. */
#define mutex_held(eMutex) \
  pthread_equal(primitives.aHolder[eMutex], pthread_self())
315 
/* pthreads: the primitives are statically initialized, so there is
** nothing to do.  Always returns 0 (success). */
static int async_os_initialize(void){
  return 0;
}
/* pthreads: nothing to release at shutdown. */
static void async_os_shutdown(void){
}
318 
/* pthreads: the three mutexes, the single condition variable and the
** record of which thread holds each mutex.  Everything is initialized
** statically. */
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];   /* Indexed by the ASYNC_MUTEX_xxx values */
  pthread_cond_t aCond[1];     /* Indexed by the ASYNC_COND_xxx values */
  pthread_t aHolder[3];        /* Thread holding each mutex (for asserts) */
} primitives = {
  /* aMutex[]  */
  { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER },
  /* aCond[]   */
  { PTHREAD_COND_INITIALIZER },
  /* aHolder[] */
  { 0, 0, 0 }
};
331 
332 static void async_mutex_enter(int eMutex){
333   assert( eMutex==0 || eMutex==1 || eMutex==2 );
334   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
335   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
336   assert( eMutex!=0 || (!mutex_held(0)) );
337   pthread_mutex_lock(&primitives.aMutex[eMutex]);
338   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
339 }
340 static void async_mutex_leave(int eMutex){
341   assert( eMutex==0 || eMutex==1 || eMutex==2 );
342   assert( mutex_held(eMutex) );
343   TESTONLY( primitives.aHolder[eMutex] = 0; )
344   pthread_mutex_unlock(&primitives.aMutex[eMutex]);
345 }
346 static void async_cond_wait(int eCond, int eMutex){
347   assert( eMutex==0 || eMutex==1 || eMutex==2 );
348   assert( mutex_held(eMutex) );
349   TESTONLY( primitives.aHolder[eMutex] = 0; )
350   pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
351   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
352 }
353 static void async_cond_signal(int eCond){
354   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
355   pthread_cond_signal(&primitives.aCond[eCond]);
356 }
/* pthreads: hint to the scheduler that another thread may run now.  The
** return value of sched_yield() is ignored. */
static void async_sched_yield(void){
  (void)sched_yield();
}
360 #endif
361 /*
362 ** End of OS specific code.
363 *************************************************************************/
364 
/* Assert that the calling thread holds mutex eMutex (an ASYNC_MUTEX_xxx
** value).  Compiles to nothing when NDEBUG is defined. */
#define assert_mutex_is_held(eMutex) assert( mutex_held(eMutex) )
366 
367 
/* SQLITE_ASYNC_TWO_FILEHANDLES defaults to 1.  Define it as 0 at compile
** time if the assumptions about two handles open on the same file,
** described under "The writer thread" above, do not hold. */
#if !defined(SQLITE_ASYNC_TWO_FILEHANDLES)
# define SQLITE_ASYNC_TWO_FILEHANDLES 1
#endif
372 
373 /*
374 ** State information is held in the static variable "async" defined
375 ** as the following structure.
376 **
377 ** Both async.ioError and async.nFile are protected by async.queueMutex.
378 */
379 static struct TestAsyncStaticData {
380   AsyncWrite *pQueueFirst;     /* Next write operation to be processed */
381   AsyncWrite *pQueueLast;      /* Last write operation on the list */
382   AsyncLock *pLock;            /* Linked list of all AsyncLock structures */
383   volatile int ioDelay;        /* Extra delay between write operations */
384   volatile int eHalt;          /* One of the SQLITEASYNC_HALT_XXX values */
385   volatile int bLockFiles;     /* Current value of "lockfiles" parameter */
386   int ioError;                 /* True if an IO error has occurred */
387   int nFile;                   /* Number of open files (from sqlite pov) */
388 } async = { 0,0,0,0,0,1,0,0 };
389 
/* Possible values of AsyncWrite.op.  The values double as indexes into
** azOpcodeName[], so the two must be kept in step. */
#define ASYNC_NOOP          0   /* No operation */
#define ASYNC_WRITE         1   /* Write nByte bytes of zBuf at iOffset */
#define ASYNC_SYNC          2   /* Sync the file; nByte holds the flags */
#define ASYNC_TRUNCATE      3   /* Truncate the file to iOffset bytes */
#define ASYNC_CLOSE         4   /* Close the file */
#define ASYNC_DELETE        5   /* Delete the file named by zBuf */
#define ASYNC_OPENEXCLUSIVE 6   /* Open the file named by zBuf */
#define ASYNC_UNLOCK        7   /* Unlock; nByte holds the lock level */
399 
/* Printable names of the ASYNC_xxx opcodes, indexed by opcode value.
** Used in trace messages.  Keep in step with the opcode macros. */
static const char *azOpcodeName[] = {
  "NOOP",        /* 0 */
  "WRITE",       /* 1 */
  "SYNC",        /* 2 */
  "TRUNCATE",    /* 3 */
  "CLOSE",       /* 4 */
  "DELETE",      /* 5 */
  "OPENEX",      /* 6 */
  "UNLOCK"       /* 7 */
};
406 
407 /*
408 ** Entries on the write-op queue are instances of the AsyncWrite
409 ** structure, defined here.
410 **
411 ** The interpretation of the iOffset and nByte variables varies depending
412 ** on the value of AsyncWrite.op:
413 **
414 ** ASYNC_NOOP:
415 **     No values used.
416 **
417 ** ASYNC_WRITE:
418 **     iOffset -> Offset in file to write to.
419 **     nByte   -> Number of bytes of data to write (pointed to by zBuf).
420 **
421 ** ASYNC_SYNC:
422 **     nByte   -> flags to pass to sqlite3OsSync().
423 **
424 ** ASYNC_TRUNCATE:
425 **     iOffset -> Size to truncate file to.
426 **     nByte   -> Unused.
427 **
428 ** ASYNC_CLOSE:
429 **     iOffset -> Unused.
430 **     nByte   -> Unused.
431 **
432 ** ASYNC_DELETE:
433 **     iOffset -> Contains the "syncDir" flag.
434 **     nByte   -> Number of bytes of zBuf points to (file name).
435 **
436 ** ASYNC_OPENEXCLUSIVE:
437 **     iOffset -> Value of "delflag".
438 **     nByte   -> Number of bytes of zBuf points to (file name).
439 **
440 ** ASYNC_UNLOCK:
441 **     nByte   -> Argument to sqlite3OsUnlock().
442 **
443 **
444 ** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
445 ** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
446 ** single blob, so is deleted when sqlite3_free() is called on the parent
447 ** structure.
448 */
449 struct AsyncWrite {
450   AsyncFileData *pFileData;    /* File to write data to or sync */
451   int op;                      /* One of ASYNC_xxx etc. */
452   sqlite_int64 iOffset;        /* See above */
453   int nByte;          /* See above */
454   char *zBuf;         /* Data to write to file (or NULL if op!=ASYNC_WRITE) */
455   AsyncWrite *pNext;  /* Next write operation (to any file) */
456 };
457 
458 /*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and stored in the linked list headed by
** async.pLock. Entries are looked up by the full pathname of the opened
** file (see findLock()).
463 **
464 ** AsyncLock.pList points to the head of a linked list of AsyncFileLock
465 ** structures, one for each handle currently open on the file.
466 **
467 ** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
468 ** not passed to the sqlite3OsOpen() call), or if async.bLockFiles is
469 ** false, variables AsyncLock.pFile and AsyncLock.eLock are never used.
470 ** Otherwise, pFile is a file handle opened on the file in question and
471 ** used to obtain the file-system locks required by database connections
472 ** within this process.
473 **
474 ** See comments above the asyncLock() function for more details on
475 ** the implementation of database locking used by this backend.
476 */
477 struct AsyncLock {
478   char *zFile;
479   int nFile;
480   sqlite3_file *pFile;
481   int eLock;
482   AsyncFileLock *pList;
483   AsyncLock *pNext;           /* Next in linked list headed by async.pLock */
484 };
485 
486 /*
487 ** An instance of the following structure is allocated along with each
488 ** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
489 ** file was opened with the SQLITE_OPEN_MAIN_DB.
490 */
491 struct AsyncFileLock {
492   int eLock;                /* Internally visible lock state (sqlite pov) */
493   int eAsyncLock;           /* Lock-state with write-queue unlock */
494   AsyncFileLock *pNext;
495 };
496 
497 /*
498 ** The AsyncFile structure is a subclass of sqlite3_file used for
499 ** asynchronous IO.
500 **
501 ** All of the actual data for the structure is stored in the structure
502 ** pointed to by AsyncFile.pData, which is allocated as part of the
503 ** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
504 ** lifetime of the AsyncFile structure is ended by the caller after OsClose()
505 ** is called, but the data in AsyncFileData may be required by the
506 ** writer thread after that point.
507 */
508 struct AsyncFile {
509   sqlite3_io_methods *pMethod;
510   AsyncFileData *pData;
511 };
512 struct AsyncFileData {
513   char *zName;               /* Underlying OS filename - used for debugging */
514   int nName;                 /* Number of characters in zName */
515   sqlite3_file *pBaseRead;   /* Read handle to the underlying Os file */
516   sqlite3_file *pBaseWrite;  /* Write handle to the underlying Os file */
517   AsyncFileLock lock;        /* Lock state for this handle */
518   AsyncLock *pLock;          /* AsyncLock object for this file system entry */
519   AsyncWrite closeOp;        /* Preallocated close operation */
520 };
521 
522 /*
523 ** Add an entry to the end of the global write-op list. pWrite should point
524 ** to an AsyncWrite structure allocated using sqlite3_malloc().  The writer
525 ** thread will call sqlite3_free() to free the structure after the specified
526 ** operation has been completed.
527 **
528 ** Once an AsyncWrite structure has been added to the list, it becomes the
529 ** property of the writer thread and must not be read or modified by the
530 ** caller.
531 */
532 static void addAsyncWrite(AsyncWrite *pWrite){
533   /* We must hold the queue mutex in order to modify the queue pointers */
534   if( pWrite->op!=ASYNC_UNLOCK ){
535     async_mutex_enter(ASYNC_MUTEX_QUEUE);
536   }
537 
538   /* Add the record to the end of the write-op queue */
539   assert( !pWrite->pNext );
540   if( async.pQueueLast ){
541     assert( async.pQueueFirst );
542     async.pQueueLast->pNext = pWrite;
543   }else{
544     async.pQueueFirst = pWrite;
545   }
546   async.pQueueLast = pWrite;
547   ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
548          pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
549 
550   if( pWrite->op==ASYNC_CLOSE ){
551     async.nFile--;
552   }
553 
554   /* The writer thread might have been idle because there was nothing
555   ** on the write-op queue for it to do.  So wake it up. */
556   async_cond_signal(ASYNC_COND_QUEUE);
557 
558   /* Drop the queue mutex */
559   if( pWrite->op!=ASYNC_UNLOCK ){
560     async_mutex_leave(ASYNC_MUTEX_QUEUE);
561   }
562 }
563 
564 /*
565 ** Increment async.nFile in a thread-safe manner.
566 */
567 static void incrOpenFileCount(void){
568   /* We must hold the queue mutex in order to modify async.nFile */
569   async_mutex_enter(ASYNC_MUTEX_QUEUE);
570   if( async.nFile==0 ){
571     async.ioError = SQLITE_OK;
572   }
573   async.nFile++;
574   async_mutex_leave(ASYNC_MUTEX_QUEUE);
575 }
576 
577 /*
578 ** This is a utility function to allocate and populate a new AsyncWrite
579 ** structure and insert it (via addAsyncWrite() ) into the global list.
580 */
581 static int addNewAsyncWrite(
582   AsyncFileData *pFileData,
583   int op,
584   sqlite3_int64 iOffset,
585   int nByte,
586   const char *zByte
587 ){
588   AsyncWrite *p;
589   if( op!=ASYNC_CLOSE && async.ioError ){
590     return async.ioError;
591   }
592   p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
593   if( !p ){
594     /* The upper layer does not expect operations like OsWrite() to
595     ** return SQLITE_NOMEM. This is partly because under normal conditions
596     ** SQLite is required to do rollback without calling malloc(). So
597     ** if malloc() fails here, treat it as an I/O error. The above
598     ** layer knows how to handle that.
599     */
600     return SQLITE_IOERR;
601   }
602   p->op = op;
603   p->iOffset = iOffset;
604   p->nByte = nByte;
605   p->pFileData = pFileData;
606   p->pNext = 0;
607   if( zByte ){
608     p->zBuf = (char *)&p[1];
609     memcpy(p->zBuf, zByte, nByte);
610   }else{
611     p->zBuf = 0;
612   }
613   addAsyncWrite(p);
614   return SQLITE_OK;
615 }
616 
617 /*
618 ** Close the file. This just adds an entry to the write-op list, the file is
619 ** not actually closed.
620 */
621 static int asyncClose(sqlite3_file *pFile){
622   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
623 
624   /* Unlock the file, if it is locked */
625   async_mutex_enter(ASYNC_MUTEX_LOCK);
626   p->lock.eLock = 0;
627   async_mutex_leave(ASYNC_MUTEX_LOCK);
628 
629   addAsyncWrite(&p->closeOp);
630   return SQLITE_OK;
631 }
632 
633 /*
634 ** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
635 ** writing to the underlying file, this function adds an entry to the end of
636 ** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
637 ** returned.
638 */
639 static int asyncWrite(
640   sqlite3_file *pFile,
641   const void *pBuf,
642   int amt,
643   sqlite3_int64 iOff
644 ){
645   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
646   return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
647 }
648 
649 /*
650 ** Read data from the file. First we read from the filesystem, then adjust
651 ** the contents of the buffer based on ASYNC_WRITE operations in the
652 ** write-op queue.
653 **
654 ** This method holds the mutex from start to finish.
655 */
656 static int asyncRead(
657   sqlite3_file *pFile,
658   void *zOut,
659   int iAmt,
660   sqlite3_int64 iOffset
661 ){
662   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
663   int rc = SQLITE_OK;
664   sqlite3_int64 filesize;
665   int nRead;
666   sqlite3_file *pBase = p->pBaseRead;
667 
668   /* Grab the write queue mutex for the duration of the call */
669   async_mutex_enter(ASYNC_MUTEX_QUEUE);
670 
671   /* If an I/O error has previously occurred in this virtual file
672   ** system, then all subsequent operations fail.
673   */
674   if( async.ioError!=SQLITE_OK ){
675     rc = async.ioError;
676     goto asyncread_out;
677   }
678 
679   if( pBase->pMethods ){
680     rc = pBase->pMethods->xFileSize(pBase, &filesize);
681     if( rc!=SQLITE_OK ){
682       goto asyncread_out;
683     }
684     nRead = (int)MIN(filesize - iOffset, iAmt);
685     if( nRead>0 ){
686       rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
687       ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
688     }
689   }
690 
691   if( rc==SQLITE_OK ){
692     AsyncWrite *pWrite;
693     char *zName = p->zName;
694 
695     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
696       if( pWrite->op==ASYNC_WRITE && (
697         (pWrite->pFileData==p) ||
698         (zName && pWrite->pFileData->zName==zName)
699       )){
700         sqlite3_int64 iBeginOut = (pWrite->iOffset-iOffset);
701         sqlite3_int64 iBeginIn = -iBeginOut;
702         int nCopy;
703 
704         if( iBeginIn<0 ) iBeginIn = 0;
705         if( iBeginOut<0 ) iBeginOut = 0;
706         nCopy = MIN(pWrite->nByte-iBeginIn, iAmt-iBeginOut);
707 
708         if( nCopy>0 ){
709           memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
710           ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
711         }
712       }
713     }
714   }
715 
716 asyncread_out:
717   async_mutex_leave(ASYNC_MUTEX_QUEUE);
718   return rc;
719 }
720 
721 /*
722 ** Truncate the file to nByte bytes in length. This just adds an entry to
723 ** the write-op list, no IO actually takes place.
724 */
725 static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
726   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
727   return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
728 }
729 
730 /*
731 ** Sync the file. This just adds an entry to the write-op list, the
732 ** sync() is done later by sqlite3_async_flush().
733 */
734 static int asyncSync(sqlite3_file *pFile, int flags){
735   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
736   return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
737 }
738 
739 /*
740 ** Read the size of the file. First we read the size of the file system
741 ** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
742 ** currently in the write-op list.
743 **
744 ** This method holds the mutex from start to finish.
745 */
746 int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
747   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
748   int rc = SQLITE_OK;
749   sqlite3_int64 s = 0;
750   sqlite3_file *pBase;
751 
752   async_mutex_enter(ASYNC_MUTEX_QUEUE);
753 
754   /* Read the filesystem size from the base file. If pBaseRead is NULL, this
755   ** means the file hasn't been opened yet. In this case all relevant data
756   ** must be in the write-op queue anyway, so we can omit reading from the
757   ** file-system.
758   */
759   pBase = p->pBaseRead;
760   if( pBase->pMethods ){
761     rc = pBase->pMethods->xFileSize(pBase, &s);
762   }
763 
764   if( rc==SQLITE_OK ){
765     AsyncWrite *pWrite;
766     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
767       if( pWrite->op==ASYNC_DELETE
768        && p->zName
769        && strcmp(p->zName, pWrite->zBuf)==0
770       ){
771         s = 0;
772       }else if( pWrite->pFileData && (
773           (pWrite->pFileData==p)
774        || (p->zName && pWrite->pFileData->zName==p->zName)
775       )){
776         switch( pWrite->op ){
777           case ASYNC_WRITE:
778             s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
779             break;
780           case ASYNC_TRUNCATE:
781             s = MIN(s, pWrite->iOffset);
782             break;
783         }
784       }
785     }
786     *piSize = s;
787   }
788   async_mutex_leave(ASYNC_MUTEX_QUEUE);
789   return rc;
790 }
791 
792 /*
793 ** Lock or unlock the actual file-system entry.
794 */
795 static int getFileLock(AsyncLock *pLock){
796   int rc = SQLITE_OK;
797   AsyncFileLock *pIter;
798   int eRequired = 0;
799 
800   if( pLock->pFile ){
801     for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
802       assert(pIter->eAsyncLock>=pIter->eLock);
803       if( pIter->eAsyncLock>eRequired ){
804         eRequired = pIter->eAsyncLock;
805         assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
806       }
807     }
808 
809     if( eRequired>pLock->eLock ){
810       rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
811       if( rc==SQLITE_OK ){
812         pLock->eLock = eRequired;
813       }
814     }
815     else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
816       rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
817       if( rc==SQLITE_OK ){
818         pLock->eLock = eRequired;
819       }
820     }
821   }
822 
823   return rc;
824 }
825 
826 /*
827 ** Return the AsyncLock structure from the global async.pLock list
828 ** associated with the file-system entry identified by path zName
829 ** (a string of nName bytes). If no such structure exists, return 0.
830 */
831 static AsyncLock *findLock(const char *zName, int nName){
832   AsyncLock *p = async.pLock;
833   while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
834     p = p->pNext;
835   }
836   return p;
837 }
838 
839 /*
840 ** The following two methods - asyncLock() and asyncUnlock() - are used
841 ** to obtain and release locks on database files opened with the
842 ** asynchronous backend.
843 */
844 static int asyncLock(sqlite3_file *pFile, int eLock){
845   int rc = SQLITE_OK;
846   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
847 
848   if( p->zName ){
849     async_mutex_enter(ASYNC_MUTEX_LOCK);
850     if( p->lock.eLock<eLock ){
851       AsyncLock *pLock = p->pLock;
852       AsyncFileLock *pIter;
853       assert(pLock && pLock->pList);
854       for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
855         if( pIter!=&p->lock && (
856           (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
857           (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
858           (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
859           (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
860         )){
861           rc = SQLITE_BUSY;
862         }
863       }
864       if( rc==SQLITE_OK ){
865         p->lock.eLock = eLock;
866         p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
867       }
868       assert(p->lock.eAsyncLock>=p->lock.eLock);
869       if( rc==SQLITE_OK ){
870         rc = getFileLock(pLock);
871       }
872     }
873     async_mutex_leave(ASYNC_MUTEX_LOCK);
874   }
875 
876   ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
877   return rc;
878 }
879 static int asyncUnlock(sqlite3_file *pFile, int eLock){
880   int rc = SQLITE_OK;
881   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
882   if( p->zName ){
883     AsyncFileLock *pLock = &p->lock;
884     async_mutex_enter(ASYNC_MUTEX_QUEUE);
885     async_mutex_enter(ASYNC_MUTEX_LOCK);
886     pLock->eLock = MIN(pLock->eLock, eLock);
887     rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
888     async_mutex_leave(ASYNC_MUTEX_LOCK);
889     async_mutex_leave(ASYNC_MUTEX_QUEUE);
890   }
891   return rc;
892 }
893 
894 /*
895 ** This function is called when the pager layer first opens a database file
896 ** and is checking for a hot-journal.
897 */
898 static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
899   int ret = 0;
900   AsyncFileLock *pIter;
901   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
902 
903   async_mutex_enter(ASYNC_MUTEX_LOCK);
904   for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
905     if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
906       ret = 1;
907       break;
908     }
909   }
910   async_mutex_leave(ASYNC_MUTEX_LOCK);
911 
912   ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
913   *pResOut = ret;
914   return SQLITE_OK;
915 }
916 
917 /*
918 ** sqlite3_file_control() implementation.
919 */
920 static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
921   switch( op ){
922     case SQLITE_FCNTL_LOCKSTATE: {
923       async_mutex_enter(ASYNC_MUTEX_LOCK);
924       *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
925       async_mutex_leave(ASYNC_MUTEX_LOCK);
926       return SQLITE_OK;
927     }
928   }
929   return SQLITE_ERROR;
930 }
931 
932 /*
933 ** Return the device characteristics and sector-size of the device. It
934 ** is tricky to implement these correctly, as this backend might
935 ** not have an open file handle at this point.
936 */
937 static int asyncSectorSize(sqlite3_file *pFile){
938   UNUSED_PARAMETER(pFile);
939   return 512;
940 }
941 static int asyncDeviceCharacteristics(sqlite3_file *pFile){
942   UNUSED_PARAMETER(pFile);
943   return 0;
944 }
945 
946 static int unlinkAsyncFile(AsyncFileData *pData){
947   AsyncFileLock **ppIter;
948   int rc = SQLITE_OK;
949 
950   if( pData->zName ){
951     AsyncLock *pLock = pData->pLock;
952     for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
953       if( (*ppIter)==&pData->lock ){
954         *ppIter = pData->lock.pNext;
955         break;
956       }
957     }
958     if( !pLock->pList ){
959       AsyncLock **pp;
960       if( pLock->pFile ){
961         pLock->pFile->pMethods->xClose(pLock->pFile);
962       }
963       for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
964       *pp = pLock->pNext;
965       sqlite3_free(pLock);
966     }else{
967       rc = getFileLock(pLock);
968     }
969   }
970 
971   return rc;
972 }
973 
974 /*
975 ** The parameter passed to this function is a copy of a 'flags' parameter
976 ** passed to this modules xOpen() method. This function returns true
977 ** if the file should be opened asynchronously, or false if it should
978 ** be opened immediately.
979 **
980 ** If the file is to be opened asynchronously, then asyncOpen() will add
981 ** an entry to the event queue and the file will not actually be opened
982 ** until the event is processed. Otherwise, the file is opened directly
983 ** by the caller.
984 */
985 static int doAsynchronousOpen(int flags){
986   return (flags&SQLITE_OPEN_CREATE) && (
987       (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
988       (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
989       (flags&SQLITE_OPEN_DELETEONCLOSE)
990   );
991 }
992 
993 /*
994 ** Open a file.
995 */
996 static int asyncOpen(
997   sqlite3_vfs *pAsyncVfs,
998   const char *zName,
999   sqlite3_file *pFile,
1000   int flags,
1001   int *pOutFlags
1002 ){
1003   static sqlite3_io_methods async_methods = {
1004     1,                               /* iVersion */
1005     asyncClose,                      /* xClose */
1006     asyncRead,                       /* xRead */
1007     asyncWrite,                      /* xWrite */
1008     asyncTruncate,                   /* xTruncate */
1009     asyncSync,                       /* xSync */
1010     asyncFileSize,                   /* xFileSize */
1011     asyncLock,                       /* xLock */
1012     asyncUnlock,                     /* xUnlock */
1013     asyncCheckReservedLock,          /* xCheckReservedLock */
1014     asyncFileControl,                /* xFileControl */
1015     asyncSectorSize,                 /* xSectorSize */
1016     asyncDeviceCharacteristics       /* xDeviceCharacteristics */
1017   };
1018 
1019   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1020   AsyncFile *p = (AsyncFile *)pFile;
1021   int nName = 0;
1022   int rc = SQLITE_OK;
1023   int nByte;
1024   AsyncFileData *pData;
1025   AsyncLock *pLock = 0;
1026   char *z;
1027   int isAsyncOpen = doAsynchronousOpen(flags);
1028 
1029   /* If zName is NULL, then the upper layer is requesting an anonymous file */
1030   if( zName ){
1031     nName = (int)strlen(zName)+1;
1032   }
1033 
1034   nByte = (
1035     sizeof(AsyncFileData) +        /* AsyncFileData structure */
1036     2 * pVfs->szOsFile +           /* AsyncFileData.pBaseRead and pBaseWrite */
1037     nName                          /* AsyncFileData.zName */
1038   );
1039   z = sqlite3_malloc(nByte);
1040   if( !z ){
1041     return SQLITE_NOMEM;
1042   }
1043   memset(z, 0, nByte);
1044   pData = (AsyncFileData*)z;
1045   z += sizeof(pData[0]);
1046   pData->pBaseRead = (sqlite3_file*)z;
1047   z += pVfs->szOsFile;
1048   pData->pBaseWrite = (sqlite3_file*)z;
1049   pData->closeOp.pFileData = pData;
1050   pData->closeOp.op = ASYNC_CLOSE;
1051 
1052   if( zName ){
1053     z += pVfs->szOsFile;
1054     pData->zName = z;
1055     pData->nName = nName;
1056     memcpy(pData->zName, zName, nName);
1057   }
1058 
1059   if( !isAsyncOpen ){
1060     int flagsout;
1061     rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
1062     if( rc==SQLITE_OK && (flagsout&SQLITE_OPEN_READWRITE) ){
1063       rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1064     }
1065     if( pOutFlags ){
1066       *pOutFlags = flagsout;
1067     }
1068   }
1069 
1070   async_mutex_enter(ASYNC_MUTEX_LOCK);
1071 
1072   if( zName && rc==SQLITE_OK ){
1073     pLock = findLock(pData->zName, pData->nName);
1074     if( !pLock ){
1075       int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1076       pLock = (AsyncLock *)sqlite3_malloc(nByte);
1077       if( pLock ){
1078         memset(pLock, 0, nByte);
1079         if( async.bLockFiles && (flags&SQLITE_OPEN_MAIN_DB) ){
1080           pLock->pFile = (sqlite3_file *)&pLock[1];
1081           rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1082           if( rc!=SQLITE_OK ){
1083             sqlite3_free(pLock);
1084             pLock = 0;
1085           }
1086         }
1087         if( pLock ){
1088           pLock->nFile = pData->nName;
1089           pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1090           memcpy(pLock->zFile, pData->zName, pLock->nFile);
1091           pLock->pNext = async.pLock;
1092           async.pLock = pLock;
1093         }
1094       }else{
1095         rc = SQLITE_NOMEM;
1096       }
1097     }
1098   }
1099 
1100   if( rc==SQLITE_OK ){
1101     p->pMethod = &async_methods;
1102     p->pData = pData;
1103 
1104     /* Link AsyncFileData.lock into the linked list of
1105     ** AsyncFileLock structures for this file.
1106     */
1107     if( zName ){
1108       pData->lock.pNext = pLock->pList;
1109       pLock->pList = &pData->lock;
1110       pData->zName = pLock->zFile;
1111     }
1112   }else{
1113     if( pData->pBaseRead->pMethods ){
1114       pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1115     }
1116     if( pData->pBaseWrite->pMethods ){
1117       pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1118     }
1119     sqlite3_free(pData);
1120   }
1121 
1122   async_mutex_leave(ASYNC_MUTEX_LOCK);
1123 
1124   if( rc==SQLITE_OK ){
1125     incrOpenFileCount();
1126     pData->pLock = pLock;
1127   }
1128 
1129   if( rc==SQLITE_OK && isAsyncOpen ){
1130     rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1131     if( rc==SQLITE_OK ){
1132       if( pOutFlags ) *pOutFlags = flags;
1133     }else{
1134       async_mutex_enter(ASYNC_MUTEX_LOCK);
1135       unlinkAsyncFile(pData);
1136       async_mutex_leave(ASYNC_MUTEX_LOCK);
1137       sqlite3_free(pData);
1138     }
1139   }
1140   if( rc!=SQLITE_OK ){
1141     p->pMethod = 0;
1142   }
1143   return rc;
1144 }
1145 
1146 /*
1147 ** Implementation of sqlite3OsDelete. Add an entry to the end of the
1148 ** write-op queue to perform the delete.
1149 */
1150 static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1151   UNUSED_PARAMETER(pAsyncVfs);
1152   return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, (int)strlen(z)+1, z);
1153 }
1154 
1155 /*
1156 ** Implementation of sqlite3OsAccess. This method holds the mutex from
1157 ** start to finish.
1158 */
1159 static int asyncAccess(
1160   sqlite3_vfs *pAsyncVfs,
1161   const char *zName,
1162   int flags,
1163   int *pResOut
1164 ){
1165   int rc;
1166   int ret;
1167   AsyncWrite *p;
1168   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1169 
1170   assert(flags==SQLITE_ACCESS_READWRITE
1171       || flags==SQLITE_ACCESS_READ
1172       || flags==SQLITE_ACCESS_EXISTS
1173   );
1174 
1175   async_mutex_enter(ASYNC_MUTEX_QUEUE);
1176   rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1177   if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1178     for(p=async.pQueueFirst; p; p = p->pNext){
1179       if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1180         ret = 0;
1181       }else if( p->op==ASYNC_OPENEXCLUSIVE
1182              && p->pFileData->zName
1183              && 0==strcmp(p->pFileData->zName, zName)
1184       ){
1185         ret = 1;
1186       }
1187     }
1188   }
1189   ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1190     flags==SQLITE_ACCESS_READWRITE?"read-write":
1191     flags==SQLITE_ACCESS_READ?"read":"exists"
1192     , zName, ret)
1193   );
1194   async_mutex_leave(ASYNC_MUTEX_QUEUE);
1195   *pResOut = ret;
1196   return rc;
1197 }
1198 
1199 /*
1200 ** Fill in zPathOut with the full path to the file identified by zPath.
1201 */
1202 static int asyncFullPathname(
1203   sqlite3_vfs *pAsyncVfs,
1204   const char *zPath,
1205   int nPathOut,
1206   char *zPathOut
1207 ){
1208   int rc;
1209   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1210   rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1211 
1212   /* Because of the way intra-process file locking works, this backend
1213   ** needs to return a canonical path. The following block assumes the
1214   ** file-system uses unix style paths.
1215   */
1216   if( rc==SQLITE_OK ){
1217     int i, j;
1218     int n = nPathOut;
1219     char *z = zPathOut;
1220     while( n>1 && z[n-1]=='/' ){ n--; }
1221     for(i=j=0; i<n; i++){
1222       if( z[i]=='/' ){
1223         if( z[i+1]=='/' ) continue;
1224         if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1225           i += 1;
1226           continue;
1227         }
1228         if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1229           while( j>0 && z[j-1]!='/' ){ j--; }
1230           if( j>0 ){ j--; }
1231           i += 2;
1232           continue;
1233         }
1234       }
1235       z[j++] = z[i];
1236     }
1237     z[j] = 0;
1238   }
1239 
1240   return rc;
1241 }
1242 static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1243   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1244   return pVfs->xDlOpen(pVfs, zPath);
1245 }
1246 static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1247   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1248   pVfs->xDlError(pVfs, nByte, zErrMsg);
1249 }
1250 static void (*asyncDlSym(
1251   sqlite3_vfs *pAsyncVfs,
1252   void *pHandle,
1253   const char *zSymbol
1254 ))(void){
1255   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1256   return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1257 }
1258 static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1259   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1260   pVfs->xDlClose(pVfs, pHandle);
1261 }
1262 static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1263   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1264   return pVfs->xRandomness(pVfs, nByte, zBufOut);
1265 }
1266 static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1267   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1268   return pVfs->xSleep(pVfs, nMicro);
1269 }
1270 static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1271   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1272   return pVfs->xCurrentTime(pVfs, pTimeOut);
1273 }
1274 
/*
** The sqlite3_vfs object for the asynchronous backend. pAppData and
** mxPathname start out as 0 and are filled in from the parent VFS by
** sqlite3async_initialize(); a non-zero pAppData doubles as the
** "installed" flag.
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  SQLITEASYNC_VFSNAME,  /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1294 
1295 /*
1296 ** This procedure runs in a separate thread, reading messages off of the
1297 ** write queue and processing them one by one.
1298 **
1299 ** If async.writerHaltNow is true, then this procedure exits
1300 ** after processing a single message.
1301 **
1302 ** If async.writerHaltWhenIdle is true, then this procedure exits when
1303 ** the write queue is empty.
1304 **
1305 ** If both of the above variables are false, this procedure runs
1306 ** indefinately, waiting for operations to be added to the write queue
1307 ** and processing them in the order in which they arrive.
1308 **
1309 ** An artifical delay of async.ioDelay milliseconds is inserted before
1310 ** each write operation in order to simulate the effect of a slow disk.
1311 **
1312 ** Only one instance of this procedure may be running at a time.
1313 */
1314 static void asyncWriterThread(void){
1315   sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1316   AsyncWrite *p = 0;
1317   int rc = SQLITE_OK;
1318   int holdingMutex = 0;
1319 
1320   async_mutex_enter(ASYNC_MUTEX_WRITER);
1321 
1322   while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1323     int doNotFree = 0;
1324     sqlite3_file *pBase = 0;
1325 
1326     if( !holdingMutex ){
1327       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1328     }
1329     while( (p = async.pQueueFirst)==0 ){
1330       if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1331         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1332         break;
1333       }else{
1334         ASYNC_TRACE(("IDLE\n"));
1335         async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1336         ASYNC_TRACE(("WAKEUP\n"));
1337       }
1338     }
1339     if( p==0 ) break;
1340     holdingMutex = 1;
1341 
1342     /* Right now this thread is holding the mutex on the write-op queue.
1343     ** Variable 'p' points to the first entry in the write-op queue. In
1344     ** the general case, we hold on to the mutex for the entire body of
1345     ** the loop.
1346     **
1347     ** However in the cases enumerated below, we relinquish the mutex,
1348     ** perform the IO, and then re-request the mutex before removing 'p' from
1349     ** the head of the write-op queue. The idea is to increase concurrency with
1350     ** sqlite threads.
1351     **
1352     **     * An ASYNC_CLOSE operation.
1353     **     * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1354     **       the mutex, call the underlying xOpenExclusive() function, then
1355     **       re-aquire the mutex before seting the AsyncFile.pBaseRead
1356     **       variable.
1357     **     * ASYNC_SYNC and ASYNC_WRITE operations, if
1358     **       SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1359     **       file-handles are open for the particular file being "synced".
1360     */
1361     if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1362       p->op = ASYNC_NOOP;
1363     }
1364     if( p->pFileData ){
1365       pBase = p->pFileData->pBaseWrite;
1366       if(
1367         p->op==ASYNC_CLOSE ||
1368         p->op==ASYNC_OPENEXCLUSIVE ||
1369         (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1370       ){
1371         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1372         holdingMutex = 0;
1373       }
1374       if( !pBase->pMethods ){
1375         pBase = p->pFileData->pBaseRead;
1376       }
1377     }
1378 
1379     switch( p->op ){
1380       case ASYNC_NOOP:
1381         break;
1382 
1383       case ASYNC_WRITE:
1384         assert( pBase );
1385         ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1386                 p->pFileData->zName, p->nByte, p->iOffset));
1387         rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1388         break;
1389 
1390       case ASYNC_SYNC:
1391         assert( pBase );
1392         ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1393         rc = pBase->pMethods->xSync(pBase, p->nByte);
1394         break;
1395 
1396       case ASYNC_TRUNCATE:
1397         assert( pBase );
1398         ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1399                 p->pFileData->zName, p->iOffset));
1400         rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1401         break;
1402 
1403       case ASYNC_CLOSE: {
1404         AsyncFileData *pData = p->pFileData;
1405         ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1406         if( pData->pBaseWrite->pMethods ){
1407           pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1408         }
1409         if( pData->pBaseRead->pMethods ){
1410           pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1411         }
1412 
1413         /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1414         ** structures for this file. Obtain the async.lockMutex mutex
1415         ** before doing so.
1416         */
1417         async_mutex_enter(ASYNC_MUTEX_LOCK);
1418         rc = unlinkAsyncFile(pData);
1419         async_mutex_leave(ASYNC_MUTEX_LOCK);
1420 
1421         if( !holdingMutex ){
1422           async_mutex_enter(ASYNC_MUTEX_QUEUE);
1423           holdingMutex = 1;
1424         }
1425         assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1426         async.pQueueFirst = p->pNext;
1427         sqlite3_free(pData);
1428         doNotFree = 1;
1429         break;
1430       }
1431 
1432       case ASYNC_UNLOCK: {
1433         AsyncWrite *pIter;
1434         AsyncFileData *pData = p->pFileData;
1435         int eLock = p->nByte;
1436 
1437         /* When a file is locked by SQLite using the async backend, it is
1438         ** locked within the 'real' file-system synchronously. When it is
1439         ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1440         ** unlock the file asynchronously. The design of the async backend
1441         ** requires that the 'real' file-system file be locked from the
1442         ** time that SQLite first locks it (and probably reads from it)
1443         ** until all asynchronous write events that were scheduled before
1444         ** SQLite unlocked the file have been processed.
1445         **
1446         ** This is more complex if SQLite locks and unlocks the file multiple
1447         ** times in quick succession. For example, if SQLite does:
1448         **
1449         **   lock, write, unlock, lock, write, unlock
1450         **
1451         ** Each "lock" operation locks the file immediately. Each "write"
1452         ** and "unlock" operation adds an event to the event queue. If the
1453         ** second "lock" operation is performed before the first "unlock"
1454         ** operation has been processed asynchronously, then the first
1455         ** "unlock" cannot be safely processed as is, since this would mean
1456         ** the file was unlocked when the second "write" operation is
1457         ** processed. To work around this, when processing an ASYNC_UNLOCK
1458         ** operation, SQLite:
1459         **
1460         **   1) Unlocks the file to the minimum of the argument passed to
1461         **      the xUnlock() call and the current lock from SQLite's point
1462         **      of view, and
1463         **
1464         **   2) Only unlocks the file at all if this event is the last
1465         **      ASYNC_UNLOCK event on this file in the write-queue.
1466         */
1467         assert( holdingMutex==1 );
1468         assert( async.pQueueFirst==p );
1469         for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1470           if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1471         }
1472         if( !pIter ){
1473           async_mutex_enter(ASYNC_MUTEX_LOCK);
1474           pData->lock.eAsyncLock = MIN(
1475               pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1476           );
1477           assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1478           rc = getFileLock(pData->pLock);
1479           async_mutex_leave(ASYNC_MUTEX_LOCK);
1480         }
1481         break;
1482       }
1483 
1484       case ASYNC_DELETE:
1485         ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1486         rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1487         break;
1488 
1489       case ASYNC_OPENEXCLUSIVE: {
1490         int flags = (int)p->iOffset;
1491         AsyncFileData *pData = p->pFileData;
1492         ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1493         assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1494         rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1495         assert( holdingMutex==0 );
1496         async_mutex_enter(ASYNC_MUTEX_QUEUE);
1497         holdingMutex = 1;
1498         break;
1499       }
1500 
1501       default: assert(!"Illegal value for AsyncWrite.op");
1502     }
1503 
1504     /* If we didn't hang on to the mutex during the IO op, obtain it now
1505     ** so that the AsyncWrite structure can be safely removed from the
1506     ** global write-op queue.
1507     */
1508     if( !holdingMutex ){
1509       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1510       holdingMutex = 1;
1511     }
1512     /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1513     if( p==async.pQueueLast ){
1514       async.pQueueLast = 0;
1515     }
1516     if( !doNotFree ){
1517       assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1518       async.pQueueFirst = p->pNext;
1519       sqlite3_free(p);
1520     }
1521     assert( holdingMutex );
1522 
1523     /* An IO error has occurred. We cannot report the error back to the
1524     ** connection that requested the I/O since the error happened
1525     ** asynchronously.  The connection has already moved on.  There
1526     ** really is nobody to report the error to.
1527     **
1528     ** The file for which the error occurred may have been a database or
1529     ** journal file. Regardless, none of the currently queued operations
1530     ** associated with the same database should now be performed. Nor should
1531     ** any subsequently requested IO on either a database or journal file
1532     ** handle for the same database be accepted until the main database
1533     ** file handle has been closed and reopened.
1534     **
1535     ** Furthermore, no further IO should be queued or performed on any file
1536     ** handle associated with a database that may have been part of a
1537     ** multi-file transaction that included the database associated with
1538     ** the IO error (i.e. a database ATTACHed to the same handle at some
1539     ** point in time).
1540     */
1541     if( rc!=SQLITE_OK ){
1542       async.ioError = rc;
1543     }
1544 
1545     if( async.ioError && !async.pQueueFirst ){
1546       async_mutex_enter(ASYNC_MUTEX_LOCK);
1547       if( 0==async.pLock ){
1548         async.ioError = SQLITE_OK;
1549       }
1550       async_mutex_leave(ASYNC_MUTEX_LOCK);
1551     }
1552 
1553     /* Drop the queue mutex before continuing to the next write operation
1554     ** in order to give other threads a chance to work with the write queue.
1555     */
1556     if( !async.pQueueFirst || !async.ioError ){
1557       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1558       holdingMutex = 0;
1559       if( async.ioDelay>0 ){
1560         pVfs->xSleep(pVfs, async.ioDelay*1000);
1561       }else{
1562         async_sched_yield();
1563       }
1564     }
1565   }
1566 
1567   async_mutex_leave(ASYNC_MUTEX_WRITER);
1568   return;
1569 }
1570 
1571 /*
1572 ** Install the asynchronous VFS.
1573 */
1574 int sqlite3async_initialize(const char *zParent, int isDefault){
1575   int rc = SQLITE_OK;
1576   if( async_vfs.pAppData==0 ){
1577     sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1578     if( !pParent || async_os_initialize() ){
1579       rc = SQLITE_ERROR;
1580     }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1581       async_os_shutdown();
1582     }else{
1583       async_vfs.pAppData = (void *)pParent;
1584       async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1585     }
1586   }
1587   return rc;
1588 }
1589 
1590 /*
1591 ** Uninstall the asynchronous VFS.
1592 */
1593 void sqlite3async_shutdown(void){
1594   if( async_vfs.pAppData ){
1595     async_os_shutdown();
1596     sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1597     async_vfs.pAppData = 0;
1598   }
1599 }
1600 
/*
** Process events on the write-queue by running the writer loop in the
** calling thread. Returns when asyncWriterThread() returns.
*/
void sqlite3async_run(void){
  asyncWriterThread();
  return;
}
1607 
1608 /*
1609 ** Control/configure the asynchronous IO system.
1610 */
1611 int sqlite3async_control(int op, ...){
1612   va_list ap;
1613   va_start(ap, op);
1614   switch( op ){
1615     case SQLITEASYNC_HALT: {
1616       int eWhen = va_arg(ap, int);
1617       if( eWhen!=SQLITEASYNC_HALT_NEVER
1618        && eWhen!=SQLITEASYNC_HALT_NOW
1619        && eWhen!=SQLITEASYNC_HALT_IDLE
1620       ){
1621         return SQLITE_MISUSE;
1622       }
1623       async.eHalt = eWhen;
1624       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1625       async_cond_signal(ASYNC_COND_QUEUE);
1626       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1627       break;
1628     }
1629 
1630     case SQLITEASYNC_DELAY: {
1631       int iDelay = va_arg(ap, int);
1632       if( iDelay<0 ){
1633         return SQLITE_MISUSE;
1634       }
1635       async.ioDelay = iDelay;
1636       break;
1637     }
1638 
1639     case SQLITEASYNC_LOCKFILES: {
1640       int bLock = va_arg(ap, int);
1641       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1642       if( async.nFile || async.pQueueFirst ){
1643         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1644         return SQLITE_MISUSE;
1645       }
1646       async.bLockFiles = bLock;
1647       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1648       break;
1649     }
1650 
1651     case SQLITEASYNC_GET_HALT: {
1652       int *peWhen = va_arg(ap, int *);
1653       *peWhen = async.eHalt;
1654       break;
1655     }
1656     case SQLITEASYNC_GET_DELAY: {
1657       int *piDelay = va_arg(ap, int *);
1658       *piDelay = async.ioDelay;
1659       break;
1660     }
1661     case SQLITEASYNC_GET_LOCKFILES: {
1662       int *piDelay = va_arg(ap, int *);
1663       *piDelay = async.bLockFiles;
1664       break;
1665     }
1666 
1667     default:
1668       return SQLITE_ERROR;
1669   }
1670   return SQLITE_OK;
1671 }
1672 
1673 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1674 
1675