xref: /sqlite-3.40.0/ext/async/sqlite3async.c (revision a3f06598)
1 /*
2 ** 2005 December 14
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** $Id: sqlite3async.c,v 1.1 2009/04/23 14:58:40 danielk1977 Exp $
14 **
15 ** This file contains an example implementation of an asynchronous IO
16 ** backend for SQLite.
17 **
18 ** WHAT IS ASYNCHRONOUS I/O?
19 **
20 ** With asynchronous I/O, write requests are handled by a separate thread
21 ** running in the background.  This means that the thread that initiates
22 ** a database write does not have to wait for (sometimes slow) disk I/O
23 ** to occur.  The write seems to happen very quickly, though in reality
24 ** it is happening at its usual slow pace in the background.
25 **
26 ** Asynchronous I/O appears to give better responsiveness, but at a price.
27 ** You lose the Durable property.  With the default I/O backend of SQLite,
28 ** once a write completes, you know that the information you wrote is
29 ** safely on disk.  With the asynchronous I/O, this is not the case.  If
30 ** your program crashes or if a power loss occurs after the database
31 ** write but before the asynchronous write thread has completed, then the
32 ** database change might never make it to disk and the next user of the
33 ** database might not see your change.
34 **
35 ** You lose Durability with asynchronous I/O, but you still retain the
36 ** other parts of ACID:  Atomic,  Consistent, and Isolated.  Many
** applications get along fine without the Durability.
38 **
39 ** HOW IT WORKS
40 **
41 ** Asynchronous I/O works by creating a special SQLite "vfs" structure
42 ** and registering it with sqlite3_vfs_register(). When files opened via
43 ** this vfs are written to (using sqlite3OsWrite()), the data is not
44 ** written directly to disk, but is placed in the "write-queue" to be
45 ** handled by the background thread.
46 **
47 ** When files opened with the asynchronous vfs are read from
48 ** (using sqlite3OsRead()), the data is read from the file on
49 ** disk and the write-queue, so that from the point of view of
50 ** the vfs reader the OsWrite() appears to have already completed.
51 **
52 ** The special vfs is registered (and unregistered) by calls to
53 ** function asyncEnable() (see below).
54 **
55 ** LIMITATIONS
56 **
57 ** This demonstration code is deliberately kept simple in order to keep
58 ** the main ideas clear and easy to understand.  Real applications that
59 ** want to do asynchronous I/O might want to add additional capabilities.
60 ** For example, in this demonstration if writes are happening at a steady
61 ** stream that exceeds the I/O capability of the background writer thread,
62 ** the queue of pending write operations will grow without bound until we
63 ** run out of memory.  Users of this technique may want to keep track of
64 ** the quantity of pending writes and stop accepting new write requests
65 ** when the buffer gets to be too big.
66 **
67 ** LOCKING + CONCURRENCY
68 **
69 ** Multiple connections from within a single process that use this
70 ** implementation of asynchronous IO may access a single database
71 ** file concurrently. From the point of view of the user, if all
72 ** connections are from within a single process, there is no difference
73 ** between the concurrency offered by "normal" SQLite and SQLite
74 ** using the asynchronous backend.
75 **
76 ** If connections from within multiple processes may access the
77 ** database file, the ENABLE_FILE_LOCKING symbol (see below) must be
78 ** defined. If it is not defined, then no locks are established on
79 ** the database file. In this case, if multiple processes access
80 ** the database file, corruption will quickly result.
81 **
82 ** If ENABLE_FILE_LOCKING is defined (the default), then connections
83 ** from within multiple processes may access a single database file
84 ** without risking corruption. However concurrency is reduced as
85 ** follows:
86 **
87 **   * When a connection using asynchronous IO begins a database
88 **     transaction, the database is locked immediately. However the
89 **     lock is not released until after all relevant operations
90 **     in the write-queue have been flushed to disk. This means
91 **     (for example) that the database may remain locked for some
92 **     time after a "COMMIT" or "ROLLBACK" is issued.
93 **
94 **   * If an application using asynchronous IO executes transactions
95 **     in quick succession, other database users may be effectively
96 **     locked out of the database. This is because when a BEGIN
97 **     is executed, a database lock is established immediately. But
98 **     when the corresponding COMMIT or ROLLBACK occurs, the lock
99 **     is not released until the relevant part of the write-queue
100 **     has been flushed through. As a result, if a COMMIT is followed
101 **     by a BEGIN before the write-queue is flushed through, the database
**     is never unlocked, preventing other processes from accessing
103 **     the database.
104 **
105 ** Defining ENABLE_FILE_LOCKING when using an NFS or other remote
106 ** file-system may slow things down, as synchronous round-trips to the
107 ** server may be required to establish database file locks.
108 */
109 
110 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
111 
112 #include "sqlite3async.h"
113 
114 #define ENABLE_FILE_LOCKING
115 
116 #ifndef SQLITE_AMALGAMATION
117 # include "sqliteInt.h"
118 # include <assert.h>
119 # include <string.h>
120 #endif
121 
/* Two-argument minimum and maximum, used in several places.  Each macro
** expands its arguments more than once, so the arguments must be free of
** side effects. */
#define MIN(x,y) (!((x)<(y)) ? (y) : (x))
#define MAX(x,y) (!((x)>(y)) ? (y) : (x))
125 
/* Forward references */
typedef struct AsyncWrite AsyncWrite;         /* Entry on the write-op queue */
typedef struct AsyncFile AsyncFile;           /* sqlite3_file subclass */
typedef struct AsyncFileData AsyncFileData;   /* Data behind an AsyncFile */
typedef struct AsyncFileLock AsyncFileLock;   /* Lock state of one handle */
typedef struct AsyncLock AsyncLock;           /* One per distinct open file */
132 
/*
** Debug tracing.  Set sqlite3async_trace to a non-zero value to have
** ASYNC_TRACE() print messages to stderr.
**
** ASYNC_TRACE(X) takes a parenthesized printf-style argument list, e.g.
** ASYNC_TRACE(("PUSH %p\n", p)).  The expansion is wrapped in
** do{ }while(0) so that it is a single statement and cannot capture an
** "else" that follows it.
*/
static int sqlite3async_trace = 0;
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)

/*
** Format a message with sqlite3_vmprintf() and write it to stderr.
**
** If sqlite3_vmprintf() returns NULL the message is dropped: a NULL
** pointer must not be handed to the "%s" conversion of fprintf().
*/
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  if( z ){
    fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
    sqlite3_free(z);
  }
}
145 
146 /*
147 ** THREAD SAFETY NOTES
148 **
149 ** Basic rules:
150 **
151 **     * Both read and write access to the global write-op queue must be
152 **       protected by the async.queueMutex. As are the async.ioError and
153 **       async.nFile variables.
154 **
155 **     * The async.pLock list and all AsyncLock and AsyncFileLock
156 **       structures must be protected by the async.lockMutex mutex.
157 **
158 **     * The file handles from the underlying system are not assumed to
159 **       be thread safe.
160 **
161 **     * See the last two paragraphs under "The Writer Thread" for
162 **       an assumption to do with file-handle synchronization by the Os.
163 **
164 ** Deadlock prevention:
165 **
**     There are three mutexes used by the system: the "writer" mutex,
**     the "queue" mutex and the "lock" mutex. Rules are:
**
**     * It is illegal to block on the writer mutex when any other mutex
**       is held, and
**
**     * It is illegal to block on the queue mutex when the lock mutex
**       is held.
**
**     i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
176 **
177 ** File system operations (invoked by SQLite thread):
178 **
179 **     xOpen
180 **     xDelete
181 **     xFileExists
182 **
183 ** File handle operations (invoked by SQLite thread):
184 **
185 **         asyncWrite, asyncClose, asyncTruncate, asyncSync
186 **
187 **     The operations above add an entry to the global write-op list. They
188 **     prepare the entry, acquire the async.queueMutex momentarily while
189 **     list pointers are  manipulated to insert the new entry, then release
190 **     the mutex and signal the writer thread to wake up in case it happens
191 **     to be asleep.
192 **
193 **
194 **         asyncRead, asyncFileSize.
195 **
**     Read operations. Both of these read from the underlying file
**     first, then adjust their result based on pending writes in the
**     write-op queue.   So async.queueMutex is held for the duration
**     of these operations to prevent other threads from changing the
**     queue in mid operation.
201 **
202 **
203 **         asyncLock, asyncUnlock, asyncCheckReservedLock
204 **
205 **     These primitives implement in-process locking using a hash table
206 **     on the file name.  Files are locked correctly for connections coming
207 **     from the same process.  But other processes cannot see these locks
208 **     and will therefore not honor them.
209 **
210 **
211 ** The writer thread:
212 **
**     The async.writerMutex is used to make sure that there is only
**     a single writer thread running at a time.
215 **
216 **     Inside the writer thread is a loop that works like this:
217 **
218 **         WHILE (write-op list is not empty)
219 **             Do IO operation at head of write-op list
220 **             Remove entry from head of write-op list
221 **         END WHILE
222 **
223 **     The async.queueMutex is always held during the <write-op list is
224 **     not empty> test, and when the entry is removed from the head
225 **     of the write-op list. Sometimes it is held for the interim
226 **     period (while the IO is performed), and sometimes it is
227 **     relinquished. It is relinquished if (a) the IO op is an
228 **     ASYNC_CLOSE or (b) when the file handle was opened, two of
229 **     the underlying systems handles were opened on the same
230 **     file-system entry.
231 **
232 **     If condition (b) above is true, then one file-handle
233 **     (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
234 **     file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
235 **     threads to perform write() operations. This means that read
236 **     operations are not blocked by asynchronous writes (although
237 **     asynchronous writes may still be blocked by reads).
238 **
239 **     This assumes that the OS keeps two handles open on the same file
240 **     properly in sync. That is, any read operation that starts after a
241 **     write operation on the same file system entry has completed returns
242 **     data consistent with the write. We also assume that if one thread
243 **     reads a file while another is writing it all bytes other than the
244 **     ones actually being written contain valid data.
245 **
246 **     If the above assumptions are not true, set the preprocessor symbol
247 **     SQLITE_ASYNC_TWO_FILEHANDLES to 0.
248 */
249 
250 
/* TESTONLY(X) expands to X in builds where assert() is active and to
** nothing when NDEBUG is defined.  It is used for bookkeeping that only
** assert() statements read. */
#ifdef NDEBUG
# define TESTONLY( X )
#else
# define TESTONLY( X ) X
#endif
256 
257 /*
258 ** There are two definitions of the following functions. One for pthreads
259 ** compatible systems and one for Win32. These functions isolate the OS
260 ** specific code required by each platform.
261 **
262 ** The system uses three mutexes and a single condition variable. To
263 ** block on a mutex, async_mutex_enter() is called. The parameter passed
264 ** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
265 ** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
266 ** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
267 ** called with a parameter identifying the mutex being unlocked. Mutexes
268 ** are not recursive - it is an error to call async_mutex_enter() to
269 ** lock a mutex that is already locked, or to call async_mutex_leave()
270 ** to unlock a mutex that is not currently locked.
271 **
272 ** The async_cond_wait() and async_cond_signal() functions are modelled
273 ** on the pthreads functions with similar names. The first parameter to
274 ** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
275 ** is called the mutex identified by the second parameter must be held.
276 ** The mutex is unlocked, and the calling thread simultaneously begins
277 ** waiting for the condition variable to be signalled by another thread.
278 ** After another thread signals the condition variable, the calling
279 ** thread stops waiting, locks mutex eMutex and returns. The
280 ** async_cond_signal() function is used to signal the condition variable.
281 ** It is assumed that the mutex used by the thread calling async_cond_wait()
282 ** is held by the caller of async_cond_signal() (otherwise there would be
283 ** a race condition).
284 **
285 ** It is guaranteed that no other thread will call async_cond_wait() when
286 ** there is already a thread waiting on the condition variable.
287 **
288 ** The async_sched_yield() function is called to suggest to the operating
289 ** system that it would be a good time to shift the current thread off the
290 ** CPU. The system will still work if this function is not implemented
291 ** (it is not currently implemented for win32), but it might be marginally
292 ** more efficient if it is.
293 */
294 static void async_mutex_enter(int eMutex);
295 static void async_mutex_leave(int eMutex);
296 static void async_cond_wait(int eCond, int eMutex);
297 static void async_cond_signal(int eCond);
298 static void async_sched_yield(void);
299 
300 /*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
304 **
305 ** For pthreads builds, both of these functions are no-ops. For win32,
306 ** they provide an opportunity to initialize and finalize the required
307 ** mutex and condition variables.
308 **
309 ** If async_os_initialize() returns other than zero, then the initialization
310 ** fails and SQLITE_ERROR is returned to the user.
311 */
312 static int async_os_initialize(void);
313 static void async_os_shutdown(void);
314 
/* Mutex identifiers accepted as the 'eMutex' argument of
** async_mutex_enter(), async_mutex_leave() and async_cond_wait().
**
** The integer values are important: assert() statements rely on them to
** verify that mutexes are locked in the correct order.  A thread that
** holds mutex M may only go on to lock mutex N when N<M; it is unsafe to
** try when (M<=N).  So the writer mutex (2) is taken first, then the
** queue mutex (1), then the lock mutex (0).
*/
#define ASYNC_MUTEX_LOCK    0
#define ASYNC_MUTEX_QUEUE   1
#define ASYNC_MUTEX_WRITER  2

/* Values for use as the 'eCond' argument of async_cond_wait() and
** async_cond_signal().  There is a single condition variable. */
#define ASYNC_COND_QUEUE    0
327 
328 /*************************************************************************
329 ** Start of OS specific code.
330 */
331 #if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
332 
333 /* The following block contains the win32 specific code. */
334 
/* True if the calling thread is recorded as the holder of mutex X.  The
** aHolder[] entries are only written inside TESTONLY(), so this is only
** meaningful when NDEBUG is not defined. */
#define mutex_held(X) (GetCurrentThreadId()==primitives.aHolder[X])

/* Win32 synchronization objects.  The structure starts out zeroed and is
** populated by async_os_initialize(). */
static struct AsyncPrimitives {
  int isInit;                     /* Set by async_os_initialize() */
  DWORD aHolder[3];               /* Thread id recorded for each mutex */
  CRITICAL_SECTION aMutex[3];     /* Indexed by the eMutex argument */
  HANDLE aCond[1];                /* Event object, indexed by eCond */
} primitives = { 0 };
343 
344 static int async_os_initialize(void){
345   if( !primitives.isInit ){
346     primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
347     if( primitives.aCond[0]==NULL ){
348       return 1;
349     }
350     InitializeCriticalSection(&primitives.aMutex[0]);
351     InitializeCriticalSection(&primitives.aMutex[1]);
352     InitializeCriticalSection(&primitives.aMutex[2]);
353     primitives.isInit = 1;
354   }
355   return 0;
356 }
357 static void async_os_shutdown(void){
358   if( primitives.isInit ){
359     DeleteCriticalSection(&primitives.aMutex[0]);
360     DeleteCriticalSection(&primitives.aMutex[1]);
361     DeleteCriticalSection(&primitives.aMutex[2]);
362     CloseHandle(primitives.aCond[0]);
363     primitives.isInit = 0;
364   }
365 }
366 
367 /* The following block contains the Win32 specific code. */
368 static void async_mutex_enter(int eMutex){
369   assert( eMutex==0 || eMutex==1 || eMutex==2 );
370   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
371   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
372   assert( eMutex!=0 || (!mutex_held(0)) );
373   EnterCriticalSection(&primitives.aMutex[eMutex]);
374   TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
375 }
376 static void async_mutex_leave(int eMutex){
377   assert( eMutex==0 || eMutex==1 || eMutex==2 );
378   assert( mutex_held(eMutex) );
379   TESTONLY( primitives.aHolder[eMutex] = 0; )
380   LeaveCriticalSection(&primitives.aMutex[eMutex]);
381 }
382 static void async_cond_wait(int eCond, int eMutex){
383   ResetEvent(primitives.aCond[eCond]);
384   async_mutex_leave(eMutex);
385   WaitForSingleObject(primitives.aCond[eCond], INFINITE);
386   async_mutex_enter(eMutex);
387 }
388 static void async_cond_signal(int eCond){
389   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
390   SetEvent(primitives.aCond[eCond]);
391 }
/* Win32: intended as a hint that the calling thread may be moved off the
** CPU.  It is currently a no-op on this platform. */
static void async_sched_yield(void){
  /* Todo: Find out if win32 offers anything like sched_yield() */
}
395 #else
396 
397 /* The following block contains the pthreads specific code. */
398 #include <pthread.h>
399 #include <sched.h>
400 
/* True if the calling thread is recorded as the holder of mutex X.  The
** aHolder[] entries are only written inside TESTONLY(), so this is only
** meaningful when NDEBUG is not defined. */
#define mutex_held(X) pthread_equal(primitives.aHolder[X], pthread_self())
402 
/* pthreads: there is nothing to set up.  Always returns 0 (success). */
static int async_os_initialize(void){
  return 0;
}
/* pthreads: there is nothing to tear down. */
static void async_os_shutdown(void){
  return;
}
405 
/* pthreads synchronization objects.  All of them are initialized
** statically, so no run-time set-up is needed. */
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];      /* Indexed by the eMutex argument */
  pthread_cond_t aCond[1];        /* Indexed by the eCond argument */
  pthread_t aHolder[3];           /* Thread recorded as holding each mutex */
} primitives = {
  { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER
  } , {
    PTHREAD_COND_INITIALIZER
  } , { 0, 0, 0 }
};
418 
419 static void async_mutex_enter(int eMutex){
420   assert( eMutex==0 || eMutex==1 || eMutex==2 );
421   assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
422   assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
423   assert( eMutex!=0 || (!mutex_held(0)) );
424   pthread_mutex_lock(&primitives.aMutex[eMutex]);
425   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
426 }
427 static void async_mutex_leave(int eMutex){
428   assert( eMutex==0 || eMutex==1 || eMutex==2 );
429   assert( mutex_held(eMutex) );
430   TESTONLY( primitives.aHolder[eMutex] = 0; )
431   pthread_mutex_unlock(&primitives.aMutex[eMutex]);
432 }
433 static void async_cond_wait(int eCond, int eMutex){
434   assert( eMutex==0 || eMutex==1 || eMutex==2 );
435   assert( mutex_held(eMutex) );
436   TESTONLY( primitives.aHolder[eMutex] = 0; )
437   pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
438   TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
439 }
440 static void async_cond_signal(int eCond){
441   assert( mutex_held(ASYNC_MUTEX_QUEUE) );
442   pthread_cond_signal(&primitives.aCond[eCond]);
443 }
/* pthreads: offer the CPU to other threads via sched_yield().  The result
** of sched_yield() is ignored. */
static void async_sched_yield(void){
  (void)sched_yield();
}
447 #endif
448 /*
449 ** End of OS specific code.
450 *************************************************************************/
451 
/* Assert that the calling thread holds mutex X (one of ASYNC_MUTEX_xxx).
** Like any assert(), this compiles to nothing when NDEBUG is defined. */
#define assert_mutex_is_held(X) assert( mutex_held(X) )
453 
454 
/* Default for SQLITE_ASYNC_TWO_FILEHANDLES is 1.  Define it to 0 before
** this point to override the default. */
#if !defined(SQLITE_ASYNC_TWO_FILEHANDLES)
# define SQLITE_ASYNC_TWO_FILEHANDLES 1
#endif
459 
/*
** State information is held in the static variable "async" defined
** as the following structure.  Every member starts out as zero.
**
** Both async.ioError and async.nFile are protected by the queue mutex
** (ASYNC_MUTEX_QUEUE), as are the pQueueFirst/pQueueLast list pointers.
*/
static struct TestAsyncStaticData {
  AsyncWrite *pQueueFirst;     /* Next write operation to be processed */
  AsyncWrite *pQueueLast;      /* Last write operation on the list */
  AsyncLock *pLock;            /* Linked list of all AsyncLock structures */
  volatile int ioDelay;        /* Extra delay between write operations */
  volatile int eHalt;          /* One of the SQLITEASYNC_HALT_XXX values */
  int ioError;                 /* True if an IO error has occurred */
  int nFile;                   /* Number of open files (from sqlite pov) */
} async = { 0,0,0,0,0,0,0 };
475 
/* Possible values of AsyncWrite.op.  The values double as indexes into
** the azOpcodeName[] array of debugging names, so the two lists must be
** kept in step. */
#define ASYNC_NOOP          0
#define ASYNC_WRITE         1
#define ASYNC_SYNC          2
#define ASYNC_TRUNCATE      3
#define ASYNC_CLOSE         4
#define ASYNC_DELETE        5
#define ASYNC_OPENEXCLUSIVE 6
#define ASYNC_UNLOCK        7
485 
/* Printable opcode names, indexed by the ASYNC_xxx value stored in
** AsyncWrite.op.  Used for debugging only.
** The order here must match the numbering of the ASYNC_xxx macros!
*/
static const char *azOpcodeName[] = {
  "NOOP",        /* 0 */
  "WRITE",       /* 1 */
  "SYNC",        /* 2 */
  "TRUNCATE",    /* 3 */
  "CLOSE",       /* 4 */
  "DELETE",      /* 5 */
  "OPENEX",      /* 6 */
  "UNLOCK"       /* 7 */
};
492 
493 /*
494 ** Entries on the write-op queue are instances of the AsyncWrite
495 ** structure, defined here.
496 **
497 ** The interpretation of the iOffset and nByte variables varies depending
498 ** on the value of AsyncWrite.op:
499 **
500 ** ASYNC_NOOP:
501 **     No values used.
502 **
503 ** ASYNC_WRITE:
504 **     iOffset -> Offset in file to write to.
505 **     nByte   -> Number of bytes of data to write (pointed to by zBuf).
506 **
507 ** ASYNC_SYNC:
508 **     nByte   -> flags to pass to sqlite3OsSync().
509 **
510 ** ASYNC_TRUNCATE:
511 **     iOffset -> Size to truncate file to.
512 **     nByte   -> Unused.
513 **
514 ** ASYNC_CLOSE:
515 **     iOffset -> Unused.
516 **     nByte   -> Unused.
517 **
518 ** ASYNC_DELETE:
519 **     iOffset -> Contains the "syncDir" flag.
520 **     nByte   -> Number of bytes of zBuf points to (file name).
521 **
522 ** ASYNC_OPENEXCLUSIVE:
523 **     iOffset -> Value of "delflag".
524 **     nByte   -> Number of bytes of zBuf points to (file name).
525 **
526 ** ASYNC_UNLOCK:
527 **     nByte   -> Argument to sqlite3OsUnlock().
528 **
529 **
530 ** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
531 ** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
532 ** single blob, so is deleted when sqlite3_free() is called on the parent
533 ** structure.
534 */
/* One entry on the global write-op queue.  The meaning of iOffset, nByte
** and zBuf depends on the value of op. */
struct AsyncWrite {
  AsyncFileData *pFileData;    /* File to write data to or sync */
  int op;                      /* One of ASYNC_xxx etc. */
  sqlite_int64 iOffset;        /* Depends on op */
  int nByte;          /* Depends on op */
  char *zBuf;         /* Data to write (ASYNC_WRITE), file name (ASYNC_DELETE,
                      ** ASYNC_OPENEXCLUSIVE), otherwise NULL */
  AsyncWrite *pNext;  /* Next write operation (to any file) */
};
543 
/*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and linked into the list headed by
** async.pLock (see AsyncLock.pNext).
**
** AsyncLock.pList points to the head of a linked list of AsyncFileLock
** structures, one for each handle currently open on the file.
**
** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
** not passed to the sqlite3OsOpen() call), or if ENABLE_FILE_LOCKING is
** not defined at compile time, variables AsyncLock.pFile and
** AsyncLock.eLock are never used. Otherwise, pFile is a file handle
** opened on the file in question and used to obtain the file-system
** locks required by database connections within this process.
**
** See comments above the asyncLock() function for more details on
** the implementation of database locking used by this backend.
*/
struct AsyncLock {
  char *zFile;                /* File name; presumably the full pathname */
  int nFile;                  /* NOTE(review): presumably the size of zFile
                              ** in bytes; confirm where it is set */
  sqlite3_file *pFile;        /* Handle used to obtain file-system locks */
  int eLock;                  /* Used together with pFile (see above) */
  AsyncFileLock *pList;       /* One entry per handle open on this file */
  AsyncLock *pNext;           /* Next in linked list headed by async.pLock */
};
571 
572 /*
573 ** An instance of the following structure is allocated along with each
574 ** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
575 ** file was opened with the SQLITE_OPEN_MAIN_DB.
576 */
/* Lock state of a single file handle. */
struct AsyncFileLock {
  int eLock;                /* Internally visible lock state (sqlite pov) */
  int eAsyncLock;           /* Lock-state with write-queue unlock */
  AsyncFileLock *pNext;     /* Next entry; presumably in AsyncLock.pList */
};
582 
583 /*
584 ** The AsyncFile structure is a subclass of sqlite3_file used for
585 ** asynchronous IO.
586 **
587 ** All of the actual data for the structure is stored in the structure
588 ** pointed to by AsyncFile.pData, which is allocated as part of the
589 ** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
590 ** lifetime of the AsyncFile structure is ended by the caller after OsClose()
591 ** is called, but the data in AsyncFileData may be required by the
592 ** writer thread after that point.
593 */
/* The handle object itself.  Code in this file casts a sqlite3_file
** pointer to an AsyncFile pointer in order to reach pData. */
struct AsyncFile {
  sqlite3_io_methods *pMethod;   /* I/O method table for this handle */
  AsyncFileData *pData;          /* All other state for this handle */
};
/* State reached through AsyncFile.pData.  closeOp is embedded here so
** that asyncClose() can queue the close operation without allocating. */
struct AsyncFileData {
  char *zName;               /* Underlying OS filename - used for debugging */
  int nName;                 /* Number of characters in zName */
  sqlite3_file *pBaseRead;   /* Read handle to the underlying Os file */
  sqlite3_file *pBaseWrite;  /* Write handle to the underlying Os file */
  AsyncFileLock lock;        /* Lock state for this handle */
  AsyncLock *pLock;          /* AsyncLock object for this file system entry */
  AsyncWrite closeOp;        /* Preallocated close operation */
};
607 
608 /*
609 ** Add an entry to the end of the global write-op list. pWrite should point
610 ** to an AsyncWrite structure allocated using sqlite3_malloc().  The writer
611 ** thread will call sqlite3_free() to free the structure after the specified
612 ** operation has been completed.
613 **
614 ** Once an AsyncWrite structure has been added to the list, it becomes the
615 ** property of the writer thread and must not be read or modified by the
616 ** caller.
617 */
618 static void addAsyncWrite(AsyncWrite *pWrite){
619   /* We must hold the queue mutex in order to modify the queue pointers */
620   if( pWrite->op!=ASYNC_UNLOCK ){
621     async_mutex_enter(ASYNC_MUTEX_QUEUE);
622   }
623 
624   /* Add the record to the end of the write-op queue */
625   assert( !pWrite->pNext );
626   if( async.pQueueLast ){
627     assert( async.pQueueFirst );
628     async.pQueueLast->pNext = pWrite;
629   }else{
630     async.pQueueFirst = pWrite;
631   }
632   async.pQueueLast = pWrite;
633   ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
634          pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
635 
636   if( pWrite->op==ASYNC_CLOSE ){
637     async.nFile--;
638   }
639 
640   /* The writer thread might have been idle because there was nothing
641   ** on the write-op queue for it to do.  So wake it up. */
642   async_cond_signal(ASYNC_COND_QUEUE);
643 
644   /* Drop the queue mutex */
645   if( pWrite->op!=ASYNC_UNLOCK ){
646     async_mutex_leave(ASYNC_MUTEX_QUEUE);
647   }
648 }
649 
650 /*
651 ** Increment async.nFile in a thread-safe manner.
652 */
653 static void incrOpenFileCount(void){
654   /* We must hold the queue mutex in order to modify async.nFile */
655   async_mutex_enter(ASYNC_MUTEX_QUEUE);
656   if( async.nFile==0 ){
657     async.ioError = SQLITE_OK;
658   }
659   async.nFile++;
660   async_mutex_leave(ASYNC_MUTEX_QUEUE);
661 }
662 
663 /*
664 ** This is a utility function to allocate and populate a new AsyncWrite
665 ** structure and insert it (via addAsyncWrite() ) into the global list.
666 */
667 static int addNewAsyncWrite(
668   AsyncFileData *pFileData,
669   int op,
670   sqlite3_int64 iOffset,
671   int nByte,
672   const char *zByte
673 ){
674   AsyncWrite *p;
675   if( op!=ASYNC_CLOSE && async.ioError ){
676     return async.ioError;
677   }
678   p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
679   if( !p ){
680     /* The upper layer does not expect operations like OsWrite() to
681     ** return SQLITE_NOMEM. This is partly because under normal conditions
682     ** SQLite is required to do rollback without calling malloc(). So
683     ** if malloc() fails here, treat it as an I/O error. The above
684     ** layer knows how to handle that.
685     */
686     return SQLITE_IOERR;
687   }
688   p->op = op;
689   p->iOffset = iOffset;
690   p->nByte = nByte;
691   p->pFileData = pFileData;
692   p->pNext = 0;
693   if( zByte ){
694     p->zBuf = (char *)&p[1];
695     memcpy(p->zBuf, zByte, nByte);
696   }else{
697     p->zBuf = 0;
698   }
699   addAsyncWrite(p);
700   return SQLITE_OK;
701 }
702 
703 /*
704 ** Close the file. This just adds an entry to the write-op list, the file is
705 ** not actually closed.
706 */
707 static int asyncClose(sqlite3_file *pFile){
708   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
709 
710   /* Unlock the file, if it is locked */
711   async_mutex_enter(ASYNC_MUTEX_LOCK);
712   p->lock.eLock = 0;
713   async_mutex_leave(ASYNC_MUTEX_LOCK);
714 
715   addAsyncWrite(&p->closeOp);
716   return SQLITE_OK;
717 }
718 
719 /*
720 ** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
721 ** writing to the underlying file, this function adds an entry to the end of
722 ** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
723 ** returned.
724 */
725 static int asyncWrite(
726   sqlite3_file *pFile,
727   const void *pBuf,
728   int amt,
729   sqlite3_int64 iOff
730 ){
731   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
732   return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
733 }
734 
735 /*
736 ** Read data from the file. First we read from the filesystem, then adjust
737 ** the contents of the buffer based on ASYNC_WRITE operations in the
738 ** write-op queue.
739 **
740 ** This method holds the mutex from start to finish.
741 */
742 static int asyncRead(
743   sqlite3_file *pFile,
744   void *zOut,
745   int iAmt,
746   sqlite3_int64 iOffset
747 ){
748   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
749   int rc = SQLITE_OK;
750   sqlite3_int64 filesize;
751   int nRead;
752   sqlite3_file *pBase = p->pBaseRead;
753 
754   /* Grab the write queue mutex for the duration of the call */
755   async_mutex_enter(ASYNC_MUTEX_QUEUE);
756 
757   /* If an I/O error has previously occurred in this virtual file
758   ** system, then all subsequent operations fail.
759   */
760   if( async.ioError!=SQLITE_OK ){
761     rc = async.ioError;
762     goto asyncread_out;
763   }
764 
765   if( pBase->pMethods ){
766     rc = pBase->pMethods->xFileSize(pBase, &filesize);
767     if( rc!=SQLITE_OK ){
768       goto asyncread_out;
769     }
770     nRead = MIN(filesize - iOffset, iAmt);
771     if( nRead>0 ){
772       rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
773       ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
774     }
775   }
776 
777   if( rc==SQLITE_OK ){
778     AsyncWrite *pWrite;
779     char *zName = p->zName;
780 
781     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
782       if( pWrite->op==ASYNC_WRITE && (
783         (pWrite->pFileData==p) ||
784         (zName && pWrite->pFileData->zName==zName)
785       )){
786         int iBeginOut = (pWrite->iOffset-iOffset);
787         int iBeginIn = -iBeginOut;
788         int nCopy;
789 
790         if( iBeginIn<0 ) iBeginIn = 0;
791         if( iBeginOut<0 ) iBeginOut = 0;
792         nCopy = MIN(pWrite->nByte-iBeginIn, iAmt-iBeginOut);
793 
794         if( nCopy>0 ){
795           memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
796           ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
797         }
798       }
799     }
800   }
801 
802 asyncread_out:
803   async_mutex_leave(ASYNC_MUTEX_QUEUE);
804   return rc;
805 }
806 
807 /*
808 ** Truncate the file to nByte bytes in length. This just adds an entry to
809 ** the write-op list, no IO actually takes place.
810 */
811 static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
812   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
813   return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
814 }
815 
816 /*
817 ** Sync the file. This just adds an entry to the write-op list, the
818 ** sync() is done later by sqlite3_async_flush().
819 */
820 static int asyncSync(sqlite3_file *pFile, int flags){
821   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
822   return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
823 }
824 
825 /*
826 ** Read the size of the file. First we read the size of the file system
827 ** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
828 ** currently in the write-op list.
829 **
830 ** This method holds the mutex from start to finish.
831 */
832 int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
833   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
834   int rc = SQLITE_OK;
835   sqlite3_int64 s = 0;
836   sqlite3_file *pBase;
837 
838   async_mutex_enter(ASYNC_MUTEX_QUEUE);
839 
840   /* Read the filesystem size from the base file. If pBaseRead is NULL, this
841   ** means the file hasn't been opened yet. In this case all relevant data
842   ** must be in the write-op queue anyway, so we can omit reading from the
843   ** file-system.
844   */
845   pBase = p->pBaseRead;
846   if( pBase->pMethods ){
847     rc = pBase->pMethods->xFileSize(pBase, &s);
848   }
849 
850   if( rc==SQLITE_OK ){
851     AsyncWrite *pWrite;
852     for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
853       if( pWrite->op==ASYNC_DELETE
854        && p->zName
855        && strcmp(p->zName, pWrite->zBuf)==0
856       ){
857         s = 0;
858       }else if( pWrite->pFileData && (
859           (pWrite->pFileData==p)
860        || (p->zName && pWrite->pFileData->zName==p->zName)
861       )){
862         switch( pWrite->op ){
863           case ASYNC_WRITE:
864             s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
865             break;
866           case ASYNC_TRUNCATE:
867             s = MIN(s, pWrite->iOffset);
868             break;
869         }
870       }
871     }
872     *piSize = s;
873   }
874   async_mutex_leave(ASYNC_MUTEX_QUEUE);
875   return rc;
876 }
877 
878 /*
879 ** Lock or unlock the actual file-system entry.
880 */
881 static int getFileLock(AsyncLock *pLock){
882   int rc = SQLITE_OK;
883   AsyncFileLock *pIter;
884   int eRequired = 0;
885 
886   if( pLock->pFile ){
887     for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
888       assert(pIter->eAsyncLock>=pIter->eLock);
889       if( pIter->eAsyncLock>eRequired ){
890         eRequired = pIter->eAsyncLock;
891         assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
892       }
893     }
894 
895     if( eRequired>pLock->eLock ){
896       rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
897       if( rc==SQLITE_OK ){
898         pLock->eLock = eRequired;
899       }
900     }
901     else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
902       rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
903       if( rc==SQLITE_OK ){
904         pLock->eLock = eRequired;
905       }
906     }
907   }
908 
909   return rc;
910 }
911 
912 /*
913 ** Return the AsyncLock structure from the global async.pLock list
914 ** associated with the file-system entry identified by path zName
915 ** (a string of nName bytes). If no such structure exists, return 0.
916 */
917 static AsyncLock *findLock(const char *zName, int nName){
918   AsyncLock *p = async.pLock;
919   while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
920     p = p->pNext;
921   }
922   return p;
923 }
924 
925 /*
926 ** The following two methods - asyncLock() and asyncUnlock() - are used
927 ** to obtain and release locks on database files opened with the
928 ** asynchronous backend.
929 */
930 static int asyncLock(sqlite3_file *pFile, int eLock){
931   int rc = SQLITE_OK;
932   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
933 
934   if( p->zName ){
935     async_mutex_enter(ASYNC_MUTEX_LOCK);
936     if( p->lock.eLock<eLock ){
937       AsyncLock *pLock = p->pLock;
938       AsyncFileLock *pIter;
939       assert(pLock && pLock->pList);
940       for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
941         if( pIter!=&p->lock && (
942           (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
943           (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
944           (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
945           (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
946         )){
947           rc = SQLITE_BUSY;
948         }
949       }
950       if( rc==SQLITE_OK ){
951         p->lock.eLock = eLock;
952         p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
953       }
954       assert(p->lock.eAsyncLock>=p->lock.eLock);
955       if( rc==SQLITE_OK ){
956         rc = getFileLock(pLock);
957       }
958     }
959     async_mutex_leave(ASYNC_MUTEX_LOCK);
960   }
961 
962   ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
963   return rc;
964 }
965 static int asyncUnlock(sqlite3_file *pFile, int eLock){
966   int rc = SQLITE_OK;
967   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
968   if( p->zName ){
969     AsyncFileLock *pLock = &p->lock;
970     async_mutex_enter(ASYNC_MUTEX_QUEUE);
971     async_mutex_enter(ASYNC_MUTEX_LOCK);
972     pLock->eLock = MIN(pLock->eLock, eLock);
973     rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
974     async_mutex_leave(ASYNC_MUTEX_LOCK);
975     async_mutex_leave(ASYNC_MUTEX_QUEUE);
976   }
977   return rc;
978 }
979 
980 /*
981 ** This function is called when the pager layer first opens a database file
982 ** and is checking for a hot-journal.
983 */
984 static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
985   int ret = 0;
986   AsyncFileLock *pIter;
987   AsyncFileData *p = ((AsyncFile *)pFile)->pData;
988 
989   async_mutex_enter(ASYNC_MUTEX_LOCK);
990   for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
991     if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
992       ret = 1;
993     }
994   }
995   async_mutex_leave(ASYNC_MUTEX_LOCK);
996 
997   ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
998   *pResOut = ret;
999   return SQLITE_OK;
1000 }
1001 
1002 /*
1003 ** sqlite3_file_control() implementation.
1004 */
1005 static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
1006   switch( op ){
1007     case SQLITE_FCNTL_LOCKSTATE: {
1008       async_mutex_enter(ASYNC_MUTEX_LOCK);
1009       *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
1010       async_mutex_leave(ASYNC_MUTEX_LOCK);
1011       return SQLITE_OK;
1012     }
1013   }
1014   return SQLITE_ERROR;
1015 }
1016 
1017 /*
1018 ** Return the device characteristics and sector-size of the device. It
1019 ** is not tricky to implement these correctly, as this backend might
1020 ** not have an open file handle at this point.
1021 */
1022 static int asyncSectorSize(sqlite3_file *pFile){
1023   return 512;
1024 }
1025 static int asyncDeviceCharacteristics(sqlite3_file *pFile){
1026   return 0;
1027 }
1028 
1029 static int unlinkAsyncFile(AsyncFileData *pData){
1030   AsyncFileLock **ppIter;
1031   int rc = SQLITE_OK;
1032 
1033   if( pData->zName ){
1034     AsyncLock *pLock = pData->pLock;
1035     for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
1036       if( (*ppIter)==&pData->lock ){
1037         *ppIter = pData->lock.pNext;
1038         break;
1039       }
1040     }
1041     if( !pLock->pList ){
1042       AsyncLock **pp;
1043       if( pLock->pFile ){
1044         pLock->pFile->pMethods->xClose(pLock->pFile);
1045       }
1046       for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
1047       *pp = pLock->pNext;
1048       sqlite3_free(pLock);
1049     }else{
1050       rc = getFileLock(pLock);
1051     }
1052   }
1053 
1054   return rc;
1055 }
1056 
1057 /*
1058 ** The parameter passed to this function is a copy of a 'flags' parameter
1059 ** passed to this modules xOpen() method. This function returns true
1060 ** if the file should be opened asynchronously, or false if it should
1061 ** be opened immediately.
1062 **
1063 ** If the file is to be opened asynchronously, then asyncOpen() will add
1064 ** an entry to the event queue and the file will not actually be opened
1065 ** until the event is processed. Otherwise, the file is opened directly
1066 ** by the caller.
1067 */
1068 static int doAsynchronousOpen(int flags){
1069   return (flags&SQLITE_OPEN_CREATE) && (
1070       (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1071       (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1072       (flags&SQLITE_OPEN_DELETEONCLOSE)
1073   );
1074 }
1075 
1076 /*
1077 ** Open a file.
1078 */
1079 static int asyncOpen(
1080   sqlite3_vfs *pAsyncVfs,
1081   const char *zName,
1082   sqlite3_file *pFile,
1083   int flags,
1084   int *pOutFlags
1085 ){
1086   static sqlite3_io_methods async_methods = {
1087     1,                               /* iVersion */
1088     asyncClose,                      /* xClose */
1089     asyncRead,                       /* xRead */
1090     asyncWrite,                      /* xWrite */
1091     asyncTruncate,                   /* xTruncate */
1092     asyncSync,                       /* xSync */
1093     asyncFileSize,                   /* xFileSize */
1094     asyncLock,                       /* xLock */
1095     asyncUnlock,                     /* xUnlock */
1096     asyncCheckReservedLock,          /* xCheckReservedLock */
1097     asyncFileControl,                /* xFileControl */
1098     asyncSectorSize,                 /* xSectorSize */
1099     asyncDeviceCharacteristics       /* xDeviceCharacteristics */
1100   };
1101 
1102   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1103   AsyncFile *p = (AsyncFile *)pFile;
1104   int nName = 0;
1105   int rc = SQLITE_OK;
1106   int nByte;
1107   AsyncFileData *pData;
1108   AsyncLock *pLock = 0;
1109   char *z;
1110   int isAsyncOpen = doAsynchronousOpen(flags);
1111 
1112   /* If zName is NULL, then the upper layer is requesting an anonymous file */
1113   if( zName ){
1114     nName = strlen(zName)+1;
1115   }
1116 
1117   nByte = (
1118     sizeof(AsyncFileData) +        /* AsyncFileData structure */
1119     2 * pVfs->szOsFile +           /* AsyncFileData.pBaseRead and pBaseWrite */
1120     nName                          /* AsyncFileData.zName */
1121   );
1122   z = sqlite3_malloc(nByte);
1123   if( !z ){
1124     return SQLITE_NOMEM;
1125   }
1126   memset(z, 0, nByte);
1127   pData = (AsyncFileData*)z;
1128   z += sizeof(pData[0]);
1129   pData->pBaseRead = (sqlite3_file*)z;
1130   z += pVfs->szOsFile;
1131   pData->pBaseWrite = (sqlite3_file*)z;
1132   pData->closeOp.pFileData = pData;
1133   pData->closeOp.op = ASYNC_CLOSE;
1134 
1135   if( zName ){
1136     z += pVfs->szOsFile;
1137     pData->zName = z;
1138     pData->nName = nName;
1139     memcpy(pData->zName, zName, nName);
1140   }
1141 
1142   if( !isAsyncOpen ){
1143     int flagsout;
1144     rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
1145     if( rc==SQLITE_OK && (flagsout&SQLITE_OPEN_READWRITE) ){
1146       rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1147     }
1148     if( pOutFlags ){
1149       *pOutFlags = flagsout;
1150     }
1151   }
1152 
1153   async_mutex_enter(ASYNC_MUTEX_LOCK);
1154 
1155   if( zName && rc==SQLITE_OK ){
1156     pLock = findLock(pData->zName, pData->nName);
1157     if( !pLock ){
1158       int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1159       pLock = (AsyncLock *)sqlite3_malloc(nByte);
1160       if( pLock ){
1161         memset(pLock, 0, nByte);
1162 #ifdef ENABLE_FILE_LOCKING
1163         if( flags&SQLITE_OPEN_MAIN_DB ){
1164           pLock->pFile = (sqlite3_file *)&pLock[1];
1165           rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1166           if( rc!=SQLITE_OK ){
1167             sqlite3_free(pLock);
1168             pLock = 0;
1169           }
1170         }
1171 #endif
1172         if( pLock ){
1173           pLock->nFile = pData->nName;
1174           pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1175           memcpy(pLock->zFile, pData->zName, pLock->nFile);
1176           pLock->pNext = async.pLock;
1177           async.pLock = pLock;
1178         }
1179       }else{
1180         rc = SQLITE_NOMEM;
1181       }
1182     }
1183   }
1184 
1185   if( rc==SQLITE_OK ){
1186     p->pMethod = &async_methods;
1187     p->pData = pData;
1188 
1189     /* Link AsyncFileData.lock into the linked list of
1190     ** AsyncFileLock structures for this file.
1191     */
1192     if( zName ){
1193       pData->lock.pNext = pLock->pList;
1194       pLock->pList = &pData->lock;
1195       pData->zName = pLock->zFile;
1196     }
1197   }else{
1198     if( pData->pBaseRead->pMethods ){
1199       pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1200     }
1201     if( pData->pBaseWrite->pMethods ){
1202       pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1203     }
1204     sqlite3_free(pData);
1205   }
1206 
1207   async_mutex_leave(ASYNC_MUTEX_LOCK);
1208 
1209   if( rc==SQLITE_OK ){
1210     incrOpenFileCount();
1211     pData->pLock = pLock;
1212   }
1213 
1214   if( rc==SQLITE_OK && isAsyncOpen ){
1215     rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1216     if( rc==SQLITE_OK ){
1217       if( pOutFlags ) *pOutFlags = flags;
1218     }else{
1219       async_mutex_enter(ASYNC_MUTEX_LOCK);
1220       unlinkAsyncFile(pData);
1221       async_mutex_leave(ASYNC_MUTEX_LOCK);
1222       sqlite3_free(pData);
1223     }
1224   }
1225   if( rc!=SQLITE_OK ){
1226     p->pMethod = 0;
1227   }
1228   return rc;
1229 }
1230 
1231 /*
1232 ** Implementation of sqlite3OsDelete. Add an entry to the end of the
1233 ** write-op queue to perform the delete.
1234 */
1235 static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1236   return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, strlen(z)+1, z);
1237 }
1238 
1239 /*
1240 ** Implementation of sqlite3OsAccess. This method holds the mutex from
1241 ** start to finish.
1242 */
1243 static int asyncAccess(
1244   sqlite3_vfs *pAsyncVfs,
1245   const char *zName,
1246   int flags,
1247   int *pResOut
1248 ){
1249   int rc;
1250   int ret;
1251   AsyncWrite *p;
1252   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1253 
1254   assert(flags==SQLITE_ACCESS_READWRITE
1255       || flags==SQLITE_ACCESS_READ
1256       || flags==SQLITE_ACCESS_EXISTS
1257   );
1258 
1259   async_mutex_enter(ASYNC_MUTEX_QUEUE);
1260   rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1261   if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1262     for(p=async.pQueueFirst; p; p = p->pNext){
1263       if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1264         ret = 0;
1265       }else if( p->op==ASYNC_OPENEXCLUSIVE
1266              && p->pFileData->zName
1267              && 0==strcmp(p->pFileData->zName, zName)
1268       ){
1269         ret = 1;
1270       }
1271     }
1272   }
1273   ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1274     flags==SQLITE_ACCESS_READWRITE?"read-write":
1275     flags==SQLITE_ACCESS_READ?"read":"exists"
1276     , zName, ret)
1277   );
1278   async_mutex_leave(ASYNC_MUTEX_QUEUE);
1279   *pResOut = ret;
1280   return rc;
1281 }
1282 
1283 /*
1284 ** Fill in zPathOut with the full path to the file identified by zPath.
1285 */
1286 static int asyncFullPathname(
1287   sqlite3_vfs *pAsyncVfs,
1288   const char *zPath,
1289   int nPathOut,
1290   char *zPathOut
1291 ){
1292   int rc;
1293   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1294   rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1295 
1296   /* Because of the way intra-process file locking works, this backend
1297   ** needs to return a canonical path. The following block assumes the
1298   ** file-system uses unix style paths.
1299   */
1300   if( rc==SQLITE_OK ){
1301     int i, j;
1302     int n = nPathOut;
1303     char *z = zPathOut;
1304     while( n>1 && z[n-1]=='/' ){ n--; }
1305     for(i=j=0; i<n; i++){
1306       if( z[i]=='/' ){
1307         if( z[i+1]=='/' ) continue;
1308         if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1309           i += 1;
1310           continue;
1311         }
1312         if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1313           while( j>0 && z[j-1]!='/' ){ j--; }
1314           if( j>0 ){ j--; }
1315           i += 2;
1316           continue;
1317         }
1318       }
1319       z[j++] = z[i];
1320     }
1321     z[j] = 0;
1322   }
1323 
1324   return rc;
1325 }
1326 static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1327   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1328   return pVfs->xDlOpen(pVfs, zPath);
1329 }
1330 static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1331   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1332   pVfs->xDlError(pVfs, nByte, zErrMsg);
1333 }
1334 static void (*asyncDlSym(
1335   sqlite3_vfs *pAsyncVfs,
1336   void *pHandle,
1337   const char *zSymbol
1338 ))(void){
1339   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1340   return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1341 }
1342 static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1343   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1344   pVfs->xDlClose(pVfs, pHandle);
1345 }
1346 static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1347   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1348   return pVfs->xRandomness(pVfs, nByte, zBufOut);
1349 }
1350 static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1351   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1352   return pVfs->xSleep(pVfs, nMicro);
1353 }
1354 static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1355   sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1356   return pVfs->xCurrentTime(pVfs, pTimeOut);
1357 }
1358 
/*
** The sqlite3_vfs object for the asynchronous backend. mxPathname and
** pAppData start out as 0 here; sqlite3async_initialize() fills them in
** when the VFS is installed.
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  SQLITEASYNC_VFSNAME,  /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1378 
1379 /*
1380 ** This procedure runs in a separate thread, reading messages off of the
1381 ** write queue and processing them one by one.
1382 **
1383 ** If async.writerHaltNow is true, then this procedure exits
1384 ** after processing a single message.
1385 **
1386 ** If async.writerHaltWhenIdle is true, then this procedure exits when
1387 ** the write queue is empty.
1388 **
1389 ** If both of the above variables are false, this procedure runs
1390 ** indefinately, waiting for operations to be added to the write queue
1391 ** and processing them in the order in which they arrive.
1392 **
1393 ** An artifical delay of async.ioDelay milliseconds is inserted before
1394 ** each write operation in order to simulate the effect of a slow disk.
1395 **
1396 ** Only one instance of this procedure may be running at a time.
1397 */
1398 static void asyncWriterThread(void){
1399   sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1400   AsyncWrite *p = 0;
1401   int rc = SQLITE_OK;
1402   int holdingMutex = 0;
1403 
1404   async_mutex_enter(ASYNC_MUTEX_WRITER);
1405 
1406   while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1407     int doNotFree = 0;
1408     sqlite3_file *pBase = 0;
1409 
1410     if( !holdingMutex ){
1411       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1412     }
1413     while( (p = async.pQueueFirst)==0 ){
1414       if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1415         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1416         break;
1417       }else{
1418         ASYNC_TRACE(("IDLE\n"));
1419         async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1420         ASYNC_TRACE(("WAKEUP\n"));
1421       }
1422     }
1423     if( p==0 ) break;
1424     holdingMutex = 1;
1425 
1426     /* Right now this thread is holding the mutex on the write-op queue.
1427     ** Variable 'p' points to the first entry in the write-op queue. In
1428     ** the general case, we hold on to the mutex for the entire body of
1429     ** the loop.
1430     **
1431     ** However in the cases enumerated below, we relinquish the mutex,
1432     ** perform the IO, and then re-request the mutex before removing 'p' from
1433     ** the head of the write-op queue. The idea is to increase concurrency with
1434     ** sqlite threads.
1435     **
1436     **     * An ASYNC_CLOSE operation.
1437     **     * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1438     **       the mutex, call the underlying xOpenExclusive() function, then
1439     **       re-aquire the mutex before seting the AsyncFile.pBaseRead
1440     **       variable.
1441     **     * ASYNC_SYNC and ASYNC_WRITE operations, if
1442     **       SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1443     **       file-handles are open for the particular file being "synced".
1444     */
1445     if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1446       p->op = ASYNC_NOOP;
1447     }
1448     if( p->pFileData ){
1449       pBase = p->pFileData->pBaseWrite;
1450       if(
1451         p->op==ASYNC_CLOSE ||
1452         p->op==ASYNC_OPENEXCLUSIVE ||
1453         (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1454       ){
1455         async_mutex_leave(ASYNC_MUTEX_QUEUE);
1456         holdingMutex = 0;
1457       }
1458       if( !pBase->pMethods ){
1459         pBase = p->pFileData->pBaseRead;
1460       }
1461     }
1462 
1463     switch( p->op ){
1464       case ASYNC_NOOP:
1465         break;
1466 
1467       case ASYNC_WRITE:
1468         assert( pBase );
1469         ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1470                 p->pFileData->zName, p->nByte, p->iOffset));
1471         rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1472         break;
1473 
1474       case ASYNC_SYNC:
1475         assert( pBase );
1476         ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1477         rc = pBase->pMethods->xSync(pBase, p->nByte);
1478         break;
1479 
1480       case ASYNC_TRUNCATE:
1481         assert( pBase );
1482         ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1483                 p->pFileData->zName, p->iOffset));
1484         rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1485         break;
1486 
1487       case ASYNC_CLOSE: {
1488         AsyncFileData *pData = p->pFileData;
1489         ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1490         if( pData->pBaseWrite->pMethods ){
1491           pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1492         }
1493         if( pData->pBaseRead->pMethods ){
1494           pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1495         }
1496 
1497         /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1498         ** structures for this file. Obtain the async.lockMutex mutex
1499         ** before doing so.
1500         */
1501         async_mutex_enter(ASYNC_MUTEX_LOCK);
1502         rc = unlinkAsyncFile(pData);
1503         async_mutex_leave(ASYNC_MUTEX_LOCK);
1504 
1505         if( !holdingMutex ){
1506           async_mutex_enter(ASYNC_MUTEX_QUEUE);
1507           holdingMutex = 1;
1508         }
1509         assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1510         async.pQueueFirst = p->pNext;
1511         sqlite3_free(pData);
1512         doNotFree = 1;
1513         break;
1514       }
1515 
1516       case ASYNC_UNLOCK: {
1517         AsyncWrite *pIter;
1518         AsyncFileData *pData = p->pFileData;
1519         int eLock = p->nByte;
1520 
1521         /* When a file is locked by SQLite using the async backend, it is
1522         ** locked within the 'real' file-system synchronously. When it is
1523         ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1524         ** unlock the file asynchronously. The design of the async backend
1525         ** requires that the 'real' file-system file be locked from the
1526         ** time that SQLite first locks it (and probably reads from it)
1527         ** until all asynchronous write events that were scheduled before
1528         ** SQLite unlocked the file have been processed.
1529         **
1530         ** This is more complex if SQLite locks and unlocks the file multiple
1531         ** times in quick succession. For example, if SQLite does:
1532         **
1533         **   lock, write, unlock, lock, write, unlock
1534         **
1535         ** Each "lock" operation locks the file immediately. Each "write"
1536         ** and "unlock" operation adds an event to the event queue. If the
1537         ** second "lock" operation is performed before the first "unlock"
1538         ** operation has been processed asynchronously, then the first
1539         ** "unlock" cannot be safely processed as is, since this would mean
1540         ** the file was unlocked when the second "write" operation is
1541         ** processed. To work around this, when processing an ASYNC_UNLOCK
1542         ** operation, SQLite:
1543         **
1544         **   1) Unlocks the file to the minimum of the argument passed to
1545         **      the xUnlock() call and the current lock from SQLite's point
1546         **      of view, and
1547         **
1548         **   2) Only unlocks the file at all if this event is the last
1549         **      ASYNC_UNLOCK event on this file in the write-queue.
1550         */
1551         assert( holdingMutex==1 );
1552         assert( async.pQueueFirst==p );
1553         for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1554           if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1555         }
1556         if( !pIter ){
1557           async_mutex_enter(ASYNC_MUTEX_LOCK);
1558           pData->lock.eAsyncLock = MIN(
1559               pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1560           );
1561           assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1562           rc = getFileLock(pData->pLock);
1563           async_mutex_leave(ASYNC_MUTEX_LOCK);
1564         }
1565         break;
1566       }
1567 
1568       case ASYNC_DELETE:
1569         ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1570         rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1571         break;
1572 
1573       case ASYNC_OPENEXCLUSIVE: {
1574         int flags = (int)p->iOffset;
1575         AsyncFileData *pData = p->pFileData;
1576         ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1577         assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1578         rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1579         assert( holdingMutex==0 );
1580         async_mutex_enter(ASYNC_MUTEX_QUEUE);
1581         holdingMutex = 1;
1582         break;
1583       }
1584 
1585       default: assert(!"Illegal value for AsyncWrite.op");
1586     }
1587 
1588     /* If we didn't hang on to the mutex during the IO op, obtain it now
1589     ** so that the AsyncWrite structure can be safely removed from the
1590     ** global write-op queue.
1591     */
1592     if( !holdingMutex ){
1593       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1594       holdingMutex = 1;
1595     }
1596     /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1597     if( p==async.pQueueLast ){
1598       async.pQueueLast = 0;
1599     }
1600     if( !doNotFree ){
1601       assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1602       async.pQueueFirst = p->pNext;
1603       sqlite3_free(p);
1604     }
1605     assert( holdingMutex );
1606 
1607     /* An IO error has occurred. We cannot report the error back to the
1608     ** connection that requested the I/O since the error happened
1609     ** asynchronously.  The connection has already moved on.  There
1610     ** really is nobody to report the error to.
1611     **
1612     ** The file for which the error occurred may have been a database or
1613     ** journal file. Regardless, none of the currently queued operations
1614     ** associated with the same database should now be performed. Nor should
1615     ** any subsequently requested IO on either a database or journal file
1616     ** handle for the same database be accepted until the main database
1617     ** file handle has been closed and reopened.
1618     **
1619     ** Furthermore, no further IO should be queued or performed on any file
1620     ** handle associated with a database that may have been part of a
1621     ** multi-file transaction that included the database associated with
1622     ** the IO error (i.e. a database ATTACHed to the same handle at some
1623     ** point in time).
1624     */
1625     if( rc!=SQLITE_OK ){
1626       async.ioError = rc;
1627     }
1628 
1629     if( async.ioError && !async.pQueueFirst ){
1630       async_mutex_enter(ASYNC_MUTEX_LOCK);
1631       if( 0==async.pLock ){
1632         async.ioError = SQLITE_OK;
1633       }
1634       async_mutex_leave(ASYNC_MUTEX_LOCK);
1635     }
1636 
1637     /* Drop the queue mutex before continuing to the next write operation
1638     ** in order to give other threads a chance to work with the write queue.
1639     */
1640     if( !async.pQueueFirst || !async.ioError ){
1641       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1642       holdingMutex = 0;
1643       if( async.ioDelay>0 ){
1644         pVfs->xSleep(pVfs, async.ioDelay);
1645       }else{
1646         async_sched_yield();
1647       }
1648     }
1649   }
1650 
1651   async_mutex_leave(ASYNC_MUTEX_WRITER);
1652   return;
1653 }
1654 
1655 /*
1656 ** Install the asynchronous VFS.
1657 */
1658 int sqlite3async_initialize(const char *zParent, int isDefault){
1659   int rc = SQLITE_OK;
1660   if( async_vfs.pAppData==0 ){
1661     sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1662     if( !pParent || async_os_initialize() ){
1663       rc = SQLITE_ERROR;
1664     }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1665       async_os_shutdown();
1666     }else{
1667       async_vfs.pAppData = (void *)pParent;
1668       async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1669     }
1670   }
1671   return rc;
1672 }
1673 
1674 /*
1675 ** Uninstall the asynchronous VFS.
1676 */
1677 void sqlite3async_shutdown(void){
1678   if( async_vfs.pAppData ){
1679     async_os_shutdown();
1680     sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1681     async_vfs.pAppData = 0;
1682   }
1683 }
1684 
/*
** Process events on the write-queue. This is a thin public wrapper that
** runs asyncWriterThread() on the calling thread.
*/
void sqlite3async_run(void){
  asyncWriterThread();
  return;
}
1691 
1692 /*
1693 ** Control/configure the asynchronous IO system.
1694 */
1695 int sqlite3async_control(int op, ...){
1696   va_list ap;
1697   va_start(ap, op);
1698   switch( op ){
1699     case SQLITEASYNC_HALT: {
1700       int eWhen = va_arg(ap, int);
1701       if( eWhen!=SQLITEASYNC_HALT_NEVER
1702        && eWhen!=SQLITEASYNC_HALT_NOW
1703        && eWhen!=SQLITEASYNC_HALT_IDLE
1704       ){
1705         return SQLITE_ERROR;
1706       }
1707       async.eHalt = eWhen;
1708       async_mutex_enter(ASYNC_MUTEX_QUEUE);
1709       async_cond_signal(ASYNC_COND_QUEUE);
1710       async_mutex_leave(ASYNC_MUTEX_QUEUE);
1711       break;
1712     }
1713 
1714     case SQLITEASYNC_DELAY: {
1715       int iDelay = va_arg(ap, int);
1716       async.ioDelay = iDelay;
1717       break;
1718     }
1719 
1720     case SQLITEASYNC_GET_HALT: {
1721       int *peWhen = va_arg(ap, int *);
1722       *peWhen = async.eHalt;
1723       break;
1724     }
1725     case SQLITEASYNC_GET_DELAY: {
1726       int *piDelay = va_arg(ap, int *);
1727       *piDelay = async.ioDelay;
1728       break;
1729     }
1730 
1731     default:
1732       return SQLITE_ERROR;
1733   }
1734   return SQLITE_OK;
1735 }
1736 
1737 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1738 
1739