xref: /sqlite-3.40.0/src/os_unix.c (revision 20a9ed1d)
/*
** 2004 May 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains the VFS implementation for unix-like operating systems
** including Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
**
** There are actually several different VFS implementations in this file.
** The differences are in the way that file locking is done.  The default
** implementation uses Posix Advisory Locks.  Alternative implementations
** use flock(), dot-files, various proprietary locking schemas, or simply
** skip locking altogether.
**
** This source file is organized into divisions where the logic for various
** subfunctions is contained within the appropriate division.  PLEASE
** KEEP THE STRUCTURE OF THIS FILE INTACT.  New code should be placed
** in the correct division and should be clearly labeled.
**
** The layout of divisions is as follows:
**
**   *  General-purpose declarations and utility functions.
**   *  Unique file ID logic used by VxWorks.
**   *  Various locking primitive implementations (all except proxy locking):
**      + for Posix Advisory Locks
**      + for no-op locks
**      + for dot-file locks
**      + for flock() locking
**      + for named semaphore locks (VxWorks only)
**      + for AFP filesystem locks (MacOSX only)
**   *  sqlite3_file methods not associated with locking.
**   *  Definitions of sqlite3_io_methods objects for all locking
**      methods plus "finder" functions for each locking method.
**   *  sqlite3_vfs method implementations.
**   *  Locking primitives for the proxy uber-locking-method. (MacOSX only)
**   *  Definitions of sqlite3_vfs objects for all locking methods
**      plus implementations of sqlite3_os_init() and sqlite3_os_end().
*/
46bbd42a6dSdrh #include "sqliteInt.h"
4729bafeabSdanielk1977 #if SQLITE_OS_UNIX              /* This file is used on unix only */
4866560adaSdrh 
/*
** There are various methods for file locking used for concurrency
** control:
**
**   1. POSIX locking (the default),
**   2. No locking,
**   3. Dot-file locking,
**   4. flock() locking,
**   5. AFP locking (OSX only),
**   6. Named POSIX semaphores (VXWorks only),
**   7. proxy locking. (OSX only)
**
** Styles 4, 5, and 7 are only available if SQLITE_ENABLE_LOCKING_STYLE
** is defined to 1.  The SQLITE_ENABLE_LOCKING_STYLE also enables automatic
** selection of the appropriate locking style based on the filesystem
** where the database is located.
*/
6640bbb0a3Sdrh #if !defined(SQLITE_ENABLE_LOCKING_STYLE)
67d2cb50b7Sdrh #  if defined(__APPLE__)
6840bbb0a3Sdrh #    define SQLITE_ENABLE_LOCKING_STYLE 1
6940bbb0a3Sdrh #  else
7040bbb0a3Sdrh #    define SQLITE_ENABLE_LOCKING_STYLE 0
7140bbb0a3Sdrh #  endif
7240bbb0a3Sdrh #endif
73bfe6631eSdrh 
74e32a256aSdrh /* Use pread() and pwrite() if they are available */
7579a2ca39Sdrh #if defined(__APPLE__)
7679a2ca39Sdrh # define HAVE_PREAD 1
7779a2ca39Sdrh # define HAVE_PWRITE 1
7879a2ca39Sdrh #endif
79e32a256aSdrh #if defined(HAVE_PREAD64) && defined(HAVE_PWRITE64)
80e32a256aSdrh # undef USE_PREAD
81e32a256aSdrh # define USE_PREAD64 1
82e32a256aSdrh #elif defined(HAVE_PREAD) && defined(HAVE_PWRITE)
8379a2ca39Sdrh # undef USE_PREAD64
8479a2ca39Sdrh # define USE_PREAD 1
85e32a256aSdrh #endif
86e32a256aSdrh 
879cbe6352Sdrh /*
889cbe6352Sdrh ** standard include files.
899cbe6352Sdrh */
906b013afbSdrh #include <sys/types.h>   /* amalgamator: keep */
916b013afbSdrh #include <sys/stat.h>    /* amalgamator: keep */
929cbe6352Sdrh #include <fcntl.h>
93efe16971Sdan #include <sys/ioctl.h>
946b013afbSdrh #include <unistd.h>      /* amalgamator: keep */
95bbd42a6dSdrh #include <time.h>
96b126ec18Sdrh #include <sys/time.h>    /* amalgamator: keep */
97bbd42a6dSdrh #include <errno.h>
9832c12fe2Sdan #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
99f2424c52Sdrh # include <sys/mman.h>
100b469f460Sdrh #endif
1011da88f02Sdrh 
102e89b2918Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
103bfe6631eSdrh # include <sys/ioctl.h>
1049b35ea62Sdrh # include <sys/file.h>
105bfe6631eSdrh # include <sys/param.h>
106bfe6631eSdrh #endif /* SQLITE_ENABLE_LOCKING_STYLE */
1079cbe6352Sdrh 
/*
** Try to determine if gethostuuid() is available based on standard
** macros.  This might sometimes compute the wrong value for some
** obscure platforms.  For those cases, simply compile with one of
** the following:
**
**    -DHAVE_GETHOSTUUID=0
**    -DHAVE_GETHOSTUUID=1
**
** None of this matters except when building on Apple products with
** -DSQLITE_ENABLE_LOCKING_STYLE.
*/
120e4079e1fSdrh #ifndef HAVE_GETHOSTUUID
121e4079e1fSdrh # define HAVE_GETHOSTUUID 0
1226bca6511Sdrh # if defined(__APPLE__) && ((__MAC_OS_X_VERSION_MIN_REQUIRED > 1050) || \
1236bca6511Sdrh                             (__IPHONE_OS_VERSION_MIN_REQUIRED > 2000))
1246bca6511Sdrh #    if (!defined(TARGET_OS_EMBEDDED) || (TARGET_OS_EMBEDDED==0)) \
12514f38b3dSdrh         && (!defined(TARGET_IPHONE_SIMULATOR) || (TARGET_IPHONE_SIMULATOR==0))\
12614f38b3dSdrh         && (!defined(TARGET_OS_MACCATALYST) || (TARGET_OS_MACCATALYST==0))
127e4079e1fSdrh #      undef HAVE_GETHOSTUUID
1286bca6511Sdrh #      define HAVE_GETHOSTUUID 1
1296bca6511Sdrh #    else
1306bca6511Sdrh #      warning "gethostuuid() is disabled."
1316bca6511Sdrh #    endif
1326bca6511Sdrh #  endif
133e4079e1fSdrh #endif
1346bca6511Sdrh 
1356bca6511Sdrh 
136e89b2918Sdrh #if OS_VXWORKS
137e89b2918Sdrh # include <sys/ioctl.h>
138e89b2918Sdrh # include <semaphore.h>
139e89b2918Sdrh # include <limits.h>
140e89b2918Sdrh #endif /* OS_VXWORKS */
141e89b2918Sdrh 
142e89b2918Sdrh #if defined(__APPLE__) || SQLITE_ENABLE_LOCKING_STYLE
14384a2bf67Sdrh # include <sys/mount.h>
14484a2bf67Sdrh #endif
14584a2bf67Sdrh 
146dbe4b88aSdrh #ifdef HAVE_UTIME
147dbe4b88aSdrh # include <utime.h>
148dbe4b88aSdrh #endif
149dbe4b88aSdrh 
1509cbe6352Sdrh /*
1517ed97b9dSdrh ** Allowed values of unixFile.fsFlags
1527ed97b9dSdrh */
1537ed97b9dSdrh #define SQLITE_FSFLAGS_IS_MSDOS     0x1
1547ed97b9dSdrh 
1557ed97b9dSdrh /*
15624efa544Sdrh ** If we are to be thread-safe, include the pthreads header.
1579cbe6352Sdrh */
158d677b3d6Sdrh #if SQLITE_THREADSAFE
1599cbe6352Sdrh # include <pthread.h>
1609cbe6352Sdrh #endif
1619cbe6352Sdrh 
1629cbe6352Sdrh /*
1639cbe6352Sdrh ** Default permissions when creating a new file
1649cbe6352Sdrh */
1659cbe6352Sdrh #ifndef SQLITE_DEFAULT_FILE_PERMISSIONS
1669cbe6352Sdrh # define SQLITE_DEFAULT_FILE_PERMISSIONS 0644
1679cbe6352Sdrh #endif
1689cbe6352Sdrh 
169b4b47411Sdanielk1977 /*
170aebf413dSaswift ** Default permissions when creating auto proxy dir
171aebf413dSaswift */
172aebf413dSaswift #ifndef SQLITE_DEFAULT_PROXYDIR_PERMISSIONS
173aebf413dSaswift # define SQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
174aebf413dSaswift #endif
175aebf413dSaswift 
176aebf413dSaswift /*
177b4b47411Sdanielk1977 ** Maximum supported path-length.
178b4b47411Sdanielk1977 */
179b4b47411Sdanielk1977 #define MAX_PATHNAME 512
1809cbe6352Sdrh 
181e88ec187Sdan /*
182e88ec187Sdan ** Maximum supported symbolic links
183e88ec187Sdan */
184e88ec187Sdan #define SQLITE_MAX_SYMLINKS 100
185e88ec187Sdan 
/* Always cast the getpid() return type for compatibility with
** kernel modules in VxWorks.  The argument X is ignored; the expansion is
** fully parenthesized so that it behaves as a single primary expression
** wherever it is used. */
#define osGetpid(X) ((pid_t)getpid())
18991eb93c7Sdrh 
/*
** Only set the lastErrno if the error code is a real error and not
** a normal expected return code of SQLITE_BUSY or SQLITE_OK.
**
** The argument is parenthesized so that an expression argument such as
** (rc & 0xff) is compared as a whole.  Note that x is evaluated twice, so
** it must not have side effects.
*/
#define IS_LOCK_ERROR(x)  (((x) != SQLITE_OK) && ((x) != SQLITE_BUSY))
195734c9864Sdrh 
196d91c68f6Sdrh /* Forward references */
197d91c68f6Sdrh typedef struct unixShm unixShm;               /* Connection shared memory */
198d91c68f6Sdrh typedef struct unixShmNode unixShmNode;       /* Shared memory instance */
199d91c68f6Sdrh typedef struct unixInodeInfo unixInodeInfo;   /* An i-node */
200d91c68f6Sdrh typedef struct UnixUnusedFd UnixUnusedFd;     /* An unused file descriptor */
2019cbe6352Sdrh 
/*
** Sometimes, after a file handle is closed by SQLite, the file descriptor
** cannot be closed immediately. In these cases, instances of the following
** structure are used to store the file descriptor while waiting for an
** opportunity to either close or reuse it.
**
** Instances form a singly-linked list through pNext.
*/
struct UnixUnusedFd {
  int fd;                       /* File descriptor to close */
  int flags;                    /* Flags this file descriptor was opened with */
  struct UnixUnusedFd *pNext;   /* Next unused file descriptor on same file */
};
213e946c396Sdan 
214e946c396Sdan /*
2159b35ea62Sdrh ** The unixFile structure is subclass of sqlite3_file specific to the unix
2169b35ea62Sdrh ** VFS implementations.
2179cbe6352Sdrh */
218054889ecSdrh typedef struct unixFile unixFile;
219054889ecSdrh struct unixFile {
22062079060Sdanielk1977   sqlite3_io_methods const *pMethod;  /* Always the first entry */
221de60fc2dSdrh   sqlite3_vfs *pVfs;                  /* The VFS that created this unixFile */
222d91c68f6Sdrh   unixInodeInfo *pInode;              /* Info about locks on this inode */
2236c7d5c5bSdrh   int h;                              /* The file descriptor */
224308c2a5cSdrh   unsigned char eFileLock;            /* The type of lock held on this fd */
2253ee3484cSdrh   unsigned short int ctrlFlags;       /* Behavioral bits.  UNIXFILE_* flags */
2268af6c228Sdrh   int lastErrno;                      /* The unix errno from last I/O error */
2276c7d5c5bSdrh   void *lockingContext;               /* Locking style specific state */
228c68886bbSdrh   UnixUnusedFd *pPreallocatedUnused;  /* Pre-allocated UnixUnusedFd */
229d9e5c4f6Sdrh   const char *zPath;                  /* Name of the file */
230d9e5c4f6Sdrh   unixShm *pShm;                      /* Shared memory segment information */
2316e09d69cSdan   int szChunk;                        /* Configured by FCNTL_CHUNK_SIZE */
232e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
2330d0614bdSdrh   int nFetchOut;                      /* Number of outstanding xFetch refs */
2340d0614bdSdrh   sqlite3_int64 mmapSize;             /* Usable size of mapping at pMapRegion */
2359b4c59faSdrh   sqlite3_int64 mmapSizeActual;       /* Actual size of mapping at pMapRegion */
2369b4c59faSdrh   sqlite3_int64 mmapSizeMax;          /* Configured FCNTL_MMAP_SIZE value */
2370d0614bdSdrh   void *pMapRegion;                   /* Memory mapped region */
238e98844f7Smistachkin #endif
239537dddf0Sdrh   int sectorSize;                     /* Device sector size */
240537dddf0Sdrh   int deviceCharacteristics;          /* Precomputed device characteristics */
24108c6d446Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
24208c6d446Sdrh   int openFlags;                      /* The flags specified at open() */
24308c6d446Sdrh #endif
2447ed97b9dSdrh #if SQLITE_ENABLE_LOCKING_STYLE || defined(__APPLE__)
2457ed97b9dSdrh   unsigned fsFlags;                   /* cached details from statfs() */
2467ed97b9dSdrh #endif
247f0119b2eSdrh #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
248f0119b2eSdrh   unsigned iBusyTimeout;              /* Wait this many millisec on locks */
249f0119b2eSdrh #endif
2506c7d5c5bSdrh #if OS_VXWORKS
251107886abSdrh   struct vxworksFileId *pId;          /* Unique file ID */
2526c7d5c5bSdrh #endif
253d3d8c04fSdrh #ifdef SQLITE_DEBUG
2548f941bc7Sdrh   /* The next group of variables are used to track whether or not the
2558f941bc7Sdrh   ** transaction counter in bytes 24-27 of database files are updated
2568f941bc7Sdrh   ** whenever any part of the database changes.  An assertion fault will
2578f941bc7Sdrh   ** occur if a file is updated without also updating the transaction
2588f941bc7Sdrh   ** counter.  This test is made to avoid new problems similar to the
2598f941bc7Sdrh   ** one described by ticket #3584.
2608f941bc7Sdrh   */
2618f941bc7Sdrh   unsigned char transCntrChng;   /* True if the transaction counter changed */
2628f941bc7Sdrh   unsigned char dbUpdate;        /* True if any part of database file changed */
2638f941bc7Sdrh   unsigned char inNormalWrite;   /* True if in a normal write operation */
264f23da966Sdan 
2658f941bc7Sdrh #endif
266f23da966Sdan 
267967a4a1cSdanielk1977 #ifdef SQLITE_TEST
268967a4a1cSdanielk1977   /* In test mode, increase the size of this structure a bit so that
269967a4a1cSdanielk1977   ** it is larger than the struct CrashFile defined in test6.c.
270967a4a1cSdanielk1977   */
271967a4a1cSdanielk1977   char aPadding[32];
272967a4a1cSdanielk1977 #endif
2739cbe6352Sdrh };
2749cbe6352Sdrh 
/* This variable holds the process id (pid) from when the xRandomness()
** method was called.  If xOpen() is called from a different process id,
** indicating that a fork() has occurred, the PRNG will be reset.
** The initial value is 0.
*/
static pid_t randomnessPid = 0;
280b00d8621Sdrh 
2810ccebe7eSdrh /*
282a7e61d8bSdrh ** Allowed values for the unixFile.ctrlFlags bitmask:
283a7e61d8bSdrh */
284a7e61d8bSdrh #define UNIXFILE_EXCL        0x01     /* Connections from one process only */
2857719711bSdrh #define UNIXFILE_RDONLY      0x02     /* Connection is read only */
286f0b190d9Sdrh #define UNIXFILE_PERSIST_WAL 0x04     /* Persistent WAL mode */
287ee140c4dSdan #ifndef SQLITE_DISABLE_DIRSYNC
2880059eae3Sdrh # define UNIXFILE_DIRSYNC    0x08     /* Directory sync needed */
289ee140c4dSdan #else
290ee140c4dSdan # define UNIXFILE_DIRSYNC    0x00
291ee140c4dSdan #endif
292cb15f35fSdrh #define UNIXFILE_PSOW        0x10     /* SQLITE_IOCAP_POWERSAFE_OVERWRITE */
293c02a43afSdrh #define UNIXFILE_DELETE      0x20     /* Delete on close */
294c02a43afSdrh #define UNIXFILE_URI         0x40     /* Filename might have query parameters */
295c02a43afSdrh #define UNIXFILE_NOLOCK      0x80     /* Do no file locking */
296a7e61d8bSdrh 
297a7e61d8bSdrh /*
298198bf391Sdrh ** Include code that is common to all os_*.c files
299198bf391Sdrh */
300198bf391Sdrh #include "os_common.h"
301198bf391Sdrh 
302198bf391Sdrh /*
3030ccebe7eSdrh ** Define various macros that are missing from some systems.
3040ccebe7eSdrh */
305bbd42a6dSdrh #ifndef O_LARGEFILE
306bbd42a6dSdrh # define O_LARGEFILE 0
307bbd42a6dSdrh #endif
308bbd42a6dSdrh #ifdef SQLITE_DISABLE_LFS
309bbd42a6dSdrh # undef O_LARGEFILE
310bbd42a6dSdrh # define O_LARGEFILE 0
311bbd42a6dSdrh #endif
312bbd42a6dSdrh #ifndef O_NOFOLLOW
313bbd42a6dSdrh # define O_NOFOLLOW 0
314bbd42a6dSdrh #endif
315bbd42a6dSdrh #ifndef O_BINARY
316bbd42a6dSdrh # define O_BINARY 0
317bbd42a6dSdrh #endif
318bbd42a6dSdrh 
319bbd42a6dSdrh /*
3202b4b5962Sdrh ** The threadid macro resolves to the thread-id or to 0.  Used for
3212b4b5962Sdrh ** testing and debugging only.
3222b4b5962Sdrh */
323d677b3d6Sdrh #if SQLITE_THREADSAFE
3242b4b5962Sdrh #define threadid pthread_self()
3252b4b5962Sdrh #else
3262b4b5962Sdrh #define threadid 0
3272b4b5962Sdrh #endif
3282b4b5962Sdrh 
32999ab3b12Sdrh /*
330e6ecd663Sdan ** HAVE_MREMAP defaults to true on Linux and false everywhere else.
331e6ecd663Sdan */
332e6ecd663Sdan #if !defined(HAVE_MREMAP)
333e6ecd663Sdan # if defined(__linux__) && defined(_GNU_SOURCE)
334e6ecd663Sdan #  define HAVE_MREMAP 1
335e6ecd663Sdan # else
336e6ecd663Sdan #  define HAVE_MREMAP 0
337e6ecd663Sdan # endif
338e6ecd663Sdan #endif
339e6ecd663Sdan 
340e6ecd663Sdan /*
3412ee53412Sdan ** Explicitly call the 64-bit version of lseek() on Android. Otherwise, lseek()
3422ee53412Sdan ** is the 32-bit version, even if _FILE_OFFSET_BITS=64 is defined.
3432ee53412Sdan */
3442ee53412Sdan #ifdef __ANDROID__
3452ee53412Sdan # define lseek lseek64
3462ee53412Sdan #endif
3472ee53412Sdan 
348d76dba7eSdrh #ifdef __linux__
349d76dba7eSdrh /*
350d76dba7eSdrh ** Linux-specific IOCTL magic numbers used for controlling F2FS
351d76dba7eSdrh */
352efe16971Sdan #define F2FS_IOCTL_MAGIC        0xf5
353efe16971Sdan #define F2FS_IOC_START_ATOMIC_WRITE     _IO(F2FS_IOCTL_MAGIC, 1)
354efe16971Sdan #define F2FS_IOC_COMMIT_ATOMIC_WRITE    _IO(F2FS_IOCTL_MAGIC, 2)
355efe16971Sdan #define F2FS_IOC_START_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 3)
356efe16971Sdan #define F2FS_IOC_ABORT_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 5)
3579d709540Sdan #define F2FS_IOC_GET_FEATURES           _IOR(F2FS_IOCTL_MAGIC, 12, u32)
3589d709540Sdan #define F2FS_FEATURE_ATOMIC_WRITE 0x0004
359d76dba7eSdrh #endif /* __linux__ */
360efe16971Sdan 
361efe16971Sdan 
/*
** Different Unix systems declare open() in different ways.  Some use
** open(const char*,int,mode_t).  Others use open(const char*,int,...).
** The difference is important when using a pointer to the function.
**
** The safest way to deal with the problem is to always use this wrapper
** which always has the same well-defined interface.
**
** Parameters are forwarded to open() unchanged and the result of open()
** is returned unchanged.
*/
static int posixOpen(const char *zFile, int flags, int mode){
  return open(zFile, flags, mode);
}
3739a3baf10Sdrh 
37490315a24Sdrh /* Forward reference */
37590315a24Sdrh static int openDirectory(const char*, int*);
376bc76063cSdan static int unixGetpagesize(void);
37790315a24Sdrh 
3789a3baf10Sdrh /*
37999ab3b12Sdrh ** Many system calls are accessed through pointer-to-functions so that
38099ab3b12Sdrh ** they may be overridden at runtime to facilitate fault injection during
38199ab3b12Sdrh ** testing and sandboxing.  The following array holds the names and pointers
38299ab3b12Sdrh ** to all overrideable system calls.
38399ab3b12Sdrh */
38499ab3b12Sdrh static struct unix_syscall {
38548864df9Smistachkin   const char *zName;            /* Name of the system call */
38658ad580fSdrh   sqlite3_syscall_ptr pCurrent; /* Current value of the system call */
38758ad580fSdrh   sqlite3_syscall_ptr pDefault; /* Default value */
38899ab3b12Sdrh } aSyscall[] = {
3899a3baf10Sdrh   { "open",         (sqlite3_syscall_ptr)posixOpen,  0  },
3909a3baf10Sdrh #define osOpen      ((int(*)(const char*,int,int))aSyscall[0].pCurrent)
39199ab3b12Sdrh 
39258ad580fSdrh   { "close",        (sqlite3_syscall_ptr)close,      0  },
39399ab3b12Sdrh #define osClose     ((int(*)(int))aSyscall[1].pCurrent)
39499ab3b12Sdrh 
39558ad580fSdrh   { "access",       (sqlite3_syscall_ptr)access,     0  },
39699ab3b12Sdrh #define osAccess    ((int(*)(const char*,int))aSyscall[2].pCurrent)
39799ab3b12Sdrh 
39858ad580fSdrh   { "getcwd",       (sqlite3_syscall_ptr)getcwd,     0  },
39999ab3b12Sdrh #define osGetcwd    ((char*(*)(char*,size_t))aSyscall[3].pCurrent)
40099ab3b12Sdrh 
40158ad580fSdrh   { "stat",         (sqlite3_syscall_ptr)stat,       0  },
40299ab3b12Sdrh #define osStat      ((int(*)(const char*,struct stat*))aSyscall[4].pCurrent)
40399ab3b12Sdrh 
40499ab3b12Sdrh /*
40599ab3b12Sdrh ** The DJGPP compiler environment looks mostly like Unix, but it
40699ab3b12Sdrh ** lacks the fcntl() system call.  So redefine fcntl() to be something
40799ab3b12Sdrh ** that always succeeds.  This means that locking does not occur under
40899ab3b12Sdrh ** DJGPP.  But it is DOS - what did you expect?
40999ab3b12Sdrh */
41099ab3b12Sdrh #ifdef __DJGPP__
41199ab3b12Sdrh   { "fstat",        0,                 0  },
41299ab3b12Sdrh #define osFstat(a,b,c)    0
41399ab3b12Sdrh #else
41458ad580fSdrh   { "fstat",        (sqlite3_syscall_ptr)fstat,      0  },
41599ab3b12Sdrh #define osFstat     ((int(*)(int,struct stat*))aSyscall[5].pCurrent)
41699ab3b12Sdrh #endif
41799ab3b12Sdrh 
41858ad580fSdrh   { "ftruncate",    (sqlite3_syscall_ptr)ftruncate,  0  },
41999ab3b12Sdrh #define osFtruncate ((int(*)(int,off_t))aSyscall[6].pCurrent)
42099ab3b12Sdrh 
42158ad580fSdrh   { "fcntl",        (sqlite3_syscall_ptr)fcntl,      0  },
42299ab3b12Sdrh #define osFcntl     ((int(*)(int,int,...))aSyscall[7].pCurrent)
423e562be52Sdrh 
42458ad580fSdrh   { "read",         (sqlite3_syscall_ptr)read,       0  },
425e562be52Sdrh #define osRead      ((ssize_t(*)(int,void*,size_t))aSyscall[8].pCurrent)
426e562be52Sdrh 
427e89b2918Sdrh #if defined(USE_PREAD) || SQLITE_ENABLE_LOCKING_STYLE
42858ad580fSdrh   { "pread",        (sqlite3_syscall_ptr)pread,      0  },
429e562be52Sdrh #else
43058ad580fSdrh   { "pread",        (sqlite3_syscall_ptr)0,          0  },
431e562be52Sdrh #endif
432e562be52Sdrh #define osPread     ((ssize_t(*)(int,void*,size_t,off_t))aSyscall[9].pCurrent)
433e562be52Sdrh 
434e562be52Sdrh #if defined(USE_PREAD64)
43558ad580fSdrh   { "pread64",      (sqlite3_syscall_ptr)pread64,    0  },
436e562be52Sdrh #else
43758ad580fSdrh   { "pread64",      (sqlite3_syscall_ptr)0,          0  },
438e562be52Sdrh #endif
439f9986d90Sdrh #define osPread64 ((ssize_t(*)(int,void*,size_t,off64_t))aSyscall[10].pCurrent)
440e562be52Sdrh 
44158ad580fSdrh   { "write",        (sqlite3_syscall_ptr)write,      0  },
442e562be52Sdrh #define osWrite     ((ssize_t(*)(int,const void*,size_t))aSyscall[11].pCurrent)
443e562be52Sdrh 
444e89b2918Sdrh #if defined(USE_PREAD) || SQLITE_ENABLE_LOCKING_STYLE
44558ad580fSdrh   { "pwrite",       (sqlite3_syscall_ptr)pwrite,     0  },
446e562be52Sdrh #else
44758ad580fSdrh   { "pwrite",       (sqlite3_syscall_ptr)0,          0  },
448e562be52Sdrh #endif
449e562be52Sdrh #define osPwrite    ((ssize_t(*)(int,const void*,size_t,off_t))\
450e562be52Sdrh                     aSyscall[12].pCurrent)
451e562be52Sdrh 
452e562be52Sdrh #if defined(USE_PREAD64)
45358ad580fSdrh   { "pwrite64",     (sqlite3_syscall_ptr)pwrite64,   0  },
454e562be52Sdrh #else
45558ad580fSdrh   { "pwrite64",     (sqlite3_syscall_ptr)0,          0  },
456e562be52Sdrh #endif
457f9986d90Sdrh #define osPwrite64  ((ssize_t(*)(int,const void*,size_t,off64_t))\
458e562be52Sdrh                     aSyscall[13].pCurrent)
459e562be52Sdrh 
46058ad580fSdrh   { "fchmod",       (sqlite3_syscall_ptr)fchmod,          0  },
4612aa5a00eSdrh #define osFchmod    ((int(*)(int,mode_t))aSyscall[14].pCurrent)
462e562be52Sdrh 
463e562be52Sdrh #if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
46458ad580fSdrh   { "fallocate",    (sqlite3_syscall_ptr)posix_fallocate,  0 },
465e562be52Sdrh #else
46658ad580fSdrh   { "fallocate",    (sqlite3_syscall_ptr)0,                0 },
467e562be52Sdrh #endif
4680fd7d860Sdan #define osFallocate ((int(*)(int,off_t,off_t))aSyscall[15].pCurrent)
469e562be52Sdrh 
470036ac7faSdrh   { "unlink",       (sqlite3_syscall_ptr)unlink,           0 },
471036ac7faSdrh #define osUnlink    ((int(*)(const char*))aSyscall[16].pCurrent)
472036ac7faSdrh 
47390315a24Sdrh   { "openDirectory",    (sqlite3_syscall_ptr)openDirectory,      0 },
47490315a24Sdrh #define osOpenDirectory ((int(*)(const char*,int*))aSyscall[17].pCurrent)
47590315a24Sdrh 
4769ef6bc42Sdrh   { "mkdir",        (sqlite3_syscall_ptr)mkdir,           0 },
4779ef6bc42Sdrh #define osMkdir     ((int(*)(const char*,mode_t))aSyscall[18].pCurrent)
4789ef6bc42Sdrh 
4799ef6bc42Sdrh   { "rmdir",        (sqlite3_syscall_ptr)rmdir,           0 },
4809ef6bc42Sdrh #define osRmdir     ((int(*)(const char*))aSyscall[19].pCurrent)
4819ef6bc42Sdrh 
482e2258a20Sdrh #if defined(HAVE_FCHOWN)
4836226ca2aSdrh   { "fchown",       (sqlite3_syscall_ptr)fchown,          0 },
484e2258a20Sdrh #else
485e2258a20Sdrh   { "fchown",       (sqlite3_syscall_ptr)0,               0 },
486e2258a20Sdrh #endif
487d3eaebd4Sdan #define osFchown    ((int(*)(int,uid_t,gid_t))aSyscall[20].pCurrent)
48823c4b973Sdrh 
48926f625fbSdrh #if defined(HAVE_FCHOWN)
4906226ca2aSdrh   { "geteuid",      (sqlite3_syscall_ptr)geteuid,         0 },
49126f625fbSdrh #else
49226f625fbSdrh   { "geteuid",      (sqlite3_syscall_ptr)0,               0 },
49326f625fbSdrh #endif
4946226ca2aSdrh #define osGeteuid   ((uid_t(*)(void))aSyscall[21].pCurrent)
4956226ca2aSdrh 
4964dd51443Sdan #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
497893c0ffcSdan   { "mmap",         (sqlite3_syscall_ptr)mmap,            0 },
498e4a08f9eSdrh #else
499e4a08f9eSdrh   { "mmap",         (sqlite3_syscall_ptr)0,               0 },
500e4a08f9eSdrh #endif
5016226ca2aSdrh #define osMmap ((void*(*)(void*,size_t,int,int,int,off_t))aSyscall[22].pCurrent)
502893c0ffcSdan 
503e4a08f9eSdrh #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
504d1ab8065Sdrh   { "munmap",       (sqlite3_syscall_ptr)munmap,          0 },
505e4a08f9eSdrh #else
506a8299929Sdrh   { "munmap",       (sqlite3_syscall_ptr)0,               0 },
507e4a08f9eSdrh #endif
50862be1fabSdrh #define osMunmap ((int(*)(void*,size_t))aSyscall[23].pCurrent)
509d1ab8065Sdrh 
510e4a08f9eSdrh #if HAVE_MREMAP && (!defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0)
511d1ab8065Sdrh   { "mremap",       (sqlite3_syscall_ptr)mremap,          0 },
512d1ab8065Sdrh #else
513d1ab8065Sdrh   { "mremap",       (sqlite3_syscall_ptr)0,               0 },
514d1ab8065Sdrh #endif
5156226ca2aSdrh #define osMremap ((void*(*)(void*,size_t,size_t,int,...))aSyscall[24].pCurrent)
5166226ca2aSdrh 
51724dbeae6Sdrh #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
518bc76063cSdan   { "getpagesize",  (sqlite3_syscall_ptr)unixGetpagesize, 0 },
51924dbeae6Sdrh #else
52024dbeae6Sdrh   { "getpagesize",  (sqlite3_syscall_ptr)0,               0 },
52124dbeae6Sdrh #endif
5226226ca2aSdrh #define osGetpagesize ((int(*)(void))aSyscall[25].pCurrent)
523bc76063cSdan 
524e2258a20Sdrh #if defined(HAVE_READLINK)
525245fdc60Sdan   { "readlink",     (sqlite3_syscall_ptr)readlink,        0 },
526e2258a20Sdrh #else
527e2258a20Sdrh   { "readlink",     (sqlite3_syscall_ptr)0,               0 },
528e2258a20Sdrh #endif
5296226ca2aSdrh #define osReadlink ((ssize_t(*)(const char*,char*,size_t))aSyscall[26].pCurrent)
530245fdc60Sdan 
531af1b36b1Sdan #if defined(HAVE_LSTAT)
532caf6b150Sdan   { "lstat",         (sqlite3_syscall_ptr)lstat,          0 },
533af1b36b1Sdan #else
534af1b36b1Sdan   { "lstat",         (sqlite3_syscall_ptr)0,              0 },
535af1b36b1Sdan #endif
536caf6b150Sdan #define osLstat      ((int(*)(const char*,struct stat*))aSyscall[27].pCurrent)
537702eec1cSdan 
538b5d013edSdrh #if defined(__linux__) && defined(SQLITE_ENABLE_BATCH_ATOMIC_WRITE)
53916f39b6eSdan # ifdef __ANDROID__
54016f39b6eSdan   { "ioctl", (sqlite3_syscall_ptr)(int(*)(int, int, ...))ioctl, 0 },
541ec9b2a13Sdan #define osIoctl ((int(*)(int,int,...))aSyscall[28].pCurrent)
54216f39b6eSdan # else
543efe16971Sdan   { "ioctl",         (sqlite3_syscall_ptr)ioctl,          0 },
544ec9b2a13Sdan #define osIoctl ((int(*)(int,unsigned long,...))aSyscall[28].pCurrent)
54516f39b6eSdan # endif
546b5d013edSdrh #else
547b5d013edSdrh   { "ioctl",         (sqlite3_syscall_ptr)0,              0 },
548b5d013edSdrh #endif
549efe16971Sdan 
550e562be52Sdrh }; /* End of the overrideable system calls */
55199ab3b12Sdrh 
5526226ca2aSdrh 
/*
** On some systems, calls to fchown() will trigger a message in a security
** log if they come from non-root processes.  So avoid calling fchown() if
** we are not running as root.
**
** When HAVE_FCHOWN is defined: return 0 without calling fchown() if the
** effective user id is non-zero, otherwise return the result of
** osFchown(fd,uid,gid).  When HAVE_FCHOWN is not defined this is a no-op
** that always returns 0.
*/
static int robustFchown(int fd, uid_t uid, gid_t gid){
#if defined(HAVE_FCHOWN)
  if( osGeteuid()!=0 ) return 0;      /* Not root: skip the fchown() */
  return osFchown(fd, uid, gid);
#else
  /* Parameters are intentionally unused in this configuration */
  (void)fd;
  (void)uid;
  (void)gid;
  return 0;
#endif
}
5656226ca2aSdrh 
56699ab3b12Sdrh /*
56799ab3b12Sdrh ** This is the xSetSystemCall() method of sqlite3_vfs for all of the
5681df30967Sdrh ** "unix" VFSes.  Return SQLITE_OK opon successfully updating the
5691df30967Sdrh ** system call pointer, or SQLITE_NOTFOUND if there is no configurable
5701df30967Sdrh ** system call named zName.
57199ab3b12Sdrh */
unixSetSystemCall(sqlite3_vfs * pNotUsed,const char * zName,sqlite3_syscall_ptr pNewFunc)57299ab3b12Sdrh static int unixSetSystemCall(
57399ab3b12Sdrh   sqlite3_vfs *pNotUsed,        /* The VFS pointer.  Not used */
57499ab3b12Sdrh   const char *zName,            /* Name of system call to override */
57558ad580fSdrh   sqlite3_syscall_ptr pNewFunc  /* Pointer to new system call value */
57699ab3b12Sdrh ){
57758ad580fSdrh   unsigned int i;
5781df30967Sdrh   int rc = SQLITE_NOTFOUND;
57958ad580fSdrh 
58058ad580fSdrh   UNUSED_PARAMETER(pNotUsed);
58199ab3b12Sdrh   if( zName==0 ){
58299ab3b12Sdrh     /* If no zName is given, restore all system calls to their default
58399ab3b12Sdrh     ** settings and return NULL
58499ab3b12Sdrh     */
58551438a79Sdan     rc = SQLITE_OK;
58699ab3b12Sdrh     for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
58799ab3b12Sdrh       if( aSyscall[i].pDefault ){
58899ab3b12Sdrh         aSyscall[i].pCurrent = aSyscall[i].pDefault;
58999ab3b12Sdrh       }
59099ab3b12Sdrh     }
59199ab3b12Sdrh   }else{
59299ab3b12Sdrh     /* If zName is specified, operate on only the one system call
59399ab3b12Sdrh     ** specified.
59499ab3b12Sdrh     */
59599ab3b12Sdrh     for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
59699ab3b12Sdrh       if( strcmp(zName, aSyscall[i].zName)==0 ){
59799ab3b12Sdrh         if( aSyscall[i].pDefault==0 ){
59899ab3b12Sdrh           aSyscall[i].pDefault = aSyscall[i].pCurrent;
59999ab3b12Sdrh         }
6001df30967Sdrh         rc = SQLITE_OK;
60199ab3b12Sdrh         if( pNewFunc==0 ) pNewFunc = aSyscall[i].pDefault;
60299ab3b12Sdrh         aSyscall[i].pCurrent = pNewFunc;
60399ab3b12Sdrh         break;
60499ab3b12Sdrh       }
60599ab3b12Sdrh     }
60699ab3b12Sdrh   }
60799ab3b12Sdrh   return rc;
60899ab3b12Sdrh }
60999ab3b12Sdrh 
6101df30967Sdrh /*
6111df30967Sdrh ** Return the value of a system call.  Return NULL if zName is not a
6121df30967Sdrh ** recognized system call name.  NULL is also returned if the system call
6131df30967Sdrh ** is currently undefined.
6141df30967Sdrh */
unixGetSystemCall(sqlite3_vfs * pNotUsed,const char * zName)61558ad580fSdrh static sqlite3_syscall_ptr unixGetSystemCall(
61658ad580fSdrh   sqlite3_vfs *pNotUsed,
61758ad580fSdrh   const char *zName
61858ad580fSdrh ){
61958ad580fSdrh   unsigned int i;
62058ad580fSdrh 
62158ad580fSdrh   UNUSED_PARAMETER(pNotUsed);
6221df30967Sdrh   for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
6231df30967Sdrh     if( strcmp(zName, aSyscall[i].zName)==0 ) return aSyscall[i].pCurrent;
6241df30967Sdrh   }
6251df30967Sdrh   return 0;
6261df30967Sdrh }
6271df30967Sdrh 
6281df30967Sdrh /*
6291df30967Sdrh ** Return the name of the first system call after zName.  If zName==NULL
6301df30967Sdrh ** then return the name of the first system call.  Return NULL if zName
6311df30967Sdrh ** is the last system call or if zName is not the name of a valid
6321df30967Sdrh ** system call.
6331df30967Sdrh */
unixNextSystemCall(sqlite3_vfs * p,const char * zName)6341df30967Sdrh static const char *unixNextSystemCall(sqlite3_vfs *p, const char *zName){
6350fd7d860Sdan   int i = -1;
63658ad580fSdrh 
63758ad580fSdrh   UNUSED_PARAMETER(p);
6380fd7d860Sdan   if( zName ){
6390fd7d860Sdan     for(i=0; i<ArraySize(aSyscall)-1; i++){
6400fd7d860Sdan       if( strcmp(zName, aSyscall[i].zName)==0 ) break;
6411df30967Sdrh     }
6421df30967Sdrh   }
6430fd7d860Sdan   for(i++; i<ArraySize(aSyscall); i++){
6440fd7d860Sdan     if( aSyscall[i].pCurrent!=0 ) return aSyscall[i].zName;
6451df30967Sdrh   }
6461df30967Sdrh   return 0;
6471df30967Sdrh }
6481df30967Sdrh 
649ad4f1e54Sdrh /*
65077a3fdc1Sdrh ** Do not accept any file descriptor less than this value, in order to avoid
65177a3fdc1Sdrh ** opening database file using file descriptors that are commonly used for
65277a3fdc1Sdrh ** standard input, output, and error.
65377a3fdc1Sdrh */
65477a3fdc1Sdrh #ifndef SQLITE_MINIMUM_FILE_DESCRIPTOR
65577a3fdc1Sdrh # define SQLITE_MINIMUM_FILE_DESCRIPTOR 3
65677a3fdc1Sdrh #endif
65777a3fdc1Sdrh 
65877a3fdc1Sdrh /*
6598c815d14Sdrh ** Invoke open().  Do so multiple times, until it either succeeds or
6605adc60baSdrh ** fails for some reason other than EINTR.
6618c815d14Sdrh **
6628c815d14Sdrh ** If the file creation mode "m" is 0 then set it to the default for
6638c815d14Sdrh ** SQLite.  The default is SQLITE_DEFAULT_FILE_PERMISSIONS (normally
6648c815d14Sdrh ** 0644) as modified by the system umask.  If m is not 0, then
6658c815d14Sdrh ** make the file creation mode be exactly m ignoring the umask.
6668c815d14Sdrh **
6678c815d14Sdrh ** The m parameter will be non-zero only when creating -wal, -journal,
6688c815d14Sdrh ** and -shm files.  We want those files to have *exactly* the same
6698c815d14Sdrh ** permissions as their original database, unadulterated by the umask.
6708c815d14Sdrh ** In that way, if a database file is -rw-rw-rw or -rw-rw-r-, and a
6718c815d14Sdrh ** transaction crashes and leaves behind hot journals, then any
6728c815d14Sdrh ** process that is able to write to the database will also be able to
6738c815d14Sdrh ** recover the hot journals.
674ad4f1e54Sdrh */
robust_open(const char * z,int f,mode_t m)6758c815d14Sdrh static int robust_open(const char *z, int f, mode_t m){
6765adc60baSdrh   int fd;
677e1186ab2Sdrh   mode_t m2 = m ? m : SQLITE_DEFAULT_FILE_PERMISSIONS;
6785128d009Sdrh   while(1){
6795adc60baSdrh #if defined(O_CLOEXEC)
6805adc60baSdrh     fd = osOpen(z,f|O_CLOEXEC,m2);
6815adc60baSdrh #else
6825adc60baSdrh     fd = osOpen(z,f,m2);
6835adc60baSdrh #endif
6845128d009Sdrh     if( fd<0 ){
6855128d009Sdrh       if( errno==EINTR ) continue;
6865128d009Sdrh       break;
6875128d009Sdrh     }
68877a3fdc1Sdrh     if( fd>=SQLITE_MINIMUM_FILE_DESCRIPTOR ) break;
6895128d009Sdrh     osClose(fd);
6905128d009Sdrh     sqlite3_log(SQLITE_WARNING,
6915128d009Sdrh                 "attempt to open \"%s\" as file descriptor %d", z, fd);
6925128d009Sdrh     fd = -1;
6930ba36215Sdrh     if( osOpen("/dev/null", O_RDONLY, m)<0 ) break;
6945128d009Sdrh   }
695e1186ab2Sdrh   if( fd>=0 ){
696e1186ab2Sdrh     if( m!=0 ){
697e1186ab2Sdrh       struct stat statbuf;
698b83c21e6Sdan       if( osFstat(fd, &statbuf)==0
699b83c21e6Sdan        && statbuf.st_size==0
700cfc17697Sdrh        && (statbuf.st_mode&0777)!=m
701b83c21e6Sdan       ){
702e1186ab2Sdrh         osFchmod(fd, m);
703e1186ab2Sdrh       }
7048c815d14Sdrh     }
7055adc60baSdrh #if defined(FD_CLOEXEC) && (!defined(O_CLOEXEC) || O_CLOEXEC==0)
706e1186ab2Sdrh     osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
7075adc60baSdrh #endif
708e1186ab2Sdrh   }
7095adc60baSdrh   return fd;
710ad4f1e54Sdrh }
71113adf8a0Sdanielk1977 
712107886abSdrh /*
7139359c7b7Sdan ** Helper functions to obtain and relinquish the global mutex. The
7148af6c228Sdrh ** global mutex is used to protect the unixInodeInfo and
7159359c7b7Sdan ** vxworksFileId objects used by this file, all of which may be
7169359c7b7Sdan ** shared by multiple threads.
7179359c7b7Sdan **
7189359c7b7Sdan ** Function unixMutexHeld() is used to assert() that the global mutex
7199359c7b7Sdan ** is held when required. This function is only used as part of assert()
7209359c7b7Sdan ** statements. e.g.
7219359c7b7Sdan **
7229359c7b7Sdan **   unixEnterMutex()
7239359c7b7Sdan **     assert( unixMutexHeld() );
7249359c7b7Sdan **   unixEnterLeave()
725095908e1Sdrh **
726095908e1Sdrh ** To prevent deadlock, the global unixBigLock must must be acquired
727095908e1Sdrh ** before the unixInodeInfo.pLockMutex mutex, if both are held.  It is
728095908e1Sdrh ** OK to get the pLockMutex without holding unixBigLock first, but if
729095908e1Sdrh ** that happens, the unixBigLock mutex must not be acquired until after
730095908e1Sdrh ** pLockMutex is released.
731095908e1Sdrh **
732095908e1Sdrh **      OK:     enter(unixBigLock),  enter(pLockInfo)
733095908e1Sdrh **      OK:     enter(unixBigLock)
734095908e1Sdrh **      OK:     enter(pLockInfo)
735095908e1Sdrh **   ERROR:     enter(pLockInfo), enter(unixBigLock)
736107886abSdrh */
73756115893Sdrh static sqlite3_mutex *unixBigLock = 0;
unixEnterMutex(void)738107886abSdrh static void unixEnterMutex(void){
739095908e1Sdrh   assert( sqlite3_mutex_notheld(unixBigLock) );  /* Not a recursive mutex */
74056115893Sdrh   sqlite3_mutex_enter(unixBigLock);
741107886abSdrh }
unixLeaveMutex(void)742107886abSdrh static void unixLeaveMutex(void){
743095908e1Sdrh   assert( sqlite3_mutex_held(unixBigLock) );
74456115893Sdrh   sqlite3_mutex_leave(unixBigLock);
745107886abSdrh }
7469359c7b7Sdan #ifdef SQLITE_DEBUG
unixMutexHeld(void)7479359c7b7Sdan static int unixMutexHeld(void) {
74856115893Sdrh   return sqlite3_mutex_held(unixBigLock);
7499359c7b7Sdan }
7509359c7b7Sdan #endif
751107886abSdrh 
752734c9864Sdrh 
753fb383e92Smistachkin #ifdef SQLITE_HAVE_OS_TRACE
754734c9864Sdrh /*
755734c9864Sdrh ** Helper function for printing out trace information from debugging
75660ec914cSpeter.d.reid ** binaries. This returns the string representation of the supplied
757734c9864Sdrh ** integer lock-type.
758734c9864Sdrh */
azFileLock(int eFileLock)759308c2a5cSdrh static const char *azFileLock(int eFileLock){
760308c2a5cSdrh   switch( eFileLock ){
761734c9864Sdrh     case NO_LOCK: return "NONE";
762734c9864Sdrh     case SHARED_LOCK: return "SHARED";
763734c9864Sdrh     case RESERVED_LOCK: return "RESERVED";
764734c9864Sdrh     case PENDING_LOCK: return "PENDING";
765734c9864Sdrh     case EXCLUSIVE_LOCK: return "EXCLUSIVE";
766734c9864Sdrh   }
767734c9864Sdrh   return "ERROR";
768734c9864Sdrh }
769734c9864Sdrh #endif
770734c9864Sdrh 
#ifdef SQLITE_LOCK_TRACE
/*
** Print out information about all locking operations.
**
** This routine is used for troubleshooting locks on multithreaded
** platforms.  Enable by compiling with the -DSQLITE_LOCK_TRACE
** command-line option on the compiler.  This code is normally
** turned off.
**
** The routine wraps osFcntl(fd, op, p): it performs the call, prints a
** one-line description of the request and its result, and returns the
** value that osFcntl() returned.  When an F_SETLK request for a read or
** write lock fails, a follow-up F_GETLK is issued on a copy of *p and the
** lock it reports is printed as the failure reason.  The value of errno
** left by the traced call is restored before returning, so the extra
** diagnostic calls do not disturb it.
**
** Operations other than F_GETLK and F_SETLK are passed through with only
** a brief "unknown" trace line.
**
** NOTE(review): "threadid" is not declared in the portion of the file
** visible here; confirm that it is defined before enabling
** SQLITE_LOCK_TRACE.
*/
static int lockTrace(int fd, int op, struct flock *p){
  const char *zOpName;            /* Text name of op */
  const char *zType;              /* Text name of a lock type */
  int s;                          /* Result of the traced osFcntl() call */
  int savedErrno;                 /* errno as left by the traced call */

  if( op==F_GETLK ){
    zOpName = "GETLK";
  }else if( op==F_SETLK ){
    zOpName = "SETLK";
  }else{
    s = osFcntl(fd, op, p);
    sqlite3DebugPrintf("fcntl unknown %d %d %d\n", fd, op, s);
    return s;
  }
  if( p->l_type==F_RDLCK ){
    zType = "RDLCK";
  }else if( p->l_type==F_WRLCK ){
    zType = "WRLCK";
  }else if( p->l_type==F_UNLCK ){
    zType = "UNLCK";
  }else{
    assert( 0 );
    /* Give zType a defined value so that the "%s" below never receives
    ** an uninitialized pointer when assert() is compiled out. */
    zType = "UNKNOWN";
  }
  assert( p->l_whence==SEEK_SET );
  s = osFcntl(fd, op, p);
  savedErrno = errno;
  sqlite3DebugPrintf("fcntl %d %d %s %s %d %d %d %d\n",
     threadid, fd, zOpName, zType, (int)p->l_start, (int)p->l_len,
     (int)p->l_pid, s);
  if( s==(-1) && op==F_SETLK && (p->l_type==F_RDLCK || p->l_type==F_WRLCK) ){
    /* The lock request failed.  Ask which lock is in the way.  The result
    ** of this query is used for diagnostics only. */
    struct flock l2;
    l2 = *p;
    osFcntl(fd, F_GETLK, &l2);
    if( l2.l_type==F_RDLCK ){
      zType = "RDLCK";
    }else if( l2.l_type==F_WRLCK ){
      zType = "WRLCK";
    }else if( l2.l_type==F_UNLCK ){
      zType = "UNLCK";
    }else{
      assert( 0 );
      zType = "UNKNOWN";
    }
    sqlite3DebugPrintf("fcntl-failure-reason: %s %d %d %d\n",
       zType, (int)l2.l_start, (int)l2.l_len, (int)l2.l_pid);
  }
  errno = savedErrno;
  return s;
}
/* From this point on in the file, osFcntl() calls go through lockTrace() */
#undef osFcntl
#define osFcntl lockTrace
#endif /* SQLITE_LOCK_TRACE */
830734c9864Sdrh 
831ff81231eSdrh /*
832ff81231eSdrh ** Retry ftruncate() calls that fail due to EINTR
8332ee53412Sdan **
834e6d4173bSdrh ** All calls to ftruncate() within this file should be made through
835e6d4173bSdrh ** this wrapper.  On the Android platform, bypassing the logic below
836e6d4173bSdrh ** could lead to a corrupt database.
837ff81231eSdrh */
robust_ftruncate(int h,sqlite3_int64 sz)838ff81231eSdrh static int robust_ftruncate(int h, sqlite3_int64 sz){
839ff81231eSdrh   int rc;
8402ee53412Sdan #ifdef __ANDROID__
8412ee53412Sdan   /* On Android, ftruncate() always uses 32-bit offsets, even if
8422ee53412Sdan   ** _FILE_OFFSET_BITS=64 is defined. This means it is unsafe to attempt to
843524a733dSdan   ** truncate a file to any size larger than 2GiB. Silently ignore any
8442ee53412Sdan   ** such attempts.  */
8452ee53412Sdan   if( sz>(sqlite3_int64)0x7FFFFFFF ){
8462ee53412Sdan     rc = SQLITE_OK;
8472ee53412Sdan   }else
8482ee53412Sdan #endif
84999ab3b12Sdrh   do{ rc = osFtruncate(h,sz); }while( rc<0 && errno==EINTR );
850ff81231eSdrh   return rc;
851ff81231eSdrh }
852734c9864Sdrh 
853734c9864Sdrh /*
854734c9864Sdrh ** This routine translates a standard POSIX errno code into something
855734c9864Sdrh ** useful to the clients of the sqlite3 functions.  Specifically, it is
856734c9864Sdrh ** intended to translate a variety of "try again" errors into SQLITE_BUSY
857734c9864Sdrh ** and a variety of "please close the file descriptor NOW" errors into
858734c9864Sdrh ** SQLITE_IOERR
859734c9864Sdrh **
860734c9864Sdrh ** Errors during initialization of locks, or file system support for locks,
861734c9864Sdrh ** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
862734c9864Sdrh */
sqliteErrorFromPosixError(int posixError,int sqliteIOErr)863734c9864Sdrh static int sqliteErrorFromPosixError(int posixError, int sqliteIOErr) {
86491c4defbSdrh   assert( (sqliteIOErr == SQLITE_IOERR_LOCK) ||
86591c4defbSdrh           (sqliteIOErr == SQLITE_IOERR_UNLOCK) ||
86691c4defbSdrh           (sqliteIOErr == SQLITE_IOERR_RDLOCK) ||
86791c4defbSdrh           (sqliteIOErr == SQLITE_IOERR_CHECKRESERVEDLOCK) );
868734c9864Sdrh   switch (posixError) {
86991c4defbSdrh   case EACCES:
870734c9864Sdrh   case EAGAIN:
871734c9864Sdrh   case ETIMEDOUT:
872734c9864Sdrh   case EBUSY:
873734c9864Sdrh   case EINTR:
874734c9864Sdrh   case ENOLCK:
875734c9864Sdrh     /* random NFS retry error, unless during file system support
876734c9864Sdrh      * introspection, in which it actually means what it says */
877734c9864Sdrh     return SQLITE_BUSY;
878734c9864Sdrh 
879734c9864Sdrh   case EPERM:
880734c9864Sdrh     return SQLITE_PERM;
881734c9864Sdrh 
882734c9864Sdrh   default:
883734c9864Sdrh     return sqliteIOErr;
884734c9864Sdrh   }
885734c9864Sdrh }
886734c9864Sdrh 
887734c9864Sdrh 
/******************************************************************************
****************** Begin Unique File ID Utility Used By VxWorks ***************
**
** Most versions of unix provide a unique ID for a file in the form of the
** device number combined with the inode number.  That does not work on
** VxWorks, where a unique file id must instead be derived from the
** canonical filename.
**
** On VxWorks, a pointer to an instance of the structure below serves as
** the unique file ID.  Every instance holds a copy of the canonical
** filename together with a reference count, and is reclaimed once the
** number of pointers to it falls to zero.
**
** Only a few files are ever open at once and lookups are not on a
** performance-critical path, so keeping these structures on a linked
** list is sufficient.
*/
struct vxworksFileId {
  struct vxworksFileId *pNext;  /* Next entry on the list of all file IDs */
  int nRef;                     /* Count of references to this entry */
  int nName;                    /* Length of the zCanonicalName[] string */
  char *zCanonicalName;         /* The canonical filename */
};
911734c9864Sdrh 
912734c9864Sdrh #if OS_VXWORKS
/*
** Head of the linked list that holds every unique filename.
*/
static struct vxworksFileId *vxworksFileList = 0;
918734c9864Sdrh 
/*
** Simplify a filename into its canonical form
** by making the following changes:
**
**  * removing any trailing and duplicate /
**  * convert /./ into just /
**  * convert /A/../ where A is any simple name into just /
**
** Changes are made in-place.  Return the new name length.
**
** The original filename is in z[0..n-1].  Return the number of
** characters in the simplified name.  The simplified name is
** zero-terminated.
**
** A name consisting only of slashes (for example "//") simplifies
** to "/".
*/
static int vxworksSimplifyName(char *z, int n){
  int i, j;     /* i reads the original name, j writes the result; j<=i */

  /* Drop trailing slashes, but never shrink the name below one character */
  while( n>1 && z[n-1]=='/' ){ n--; }
  for(i=j=0; i<n; i++){
    /* Only look ahead while z[i+1] is still inside the trimmed name.
    ** Otherwise a trailing slash that was trimmed above would be mistaken
    ** for a duplicate, and "//" would collapse to an empty string. */
    if( z[i]=='/' && i+1<n ){
      if( z[i+1]=='/' ) continue;       /* Skip the first of a pair of / */
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        /* Found "/./" - resume at its second slash */
        i += 1;
        continue;
      }
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        /* Found "/../" - erase the previous component from the output
        ** and resume at the final slash */
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
  return j;
}
954734c9864Sdrh 
955734c9864Sdrh /*
956734c9864Sdrh ** Find a unique file ID for the given absolute pathname.  Return
957734c9864Sdrh ** a pointer to the vxworksFileId object.  This pointer is the unique
958734c9864Sdrh ** file ID.
959734c9864Sdrh **
960734c9864Sdrh ** The nRef field of the vxworksFileId object is incremented before
961734c9864Sdrh ** the object is returned.  A new vxworksFileId object is created
962734c9864Sdrh ** and added to the global list if necessary.
963734c9864Sdrh **
964734c9864Sdrh ** If a memory allocation error occurs, return NULL.
965734c9864Sdrh */
vxworksFindFileId(const char * zAbsoluteName)966734c9864Sdrh static struct vxworksFileId *vxworksFindFileId(const char *zAbsoluteName){
967734c9864Sdrh   struct vxworksFileId *pNew;         /* search key and new file ID */
968734c9864Sdrh   struct vxworksFileId *pCandidate;   /* For looping over existing file IDs */
969734c9864Sdrh   int n;                              /* Length of zAbsoluteName string */
970734c9864Sdrh 
971734c9864Sdrh   assert( zAbsoluteName[0]=='/' );
972ea678832Sdrh   n = (int)strlen(zAbsoluteName);
973f3cdcdccSdrh   pNew = sqlite3_malloc64( sizeof(*pNew) + (n+1) );
974734c9864Sdrh   if( pNew==0 ) return 0;
975734c9864Sdrh   pNew->zCanonicalName = (char*)&pNew[1];
976734c9864Sdrh   memcpy(pNew->zCanonicalName, zAbsoluteName, n+1);
977734c9864Sdrh   n = vxworksSimplifyName(pNew->zCanonicalName, n);
978734c9864Sdrh 
979734c9864Sdrh   /* Search for an existing entry that matching the canonical name.
980734c9864Sdrh   ** If found, increment the reference count and return a pointer to
981734c9864Sdrh   ** the existing file ID.
982734c9864Sdrh   */
983734c9864Sdrh   unixEnterMutex();
984734c9864Sdrh   for(pCandidate=vxworksFileList; pCandidate; pCandidate=pCandidate->pNext){
985734c9864Sdrh     if( pCandidate->nName==n
986734c9864Sdrh      && memcmp(pCandidate->zCanonicalName, pNew->zCanonicalName, n)==0
987734c9864Sdrh     ){
988734c9864Sdrh        sqlite3_free(pNew);
989734c9864Sdrh        pCandidate->nRef++;
990734c9864Sdrh        unixLeaveMutex();
991734c9864Sdrh        return pCandidate;
992734c9864Sdrh     }
993734c9864Sdrh   }
994734c9864Sdrh 
995734c9864Sdrh   /* No match was found.  We will make a new file ID */
996734c9864Sdrh   pNew->nRef = 1;
997734c9864Sdrh   pNew->nName = n;
998734c9864Sdrh   pNew->pNext = vxworksFileList;
999734c9864Sdrh   vxworksFileList = pNew;
1000734c9864Sdrh   unixLeaveMutex();
1001734c9864Sdrh   return pNew;
1002734c9864Sdrh }
1003734c9864Sdrh 
1004734c9864Sdrh /*
1005734c9864Sdrh ** Decrement the reference count on a vxworksFileId object.  Free
1006734c9864Sdrh ** the object when the reference count reaches zero.
1007734c9864Sdrh */
vxworksReleaseFileId(struct vxworksFileId * pId)1008734c9864Sdrh static void vxworksReleaseFileId(struct vxworksFileId *pId){
1009734c9864Sdrh   unixEnterMutex();
1010734c9864Sdrh   assert( pId->nRef>0 );
1011734c9864Sdrh   pId->nRef--;
1012734c9864Sdrh   if( pId->nRef==0 ){
1013734c9864Sdrh     struct vxworksFileId **pp;
1014734c9864Sdrh     for(pp=&vxworksFileList; *pp && *pp!=pId; pp = &((*pp)->pNext)){}
1015734c9864Sdrh     assert( *pp==pId );
1016734c9864Sdrh     *pp = pId->pNext;
1017734c9864Sdrh     sqlite3_free(pId);
1018734c9864Sdrh   }
1019734c9864Sdrh   unixLeaveMutex();
1020734c9864Sdrh }
1021734c9864Sdrh #endif /* OS_VXWORKS */
1022734c9864Sdrh /*************** End of Unique File ID Utility Used By VxWorks ****************
1023734c9864Sdrh ******************************************************************************/
1024734c9864Sdrh 
1025734c9864Sdrh 
1026734c9864Sdrh /******************************************************************************
1027734c9864Sdrh *************************** Posix Advisory Locking ****************************
1028734c9864Sdrh **
10299b35ea62Sdrh ** POSIX advisory locks are broken by design.  ANSI STD 1003.1 (1996)
1030bbd42a6dSdrh ** section 6.5.2.2 lines 483 through 490 specify that when a process
1031bbd42a6dSdrh ** sets or clears a lock, that operation overrides any prior locks set
1032bbd42a6dSdrh ** by the same process.  It does not explicitly say so, but this implies
1033bbd42a6dSdrh ** that it overrides locks set by the same process using a different
1034bbd42a6dSdrh ** file descriptor.  Consider this test case:
10356c7d5c5bSdrh **
10366c7d5c5bSdrh **       int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
1037bbd42a6dSdrh **       int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
1038bbd42a6dSdrh **
1039bbd42a6dSdrh ** Suppose ./file1 and ./file2 are really the same file (because
1040bbd42a6dSdrh ** one is a hard or symbolic link to the other) then if you set
1041bbd42a6dSdrh ** an exclusive lock on fd1, then try to get an exclusive lock
1042bbd42a6dSdrh ** on fd2, it works.  I would have expected the second lock to
1043bbd42a6dSdrh ** fail since there was already a lock on the file due to fd1.
1044bbd42a6dSdrh ** But not so.  Since both locks came from the same process, the
1045bbd42a6dSdrh ** second overrides the first, even though they were on different
1046bbd42a6dSdrh ** file descriptors opened on different file names.
1047bbd42a6dSdrh **
1048734c9864Sdrh ** This means that we cannot use POSIX locks to synchronize file access
1049734c9864Sdrh ** among competing threads of the same process.  POSIX locks will work fine
1050bbd42a6dSdrh ** to synchronize access for threads in separate processes, but not
1051bbd42a6dSdrh ** threads within the same process.
1052bbd42a6dSdrh **
1053bbd42a6dSdrh ** To work around the problem, SQLite has to manage file locks internally
1054bbd42a6dSdrh ** on its own.  Whenever a new database is opened, we have to find the
1055bbd42a6dSdrh ** specific inode of the database file (the inode is determined by the
1056bbd42a6dSdrh ** st_dev and st_ino fields of the stat structure that fstat() fills in)
1057bbd42a6dSdrh ** and check for locks already existing on that inode.  When locks are
1058bbd42a6dSdrh ** created or removed, we have to look at our own internal record of the
1059bbd42a6dSdrh ** locks to see if another thread has previously set a lock on that same
1060bbd42a6dSdrh ** inode.
1061bbd42a6dSdrh **
10629b35ea62Sdrh ** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
10639b35ea62Sdrh ** For VxWorks, we have to use the alternative unique ID system based on
10649b35ea62Sdrh ** canonical filename and implemented in the previous division.)
10659b35ea62Sdrh **
1066ad94b58aSdanielk1977 ** The sqlite3_file structure for POSIX is no longer just an integer file
1067bbd42a6dSdrh ** descriptor.  It is now a structure that holds the integer file
1068bbd42a6dSdrh ** descriptor and a pointer to a structure that describes the internal
1069bbd42a6dSdrh ** locks on the corresponding inode.  There is one locking structure
1070ad94b58aSdanielk1977 ** per inode, so if the same inode is opened twice, both unixFile structures
1071bbd42a6dSdrh ** point to the same locking structure.  The locking structure keeps
1072bbd42a6dSdrh ** a reference count (so we will know when to delete it) and a "cnt"
1073bbd42a6dSdrh ** field that tells us its internal lock status.  cnt==0 means the
1074bbd42a6dSdrh ** file is unlocked.  cnt==-1 means the file has an exclusive lock.
1075bbd42a6dSdrh ** cnt>0 means there are cnt shared locks on the file.
1076bbd42a6dSdrh **
1077bbd42a6dSdrh ** Any attempt to lock or unlock a file first checks the locking
1078bbd42a6dSdrh ** structure.  The fcntl() system call is only invoked to set a
1079bbd42a6dSdrh ** POSIX lock if the internal lock structure transitions between
1080bbd42a6dSdrh ** a locked and an unlocked state.
1081bbd42a6dSdrh **
1082734c9864Sdrh ** But wait:  there are yet more problems with POSIX advisory locks.
1083bbd42a6dSdrh **
1084bbd42a6dSdrh ** If you close a file descriptor that points to a file that has locks,
1085bbd42a6dSdrh ** all locks on that file that are owned by the current process are
10868af6c228Sdrh ** released.  To work around this problem, each unixInodeInfo object
** maintains a count of the number of pending locks on the inode.
10888af6c228Sdrh ** When an attempt is made to close an unixFile, if there are
1089ad94b58aSdanielk1977 ** other unixFile open on the same inode that are holding locks, the call
1090bbd42a6dSdrh ** to close() the file descriptor is deferred until all of the locks clear.
10918af6c228Sdrh ** The unixInodeInfo structure keeps a list of file descriptors that need to
1092bbd42a6dSdrh ** be closed and that list is walked (and cleared) when the last lock
1093bbd42a6dSdrh ** clears.
1094bbd42a6dSdrh **
10959b35ea62Sdrh ** Yet another problem:  LinuxThreads do not play well with posix locks.
10965fdae771Sdrh **
10979b35ea62Sdrh ** Many older versions of linux use the LinuxThreads library which is
10989b35ea62Sdrh ** not posix compliant.  Under LinuxThreads, a lock created by thread
1099734c9864Sdrh ** A cannot be modified or overridden by a different thread B.
1100734c9864Sdrh ** Only thread A can modify the lock.  Locking behavior is correct
** if the application uses the newer Native Posix Thread Library (NPTL)
1102734c9864Sdrh ** on linux - with NPTL a lock created by thread A can override locks
1103734c9864Sdrh ** in thread B.  But there is no way to know at compile-time which
1104734c9864Sdrh ** threading library is being used.  So there is no way to know at
1105734c9864Sdrh ** compile-time whether or not thread A can override locks on thread B.
11068af6c228Sdrh ** One has to do a run-time check to discover the behavior of the
1107734c9864Sdrh ** current process.
11085fdae771Sdrh **
11098af6c228Sdrh ** SQLite used to support LinuxThreads.  But support for LinuxThreads
11108af6c228Sdrh ** was dropped beginning with version 3.7.0.  SQLite will still work with
11118af6c228Sdrh ** LinuxThreads provided that (1) there is no more than one connection
11128af6c228Sdrh ** per database file in the same process and (2) database connections
11138af6c228Sdrh ** do not move across threads.
1114bbd42a6dSdrh */
1115bbd42a6dSdrh 
1116bbd42a6dSdrh /*
11176c7d5c5bSdrh ** An instance of the following structure serves as the key used
11188af6c228Sdrh ** to locate a particular unixInodeInfo object.
11196c7d5c5bSdrh */
11206c7d5c5bSdrh struct unixFileId {
1121bbd42a6dSdrh   dev_t dev;                  /* Device number */
11226c7d5c5bSdrh #if OS_VXWORKS
1123107886abSdrh   struct vxworksFileId *pId;  /* Unique file ID for vxworks. */
112497185489Schw #else
112525ef7f55Sdrh   /* We are told that some versions of Android contain a bug that
112625ef7f55Sdrh   ** sizes ino_t at only 32-bits instead of 64-bits. (See
112725ef7f55Sdrh   ** https://android-review.googlesource.com/#/c/115351/3/dist/sqlite3.c)
112825ef7f55Sdrh   ** To work around this, always allocate 64-bits for the inode number.
112925ef7f55Sdrh   ** On small machines that only have 32-bit inodes, this wastes 4 bytes,
113025ef7f55Sdrh   ** but that should not be a big deal. */
113125ef7f55Sdrh   /* WAS:  ino_t ino;   */
113225ef7f55Sdrh   u64 ino;                   /* Inode number */
113397185489Schw #endif
11346c7d5c5bSdrh };
11356c7d5c5bSdrh 
11366c7d5c5bSdrh /*
1137bbd42a6dSdrh ** An instance of the following structure is allocated for each open
113824efa544Sdrh ** inode.
1139bbd42a6dSdrh **
1140ad94b58aSdanielk1977 ** A single inode can have multiple file descriptors, so each unixFile
1141bbd42a6dSdrh ** structure contains a pointer to an instance of this object and this
1142ad94b58aSdanielk1977 ** object keeps a count of the number of unixFile pointing to it.
1143da6dc240Sdrh **
1144da6dc240Sdrh ** Mutex rules:
1145da6dc240Sdrh **
1146095908e1Sdrh **  (1) Only the pLockMutex mutex must be held in order to read or write
1147da6dc240Sdrh **      any of the locking fields:
1148ef52b36aSdrh **          nShared, nLock, eFileLock, bProcessLock, pUnused
1149da6dc240Sdrh **
1150da6dc240Sdrh **  (2) When nRef>0, then the following fields are unchanging and can
1151da6dc240Sdrh **      be read (but not written) without holding any mutex:
1152da6dc240Sdrh **          fileId, pLockMutex
1153da6dc240Sdrh **
1154ef52b36aSdrh **  (3) With the exceptions above, all the fields may only be read
1155da6dc240Sdrh **      or written while holding the global unixBigLock mutex.
1156095908e1Sdrh **
1157095908e1Sdrh ** Deadlock prevention:  The global unixBigLock mutex may not
1158095908e1Sdrh ** be acquired while holding the pLockMutex mutex.  If both unixBigLock
1159095908e1Sdrh ** and pLockMutex are needed, then unixBigLock must be acquired first.
1160bbd42a6dSdrh */
11618af6c228Sdrh struct unixInodeInfo {
11628af6c228Sdrh   struct unixFileId fileId;       /* The lookup key */
1163da6dc240Sdrh   sqlite3_mutex *pLockMutex;      /* Hold this mutex for... */
1164308c2a5cSdrh   int nShared;                      /* Number of SHARED locks held */
1165da6dc240Sdrh   int nLock;                        /* Number of outstanding file locks */
1166a7e61d8bSdrh   unsigned char eFileLock;          /* One of SHARED_LOCK, RESERVED_LOCK etc. */
1167a7e61d8bSdrh   unsigned char bProcessLock;       /* An exclusive process lock is held */
1168095908e1Sdrh   UnixUnusedFd *pUnused;            /* Unused file descriptors to close */
1169bbd42a6dSdrh   int nRef;                       /* Number of pointers to this structure */
1170d91c68f6Sdrh   unixShmNode *pShmNode;          /* Shared memory associated with this inode */
1171d91c68f6Sdrh   unixInodeInfo *pNext;           /* List of all unixInodeInfo objects */
1172d91c68f6Sdrh   unixInodeInfo *pPrev;           /*    .... doubly linked */
1173d4a80312Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
11747ed97b9dSdrh   unsigned long long sharedByte;  /* for AFP simulated shared lock */
11757ed97b9dSdrh #endif
11766c7d5c5bSdrh #if OS_VXWORKS
117797185489Schw   sem_t *pSem;                    /* Named POSIX semaphore */
11782238dcccSdrh   char aSemName[MAX_PATHNAME+2];  /* Name of that semaphore */
117997185489Schw #endif
1180bbd42a6dSdrh };
1181bbd42a6dSdrh 
1182bbd42a6dSdrh /*
11838af6c228Sdrh ** A lists of all unixInodeInfo objects.
118424efa544Sdrh **
118524efa544Sdrh ** Must hold unixBigLock in order to read or write this variable.
1186bbd42a6dSdrh */
1187c68886bbSdrh static unixInodeInfo *inodeList = 0;  /* All unixInodeInfo objects */
1188095908e1Sdrh 
#ifdef SQLITE_DEBUG
/*
** Report whether the inode mutex of pFile, which is the
** pFile->pInode->pLockMutex field, is held (unixFileMutexHeld) or is not
** held (unixFileMutexNotheld).  These routines are used only within
** assert() to help verify correct mutex usage.  pFile->pInode must not
** be NULL.
*/
int unixFileMutexHeld(unixFile *pFile){
  sqlite3_mutex *pMutex;
  assert( pFile->pInode );
  pMutex = pFile->pInode->pLockMutex;
  return sqlite3_mutex_held(pMutex);
}
int unixFileMutexNotheld(unixFile *pFile){
  sqlite3_mutex *pMutex;
  assert( pFile->pInode );
  pMutex = pFile->pInode->pLockMutex;
  return sqlite3_mutex_notheld(pMutex);
}
#endif
12045fdae771Sdrh 
/*
** Log a failed operating-system call through sqlite3_log().
**
** This routine, unixLogErrorAtLine(), is only ever reached by way of the
** unixLogError() macro, which appends __LINE__ as the final argument.
**
** It is meant to be invoked right after an OS function has failed and set
** errno: errno is sampled on entry, before anything else is done.  The
** logged message contains that errno value and, where it can be obtained,
** the human-readable equivalent from strerror() or strerror_r().
**
** Arguments to the macro:
**
**   a   The error code that will be returned to SQLite
**       (e.g. SQLITE_IOERR_DELETE, SQLITE_CANTOPEN).  This routine
**       returns the same value so that callers can "return unixLogError()".
**   b   Name of the OS function that failed (e.g. "unlink", "open").
**   c   The associated file-system path, or NULL if there is none.  A NULL
**       path is logged as an empty string.
*/
#define unixLogError(a,b,c)     unixLogErrorAtLine(a,b,c,__LINE__)
static int unixLogErrorAtLine(
  int errcode,                    /* SQLite error code */
  const char *zFunc,              /* Name of OS function that failed */
  const char *zPath,              /* File path associated with error */
  int iLine                       /* Source line number where error occurred */
){
  int iErrno = errno;             /* Saved syscall error number */
  char *zErr;                     /* Message from strerror() or equivalent */

  /* Threadsafe builds (SQLITE_THREADSAFE!=0) must not call strerror().
  ** They use strerror_r() when HAVE_STRERROR_R is defined and otherwise
  ** log an empty message.  Non-threadsafe builds use plain strerror().
  */
#if SQLITE_THREADSAFE && defined(HAVE_STRERROR_R)
  char aErr[80];
  memset(aErr, 0, sizeof(aErr));

  /* If STRERROR_R_CHAR_P (set by autoconf scripts) or __USE_GNU is defined,
  ** assume the GNU flavour of strerror_r(), which returns a pointer to the
  ** message.  That pointer may refer to aErr[] or to static storage
  ** elsewhere.  Otherwise assume the POSIX flavour, which always writes the
  ** message into aErr[].
  **
  ** Wrongly assuming POSIX merely yields an (often) empty message.  Wrongly
  ** assuming GNU could lead to a segfault, so the GNU branch is taken only
  ** when one of the macros above says so.
  */
# if defined(STRERROR_R_CHAR_P) || defined(__USE_GNU)
  zErr = strerror_r(iErrno, aErr, sizeof(aErr)-1);
# else
  strerror_r(iErrno, aErr, sizeof(aErr)-1);
  zErr = aErr;
# endif

#elif SQLITE_THREADSAFE
  /* This is a threadsafe build, but strerror_r() is not available. */
  zErr = "";
#else
  /* Non-threadsafe build, use strerror(). */
  zErr = strerror(iErrno);
#endif

  sqlite3_log(errcode,
      "os_unix.c:%d: (%d) %s(%s) - %s",
      iLine, iErrno, zFunc, (zPath ? zPath : ""), zErr
  );

  return errcode;
}
1273e18d4953Sdan 
12740e9365ceSdrh /*
12750e9365ceSdrh ** Close a file descriptor.
12760e9365ceSdrh **
12770e9365ceSdrh ** We assume that close() almost always works, since it is only in a
12780e9365ceSdrh ** very sick application or on a very sick platform that it might fail.
12790e9365ceSdrh ** If it does fail, simply leak the file descriptor, but do log the
12800e9365ceSdrh ** error.
12810e9365ceSdrh **
12820e9365ceSdrh ** Note that it is not safe to retry close() after EINTR since the
12830e9365ceSdrh ** file descriptor might have already been reused by another thread.
12840e9365ceSdrh ** So we don't even try to recover from an EINTR.  Just log the error
12850e9365ceSdrh ** and move on.
12860e9365ceSdrh */
robust_close(unixFile * pFile,int h,int lineno)12870e9365ceSdrh static void robust_close(unixFile *pFile, int h, int lineno){
128899ab3b12Sdrh   if( osClose(h) ){
12890e9365ceSdrh     unixLogErrorAtLine(SQLITE_IOERR_CLOSE, "close",
12900e9365ceSdrh                        pFile ? pFile->zPath : 0, lineno);
12910e9365ceSdrh   }
12920e9365ceSdrh }
1293e18d4953Sdan 
1294e18d4953Sdan /*
1295e6d4173bSdrh ** Set the pFile->lastErrno.  Do this in a subroutine as that provides
1296e6d4173bSdrh ** a convenient place to set a breakpoint.
12974bf66fd6Sdrh */
storeLastErrno(unixFile * pFile,int error)12984bf66fd6Sdrh static void storeLastErrno(unixFile *pFile, int error){
12994bf66fd6Sdrh   pFile->lastErrno = error;
13004bf66fd6Sdrh }
13014bf66fd6Sdrh 
13024bf66fd6Sdrh /*
1303b0ac3e3aSdan ** Close all file descriptors accumuated in the unixInodeInfo->pUnused list.
1304b0ac3e3aSdan */
closePendingFds(unixFile * pFile)13050e9365ceSdrh static void closePendingFds(unixFile *pFile){
1306b0ac3e3aSdan   unixInodeInfo *pInode = pFile->pInode;
1307b0ac3e3aSdan   UnixUnusedFd *p;
1308b0ac3e3aSdan   UnixUnusedFd *pNext;
1309ef52b36aSdrh   assert( unixFileMutexHeld(pFile) );
1310b0ac3e3aSdan   for(p=pInode->pUnused; p; p=pNext){
1311b0ac3e3aSdan     pNext = p->pNext;
13120e9365ceSdrh     robust_close(pFile, p->fd, __LINE__);
1313b0ac3e3aSdan     sqlite3_free(p);
1314b0ac3e3aSdan   }
13150e9365ceSdrh   pInode->pUnused = 0;
1316b0ac3e3aSdan }
1317b0ac3e3aSdan 
1318b0ac3e3aSdan /*
13198af6c228Sdrh ** Release a unixInodeInfo structure previously allocated by findInodeInfo().
13209359c7b7Sdan **
132124efa544Sdrh ** The global mutex must be held when this routine is called, but the mutex
132224efa544Sdrh ** on the inode being deleted must NOT be held.
13236c7d5c5bSdrh */
releaseInodeInfo(unixFile * pFile)1324b0ac3e3aSdan static void releaseInodeInfo(unixFile *pFile){
1325b0ac3e3aSdan   unixInodeInfo *pInode = pFile->pInode;
13269359c7b7Sdan   assert( unixMutexHeld() );
1327095908e1Sdrh   assert( unixFileMutexNotheld(pFile) );
1328661d71afSdan   if( ALWAYS(pInode) ){
13298af6c228Sdrh     pInode->nRef--;
13308af6c228Sdrh     if( pInode->nRef==0 ){
1331d91c68f6Sdrh       assert( pInode->pShmNode==0 );
1332ef52b36aSdrh       sqlite3_mutex_enter(pInode->pLockMutex);
1333b0ac3e3aSdan       closePendingFds(pFile);
1334ef52b36aSdrh       sqlite3_mutex_leave(pInode->pLockMutex);
13358af6c228Sdrh       if( pInode->pPrev ){
13368af6c228Sdrh         assert( pInode->pPrev->pNext==pInode );
13378af6c228Sdrh         pInode->pPrev->pNext = pInode->pNext;
1338da0e768bSdrh       }else{
13398af6c228Sdrh         assert( inodeList==pInode );
13408af6c228Sdrh         inodeList = pInode->pNext;
1341da0e768bSdrh       }
13428af6c228Sdrh       if( pInode->pNext ){
13438af6c228Sdrh         assert( pInode->pNext->pPrev==pInode );
13448af6c228Sdrh         pInode->pNext->pPrev = pInode->pPrev;
1345da0e768bSdrh       }
1346da6dc240Sdrh       sqlite3_mutex_free(pInode->pLockMutex);
13478af6c228Sdrh       sqlite3_free(pInode);
1348bbd42a6dSdrh     }
1349bbd42a6dSdrh   }
1350e339d65aSdanielk1977 }
1351bbd42a6dSdrh 
1352bbd42a6dSdrh /*
13538af6c228Sdrh ** Given a file descriptor, locate the unixInodeInfo object that
13548af6c228Sdrh ** describes that file descriptor.  Create a new one if necessary.  The
13558af6c228Sdrh ** return value might be uninitialized if an error occurs.
13566c7d5c5bSdrh **
135724efa544Sdrh ** The global mutex must held when calling this routine.
13589359c7b7Sdan **
13596c7d5c5bSdrh ** Return an appropriate error code.
13606c7d5c5bSdrh */
findInodeInfo(unixFile * pFile,unixInodeInfo ** ppInode)13618af6c228Sdrh static int findInodeInfo(
13626c7d5c5bSdrh   unixFile *pFile,               /* Unix file with file desc used in the key */
1363d91c68f6Sdrh   unixInodeInfo **ppInode        /* Return the unixInodeInfo object here */
13646c7d5c5bSdrh ){
13656c7d5c5bSdrh   int rc;                        /* System call return code */
13666c7d5c5bSdrh   int fd;                        /* The file descriptor for pFile */
13678af6c228Sdrh   struct unixFileId fileId;      /* Lookup key for the unixInodeInfo */
13686c7d5c5bSdrh   struct stat statbuf;           /* Low-level file information */
1369d91c68f6Sdrh   unixInodeInfo *pInode = 0;     /* Candidate unixInodeInfo object */
13706c7d5c5bSdrh 
13719359c7b7Sdan   assert( unixMutexHeld() );
13729359c7b7Sdan 
13736c7d5c5bSdrh   /* Get low-level information about the file that we can used to
13746c7d5c5bSdrh   ** create a unique name for the file.
13756c7d5c5bSdrh   */
13766c7d5c5bSdrh   fd = pFile->h;
137799ab3b12Sdrh   rc = osFstat(fd, &statbuf);
13786c7d5c5bSdrh   if( rc!=0 ){
13794bf66fd6Sdrh     storeLastErrno(pFile, errno);
138040fe8d31Sdrh #if defined(EOVERFLOW) && defined(SQLITE_DISABLE_LFS)
13816c7d5c5bSdrh     if( pFile->lastErrno==EOVERFLOW ) return SQLITE_NOLFS;
13826c7d5c5bSdrh #endif
13836c7d5c5bSdrh     return SQLITE_IOERR;
13846c7d5c5bSdrh   }
13856c7d5c5bSdrh 
1386eb0d74ffSdrh #ifdef __APPLE__
13876c7d5c5bSdrh   /* On OS X on an msdos filesystem, the inode number is reported
13886c7d5c5bSdrh   ** incorrectly for zero-size files.  See ticket #3260.  To work
13896c7d5c5bSdrh   ** around this problem (we consider it a bug in OS X, not SQLite)
13906c7d5c5bSdrh   ** we always increase the file size to 1 by writing a single byte
13916c7d5c5bSdrh   ** prior to accessing the inode number.  The one byte written is
13926c7d5c5bSdrh   ** an ASCII 'S' character which also happens to be the first byte
13936c7d5c5bSdrh   ** in the header of every SQLite database.  In this way, if there
13946c7d5c5bSdrh   ** is a race condition such that another thread has already populated
13956c7d5c5bSdrh   ** the first page of the database, no damage is done.
13966c7d5c5bSdrh   */
13977ed97b9dSdrh   if( statbuf.st_size==0 && (pFile->fsFlags & SQLITE_FSFLAGS_IS_MSDOS)!=0 ){
1398e562be52Sdrh     do{ rc = osWrite(fd, "S", 1); }while( rc<0 && errno==EINTR );
1399eb0d74ffSdrh     if( rc!=1 ){
14004bf66fd6Sdrh       storeLastErrno(pFile, errno);
1401eb0d74ffSdrh       return SQLITE_IOERR;
1402eb0d74ffSdrh     }
140399ab3b12Sdrh     rc = osFstat(fd, &statbuf);
14046c7d5c5bSdrh     if( rc!=0 ){
14054bf66fd6Sdrh       storeLastErrno(pFile, errno);
14066c7d5c5bSdrh       return SQLITE_IOERR;
14076c7d5c5bSdrh     }
14086c7d5c5bSdrh   }
1409eb0d74ffSdrh #endif
14106c7d5c5bSdrh 
14118af6c228Sdrh   memset(&fileId, 0, sizeof(fileId));
14128af6c228Sdrh   fileId.dev = statbuf.st_dev;
14136c7d5c5bSdrh #if OS_VXWORKS
14148af6c228Sdrh   fileId.pId = pFile->pId;
14156c7d5c5bSdrh #else
141625ef7f55Sdrh   fileId.ino = (u64)statbuf.st_ino;
14176c7d5c5bSdrh #endif
141824efa544Sdrh   assert( unixMutexHeld() );
14198af6c228Sdrh   pInode = inodeList;
14208af6c228Sdrh   while( pInode && memcmp(&fileId, &pInode->fileId, sizeof(fileId)) ){
14218af6c228Sdrh     pInode = pInode->pNext;
14226c7d5c5bSdrh   }
14238af6c228Sdrh   if( pInode==0 ){
1424f3cdcdccSdrh     pInode = sqlite3_malloc64( sizeof(*pInode) );
14258af6c228Sdrh     if( pInode==0 ){
1426fad3039cSmistachkin       return SQLITE_NOMEM_BKPT;
14276c7d5c5bSdrh     }
14288af6c228Sdrh     memset(pInode, 0, sizeof(*pInode));
14298af6c228Sdrh     memcpy(&pInode->fileId, &fileId, sizeof(fileId));
14306886d6dbSdrh     if( sqlite3GlobalConfig.bCoreMutex ){
1431da6dc240Sdrh       pInode->pLockMutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
14326886d6dbSdrh       if( pInode->pLockMutex==0 ){
14336886d6dbSdrh         sqlite3_free(pInode);
14346886d6dbSdrh         return SQLITE_NOMEM_BKPT;
14356886d6dbSdrh       }
14366886d6dbSdrh     }
14378af6c228Sdrh     pInode->nRef = 1;
143824efa544Sdrh     assert( unixMutexHeld() );
14398af6c228Sdrh     pInode->pNext = inodeList;
14408af6c228Sdrh     pInode->pPrev = 0;
14418af6c228Sdrh     if( inodeList ) inodeList->pPrev = pInode;
14428af6c228Sdrh     inodeList = pInode;
14436c7d5c5bSdrh   }else{
14448af6c228Sdrh     pInode->nRef++;
14456c7d5c5bSdrh   }
14468af6c228Sdrh   *ppInode = pInode;
14477708e972Sdrh   return SQLITE_OK;
14487708e972Sdrh }
14497708e972Sdrh 
1450b959a017Sdrh /*
1451b959a017Sdrh ** Return TRUE if pFile has been renamed or unlinked since it was first opened.
1452b959a017Sdrh */
fileHasMoved(unixFile * pFile)1453b959a017Sdrh static int fileHasMoved(unixFile *pFile){
145461ffea54Sdrh #if OS_VXWORKS
145561ffea54Sdrh   return pFile->pInode!=0 && pFile->pId!=pFile->pInode->fileId.pId;
145661ffea54Sdrh #else
1457b959a017Sdrh   struct stat buf;
1458b959a017Sdrh   return pFile->pInode!=0 &&
145925ef7f55Sdrh       (osStat(pFile->zPath, &buf)!=0
146025ef7f55Sdrh          || (u64)buf.st_ino!=pFile->pInode->fileId.ino);
146191be7dc3Sdrh #endif
1462b959a017Sdrh }
1463b959a017Sdrh 
14645b1a2566Saswift 
14655b1a2566Saswift /*
1466fbc7e884Sdrh ** Check a unixFile that is a database.  Verify the following:
1467fbc7e884Sdrh **
1468fbc7e884Sdrh ** (1) There is exactly one hard link on the file
1469fbc7e884Sdrh ** (2) The file is not a symbolic link
1470fbc7e884Sdrh ** (3) The file has not been renamed or unlinked
1471fbc7e884Sdrh **
1472fbc7e884Sdrh ** Issue sqlite3_log(SQLITE_WARNING,...) messages if anything is not right.
1473fbc7e884Sdrh */
verifyDbFile(unixFile * pFile)1474fbc7e884Sdrh static void verifyDbFile(unixFile *pFile){
1475fbc7e884Sdrh   struct stat buf;
1476fbc7e884Sdrh   int rc;
147786151e89Sdrh 
147886151e89Sdrh   /* These verifications occurs for the main database only */
147986151e89Sdrh   if( pFile->ctrlFlags & UNIXFILE_NOLOCK ) return;
148086151e89Sdrh 
1481fbc7e884Sdrh   rc = osFstat(pFile->h, &buf);
1482fbc7e884Sdrh   if( rc!=0 ){
1483fbc7e884Sdrh     sqlite3_log(SQLITE_WARNING, "cannot fstat db file %s", pFile->zPath);
1484fbc7e884Sdrh     return;
1485fbc7e884Sdrh   }
14866369bc3fSdrh   if( buf.st_nlink==0 ){
1487fbc7e884Sdrh     sqlite3_log(SQLITE_WARNING, "file unlinked while open: %s", pFile->zPath);
1488fbc7e884Sdrh     return;
1489fbc7e884Sdrh   }
1490fbc7e884Sdrh   if( buf.st_nlink>1 ){
1491fbc7e884Sdrh     sqlite3_log(SQLITE_WARNING, "multiple links to file: %s", pFile->zPath);
1492fbc7e884Sdrh     return;
1493fbc7e884Sdrh   }
1494b959a017Sdrh   if( fileHasMoved(pFile) ){
1495fbc7e884Sdrh     sqlite3_log(SQLITE_WARNING, "file renamed while open: %s", pFile->zPath);
1496fbc7e884Sdrh     return;
1497fbc7e884Sdrh   }
1498fbc7e884Sdrh }
1499fbc7e884Sdrh 
1500fbc7e884Sdrh 
1501fbc7e884Sdrh /*
150213adf8a0Sdanielk1977 ** This routine checks if there is a RESERVED lock held on the specified
15035b1a2566Saswift ** file by this or any other process. If such a lock is held, set *pResOut
15045b1a2566Saswift ** to a non-zero value otherwise *pResOut is set to zero.  The return value
15055b1a2566Saswift ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
150613adf8a0Sdanielk1977 */
unixCheckReservedLock(sqlite3_file * id,int * pResOut)1507861f7456Sdanielk1977 static int unixCheckReservedLock(sqlite3_file *id, int *pResOut){
15085b1a2566Saswift   int rc = SQLITE_OK;
15095b1a2566Saswift   int reserved = 0;
1510054889ecSdrh   unixFile *pFile = (unixFile*)id;
151113adf8a0Sdanielk1977 
1512861f7456Sdanielk1977   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
1513861f7456Sdanielk1977 
1514054889ecSdrh   assert( pFile );
1515a8de1e1cSdrh   assert( pFile->eFileLock<=SHARED_LOCK );
1516da6dc240Sdrh   sqlite3_mutex_enter(pFile->pInode->pLockMutex);
151713adf8a0Sdanielk1977 
151813adf8a0Sdanielk1977   /* Check if a thread in this process holds such a lock */
15198af6c228Sdrh   if( pFile->pInode->eFileLock>SHARED_LOCK ){
15205b1a2566Saswift     reserved = 1;
152113adf8a0Sdanielk1977   }
152213adf8a0Sdanielk1977 
15232ac3ee97Sdrh   /* Otherwise see if some other process holds it.
152413adf8a0Sdanielk1977   */
152509480a9dSdanielk1977 #ifndef __DJGPP__
1526a7e61d8bSdrh   if( !reserved && !pFile->pInode->bProcessLock ){
152713adf8a0Sdanielk1977     struct flock lock;
152813adf8a0Sdanielk1977     lock.l_whence = SEEK_SET;
15292ac3ee97Sdrh     lock.l_start = RESERVED_BYTE;
15302ac3ee97Sdrh     lock.l_len = 1;
15312ac3ee97Sdrh     lock.l_type = F_WRLCK;
1532ea83bc61Sdan     if( osFcntl(pFile->h, F_GETLK, &lock) ){
1533ea83bc61Sdan       rc = SQLITE_IOERR_CHECKRESERVEDLOCK;
15344bf66fd6Sdrh       storeLastErrno(pFile, errno);
15355b1a2566Saswift     } else if( lock.l_type!=F_UNLCK ){
15365b1a2566Saswift       reserved = 1;
153713adf8a0Sdanielk1977     }
153813adf8a0Sdanielk1977   }
153909480a9dSdanielk1977 #endif
154013adf8a0Sdanielk1977 
1541da6dc240Sdrh   sqlite3_mutex_leave(pFile->pInode->pLockMutex);
1542308c2a5cSdrh   OSTRACE(("TEST WR-LOCK %d %d %d (unix)\n", pFile->h, rc, reserved));
154313adf8a0Sdanielk1977 
15445b1a2566Saswift   *pResOut = reserved;
15455b1a2566Saswift   return rc;
154613adf8a0Sdanielk1977 }
154713adf8a0Sdanielk1977 
1548ddcfe921Sdrh /* Forward declaration*/
1549ddcfe921Sdrh static int unixSleep(sqlite3_vfs*,int);
1550ddcfe921Sdrh 
/*
** Set a posix-advisory-lock.
**
** There are two versions of this routine.
**
** If SQLITE_ENABLE_SETLK_TIMEOUT is not defined, it is a macro that makes
** a single non-blocking fcntl(F_SETLK) attempt; the third argument is
** ignored.
**
** If SQLITE_ENABLE_SETLK_TIMEOUT is defined, it is a function whose third
** argument is a pointer to a unixFile.  When the first fcntl(F_SETLK)
** attempt fails and unixFile->iBusyTimeout is positive, the lock is
** retried up to iBusyTimeout more times, with a unixSleep(0,1000) call
** before each retry.  This routine only reads iBusyTimeout; it does not
** modify it.
**
** Either version returns the result of the last fcntl() call.
*/
#ifndef SQLITE_ENABLE_SETLK_TIMEOUT
# define osSetPosixAdvisoryLock(h,x,t) osFcntl(h,F_SETLK,x)
#else
static int osSetPosixAdvisoryLock(
  int h,                /* The file descriptor on which to take the lock */
  struct flock *pLock,  /* The description of the lock */
  unixFile *pFile       /* Structure holding timeout value */
){
  int nRetry = pFile->iBusyTimeout;   /* Retries still allowed */
  int rc;                             /* Result of most recent fcntl() */

  /* On systems that support some kind of blocking file lock with a timeout,
  ** make appropriate changes here to invoke that blocking file lock.  On
  ** generic posix, however, there is no such API.  So the lock is simply
  ** retried once every millisecond until either the timeout expires or
  ** the lock is obtained. */
  for(rc=osFcntl(h,F_SETLK,pLock); rc<0 && nRetry>0; nRetry--){
    unixSleep(0,1000);
    rc = osFcntl(h,F_SETLK,pLock);
  }
  return rc;
}
#endif /* SQLITE_ENABLE_SETLK_TIMEOUT */
1587f0119b2eSdrh 
1588f0119b2eSdrh 
1589f0119b2eSdrh /*
1590a7e61d8bSdrh ** Attempt to set a system-lock on the file pFile.  The lock is
1591a7e61d8bSdrh ** described by pLock.
1592a7e61d8bSdrh **
15937719711bSdrh ** If the pFile was opened read/write from unix-excl, then the only lock
15947719711bSdrh ** ever obtained is an exclusive lock, and it is obtained exactly once
1595a7e61d8bSdrh ** the first time any lock is attempted.  All subsequent system locking
1596a7e61d8bSdrh ** operations become no-ops.  Locking operations still happen internally,
1597a7e61d8bSdrh ** in order to coordinate access between separate database connections
1598a7e61d8bSdrh ** within this process, but all of that is handled in memory and the
1599a7e61d8bSdrh ** operating system does not participate.
16007719711bSdrh **
16017719711bSdrh ** This function is a pass-through to fcntl(F_SETLK) if pFile is using
16027719711bSdrh ** any VFS other than "unix-excl" or if pFile is opened on "unix-excl"
16037719711bSdrh ** and is read-only.
1604661d71afSdan **
1605661d71afSdan ** Zero is returned if the call completes successfully, or -1 if a call
1606661d71afSdan ** to fcntl() fails. In this case, errno is set appropriately (by fcntl()).
1607a7e61d8bSdrh */
unixFileLock(unixFile * pFile,struct flock * pLock)1608a7e61d8bSdrh static int unixFileLock(unixFile *pFile, struct flock *pLock){
1609a7e61d8bSdrh   int rc;
16103cb9339aSdrh   unixInodeInfo *pInode = pFile->pInode;
16113cb9339aSdrh   assert( pInode!=0 );
1612da6dc240Sdrh   assert( sqlite3_mutex_held(pInode->pLockMutex) );
161350358adfSdrh   if( (pFile->ctrlFlags & (UNIXFILE_EXCL|UNIXFILE_RDONLY))==UNIXFILE_EXCL ){
16143cb9339aSdrh     if( pInode->bProcessLock==0 ){
1615a7e61d8bSdrh       struct flock lock;
16163cb9339aSdrh       assert( pInode->nLock==0 );
1617a7e61d8bSdrh       lock.l_whence = SEEK_SET;
1618a7e61d8bSdrh       lock.l_start = SHARED_FIRST;
1619a7e61d8bSdrh       lock.l_len = SHARED_SIZE;
1620a7e61d8bSdrh       lock.l_type = F_WRLCK;
1621f0119b2eSdrh       rc = osSetPosixAdvisoryLock(pFile->h, &lock, pFile);
1622a7e61d8bSdrh       if( rc<0 ) return rc;
16233cb9339aSdrh       pInode->bProcessLock = 1;
16243cb9339aSdrh       pInode->nLock++;
1625a7e61d8bSdrh     }else{
1626a7e61d8bSdrh       rc = 0;
1627a7e61d8bSdrh     }
1628a7e61d8bSdrh   }else{
1629f0119b2eSdrh     rc = osSetPosixAdvisoryLock(pFile->h, pLock, pFile);
1630a7e61d8bSdrh   }
1631a7e61d8bSdrh   return rc;
1632a7e61d8bSdrh }
1633a7e61d8bSdrh 
1634a7e61d8bSdrh /*
1635308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
16369a1d0abeSdanielk1977 ** of the following:
16379a1d0abeSdanielk1977 **
16382ac3ee97Sdrh **     (1) SHARED_LOCK
16392ac3ee97Sdrh **     (2) RESERVED_LOCK
16402ac3ee97Sdrh **     (3) PENDING_LOCK
16412ac3ee97Sdrh **     (4) EXCLUSIVE_LOCK
16422ac3ee97Sdrh **
1643b3e04346Sdrh ** Sometimes when requesting one lock state, additional lock states
1644b3e04346Sdrh ** are inserted in between.  The locking might fail on one of the later
1645b3e04346Sdrh ** transitions leaving the lock state different from what it started but
1646b3e04346Sdrh ** still short of its goal.  The following chart shows the allowed
1647b3e04346Sdrh ** transitions and the inserted intermediate states:
1648b3e04346Sdrh **
1649b3e04346Sdrh **    UNLOCKED -> SHARED
1650b3e04346Sdrh **    SHARED -> RESERVED
1651b3e04346Sdrh **    SHARED -> (PENDING) -> EXCLUSIVE
1652b3e04346Sdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
1653b3e04346Sdrh **    PENDING -> EXCLUSIVE
16542ac3ee97Sdrh **
1655a6abd041Sdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
1656a6abd041Sdrh ** routine to lower a locking level.
16579a1d0abeSdanielk1977 */
unixLock(sqlite3_file * id,int eFileLock)1658308c2a5cSdrh static int unixLock(sqlite3_file *id, int eFileLock){
1659f42f25c2Sdanielk1977   /* The following describes the implementation of the various locks and
1660f42f25c2Sdanielk1977   ** lock transitions in terms of the POSIX advisory shared and exclusive
1661f42f25c2Sdanielk1977   ** lock primitives (called read-locks and write-locks below, to avoid
1662f42f25c2Sdanielk1977   ** confusion with SQLite lock names). The algorithms are complicated
1663f878e6e1Sdrh   ** slightly in order to be compatible with Windows95 systems simultaneously
1664f42f25c2Sdanielk1977   ** accessing the same database file, in case that is ever required.
1665f42f25c2Sdanielk1977   **
1666f42f25c2Sdanielk1977   ** Symbols defined in os.h identify the 'pending byte' and the 'reserved
1667f42f25c2Sdanielk1977   ** byte', each single bytes at well known offsets, and the 'shared byte
1668f42f25c2Sdanielk1977   ** range', a range of 510 bytes at a well known offset.
1669f42f25c2Sdanielk1977   **
1670f42f25c2Sdanielk1977   ** To obtain a SHARED lock, a read-lock is obtained on the 'pending
1671f878e6e1Sdrh   ** byte'.  If this is successful, 'shared byte range' is read-locked
1672f878e6e1Sdrh   ** and the lock on the 'pending byte' released.  (Legacy note:  When
1673f878e6e1Sdrh   ** SQLite was first developed, Windows95 systems were still very common,
1674f878e6e1Sdrh   ** and Windows95 lacks a shared-lock capability.  So on Windows95, a
1675f878e6e1Sdrh   ** single randomly selected byte from the 'shared byte range' is locked.
1676f878e6e1Sdrh   ** Windows95 is now pretty much extinct, but this work-around for the
1677f878e6e1Sdrh   ** lack of shared-locks on Windows95 lives on, for backwards
1678f878e6e1Sdrh   ** compatibility.)
1679f42f25c2Sdanielk1977   **
168090ba3bd0Sdanielk1977   ** A process may only obtain a RESERVED lock after it has a SHARED lock.
168190ba3bd0Sdanielk1977   ** A RESERVED lock is implemented by grabbing a write-lock on the
168290ba3bd0Sdanielk1977   ** 'reserved byte'.
1683f42f25c2Sdanielk1977   **
1684f42f25c2Sdanielk1977   ** A process may only obtain a PENDING lock after it has obtained a
168590ba3bd0Sdanielk1977   ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
168690ba3bd0Sdanielk1977   ** on the 'pending byte'. This ensures that no new SHARED locks can be
168790ba3bd0Sdanielk1977   ** obtained, but existing SHARED locks are allowed to persist. A process
168890ba3bd0Sdanielk1977   ** does not have to obtain a RESERVED lock on the way to a PENDING lock.
168990ba3bd0Sdanielk1977   ** This property is used by the algorithm for rolling back a journal file
169090ba3bd0Sdanielk1977   ** after a crash.
1691f42f25c2Sdanielk1977   **
169290ba3bd0Sdanielk1977   ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
169390ba3bd0Sdanielk1977   ** implemented by obtaining a write-lock on the entire 'shared byte
169490ba3bd0Sdanielk1977   ** range'. Since all other locks require a read-lock on one of the bytes
169590ba3bd0Sdanielk1977   ** within this range, this ensures that no other locks are held on the
169690ba3bd0Sdanielk1977   ** database.
1697f42f25c2Sdanielk1977   */
16989a1d0abeSdanielk1977   int rc = SQLITE_OK;
1699054889ecSdrh   unixFile *pFile = (unixFile*)id;
1700b07028f7Sdrh   unixInodeInfo *pInode;
1701bbd42a6dSdrh   struct flock lock;
1702383d30f4Sdrh   int tErrno = 0;
17039a1d0abeSdanielk1977 
1704054889ecSdrh   assert( pFile );
1705308c2a5cSdrh   OSTRACE(("LOCK    %d %s was %s(%s,%d) pid=%d (unix)\n", pFile->h,
1706308c2a5cSdrh       azFileLock(eFileLock), azFileLock(pFile->eFileLock),
170791eb93c7Sdrh       azFileLock(pFile->pInode->eFileLock), pFile->pInode->nShared,
17085ac93652Sdrh       osGetpid(0)));
17099a1d0abeSdanielk1977 
17109a1d0abeSdanielk1977   /* If there is already a lock of this type or more restrictive on the
1711ad94b58aSdanielk1977   ** unixFile, do nothing. Don't use the end_lock: exit path, as
17126c7d5c5bSdrh   ** unixEnterMutex() hasn't been called yet.
17139a1d0abeSdanielk1977   */
1714308c2a5cSdrh   if( pFile->eFileLock>=eFileLock ){
1715308c2a5cSdrh     OSTRACE(("LOCK    %d %s ok (already held) (unix)\n", pFile->h,
1716308c2a5cSdrh             azFileLock(eFileLock)));
17179a1d0abeSdanielk1977     return SQLITE_OK;
17189a1d0abeSdanielk1977   }
17199a1d0abeSdanielk1977 
17200c2694b7Sdrh   /* Make sure the locking sequence is correct.
17210c2694b7Sdrh   **  (1) We never move from unlocked to anything higher than shared lock.
17220c2694b7Sdrh   **  (2) SQLite never explicitly requests a pending lock.
17230c2694b7Sdrh   **  (3) A shared lock is always held when a reserve lock is requested.
17242ac3ee97Sdrh   */
1725308c2a5cSdrh   assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
1726308c2a5cSdrh   assert( eFileLock!=PENDING_LOCK );
1727308c2a5cSdrh   assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );
17282ac3ee97Sdrh 
17298af6c228Sdrh   /* This mutex is needed because pFile->pInode is shared across threads
1730b3e04346Sdrh   */
17318af6c228Sdrh   pInode = pFile->pInode;
1732da6dc240Sdrh   sqlite3_mutex_enter(pInode->pLockMutex);
1733029b44bdSdrh 
1734ad94b58aSdanielk1977   /* If some thread using this PID has a lock via a different unixFile*
17359a1d0abeSdanielk1977   ** handle that precludes the requested lock, return BUSY.
17369a1d0abeSdanielk1977   */
17378af6c228Sdrh   if( (pFile->eFileLock!=pInode->eFileLock &&
17388af6c228Sdrh           (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
17399a1d0abeSdanielk1977   ){
17409a1d0abeSdanielk1977     rc = SQLITE_BUSY;
17419a1d0abeSdanielk1977     goto end_lock;
17429a1d0abeSdanielk1977   }
17439a1d0abeSdanielk1977 
17449a1d0abeSdanielk1977   /* If a SHARED lock is requested, and some thread using this PID already
17459a1d0abeSdanielk1977   ** has a SHARED or RESERVED lock, then increment reference counts and
17469a1d0abeSdanielk1977   ** return SQLITE_OK.
17479a1d0abeSdanielk1977   */
1748308c2a5cSdrh   if( eFileLock==SHARED_LOCK &&
17498af6c228Sdrh       (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
1750308c2a5cSdrh     assert( eFileLock==SHARED_LOCK );
1751308c2a5cSdrh     assert( pFile->eFileLock==0 );
17528af6c228Sdrh     assert( pInode->nShared>0 );
1753308c2a5cSdrh     pFile->eFileLock = SHARED_LOCK;
17548af6c228Sdrh     pInode->nShared++;
17558af6c228Sdrh     pInode->nLock++;
17569a1d0abeSdanielk1977     goto end_lock;
17579a1d0abeSdanielk1977   }
17589a1d0abeSdanielk1977 
17599a1d0abeSdanielk1977 
17603cde3bb0Sdrh   /* A PENDING lock is needed before acquiring a SHARED lock and before
17613cde3bb0Sdrh   ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
17623cde3bb0Sdrh   ** be released.
17639a1d0abeSdanielk1977   */
17640c2694b7Sdrh   lock.l_len = 1L;
17650c2694b7Sdrh   lock.l_whence = SEEK_SET;
1766308c2a5cSdrh   if( eFileLock==SHARED_LOCK
1767308c2a5cSdrh       || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
17683cde3bb0Sdrh   ){
1769308c2a5cSdrh     lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
17702ac3ee97Sdrh     lock.l_start = PENDING_BYTE;
1771661d71afSdan     if( unixFileLock(pFile, &lock) ){
17720c2694b7Sdrh       tErrno = errno;
17735b1a2566Saswift       rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
1774661d71afSdan       if( rc!=SQLITE_BUSY ){
17754bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
17765b1a2566Saswift       }
17779a1d0abeSdanielk1977       goto end_lock;
17789a1d0abeSdanielk1977     }
17793cde3bb0Sdrh   }
17803cde3bb0Sdrh 
17813cde3bb0Sdrh 
17823cde3bb0Sdrh   /* If control gets to this point, then actually go ahead and make
17833cde3bb0Sdrh   ** operating system calls for the specified lock.
17843cde3bb0Sdrh   */
1785308c2a5cSdrh   if( eFileLock==SHARED_LOCK ){
17868af6c228Sdrh     assert( pInode->nShared==0 );
17878af6c228Sdrh     assert( pInode->eFileLock==0 );
1788661d71afSdan     assert( rc==SQLITE_OK );
17899a1d0abeSdanielk1977 
17902ac3ee97Sdrh     /* Now get the read-lock */
17917ed97b9dSdrh     lock.l_start = SHARED_FIRST;
17927ed97b9dSdrh     lock.l_len = SHARED_SIZE;
1793661d71afSdan     if( unixFileLock(pFile, &lock) ){
17947ed97b9dSdrh       tErrno = errno;
1795661d71afSdan       rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
17967ed97b9dSdrh     }
1797661d71afSdan 
17982ac3ee97Sdrh     /* Drop the temporary PENDING lock */
17992ac3ee97Sdrh     lock.l_start = PENDING_BYTE;
18002ac3ee97Sdrh     lock.l_len = 1L;
18019a1d0abeSdanielk1977     lock.l_type = F_UNLCK;
1802661d71afSdan     if( unixFileLock(pFile, &lock) && rc==SQLITE_OK ){
18035b1a2566Saswift       /* This could happen with a network mount */
18045b1a2566Saswift       tErrno = errno;
1805ea83bc61Sdan       rc = SQLITE_IOERR_UNLOCK;
1806661d71afSdan     }
1807661d71afSdan 
1808661d71afSdan     if( rc ){
1809661d71afSdan       if( rc!=SQLITE_BUSY ){
18104bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
18115b1a2566Saswift       }
18122b4b5962Sdrh       goto end_lock;
1813bbd42a6dSdrh     }else{
1814308c2a5cSdrh       pFile->eFileLock = SHARED_LOCK;
18158af6c228Sdrh       pInode->nLock++;
18168af6c228Sdrh       pInode->nShared = 1;
1817bbd42a6dSdrh     }
18188af6c228Sdrh   }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
18193cde3bb0Sdrh     /* We are trying for an exclusive lock but another thread in this
18203cde3bb0Sdrh     ** same process is still holding a shared lock. */
18213cde3bb0Sdrh     rc = SQLITE_BUSY;
1822bbd42a6dSdrh   }else{
18233cde3bb0Sdrh     /* The request was for a RESERVED or EXCLUSIVE lock.  It is
18249a1d0abeSdanielk1977     ** assumed that there is a SHARED or greater lock on the file
18259a1d0abeSdanielk1977     ** already.
18269a1d0abeSdanielk1977     */
1827308c2a5cSdrh     assert( 0!=pFile->eFileLock );
18289a1d0abeSdanielk1977     lock.l_type = F_WRLCK;
1829661d71afSdan 
1830661d71afSdan     assert( eFileLock==RESERVED_LOCK || eFileLock==EXCLUSIVE_LOCK );
1831661d71afSdan     if( eFileLock==RESERVED_LOCK ){
18322ac3ee97Sdrh       lock.l_start = RESERVED_BYTE;
1833661d71afSdan       lock.l_len = 1L;
1834661d71afSdan     }else{
18357ed97b9dSdrh       lock.l_start = SHARED_FIRST;
18367ed97b9dSdrh       lock.l_len = SHARED_SIZE;
1837bbd42a6dSdrh     }
1838661d71afSdan 
1839661d71afSdan     if( unixFileLock(pFile, &lock) ){
18407ed97b9dSdrh       tErrno = errno;
18415b1a2566Saswift       rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
1842661d71afSdan       if( rc!=SQLITE_BUSY ){
18434bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
18445b1a2566Saswift       }
18459a1d0abeSdanielk1977     }
18469a1d0abeSdanielk1977   }
18479a1d0abeSdanielk1977 
18488f941bc7Sdrh 
1849d3d8c04fSdrh #ifdef SQLITE_DEBUG
18508f941bc7Sdrh   /* Set up the transaction-counter change checking flags when
18518f941bc7Sdrh   ** transitioning from a SHARED to a RESERVED lock.  The change
18528f941bc7Sdrh   ** from SHARED to RESERVED marks the beginning of a normal
18538f941bc7Sdrh   ** write operation (not a hot journal rollback).
18548f941bc7Sdrh   */
18558f941bc7Sdrh   if( rc==SQLITE_OK
1856308c2a5cSdrh    && pFile->eFileLock<=SHARED_LOCK
1857308c2a5cSdrh    && eFileLock==RESERVED_LOCK
18588f941bc7Sdrh   ){
18598f941bc7Sdrh     pFile->transCntrChng = 0;
18608f941bc7Sdrh     pFile->dbUpdate = 0;
18618f941bc7Sdrh     pFile->inNormalWrite = 1;
18628f941bc7Sdrh   }
18638f941bc7Sdrh #endif
18648f941bc7Sdrh 
18658f941bc7Sdrh 
1866ecb2a964Sdanielk1977   if( rc==SQLITE_OK ){
1867308c2a5cSdrh     pFile->eFileLock = eFileLock;
18688af6c228Sdrh     pInode->eFileLock = eFileLock;
1869308c2a5cSdrh   }else if( eFileLock==EXCLUSIVE_LOCK ){
1870308c2a5cSdrh     pFile->eFileLock = PENDING_LOCK;
18718af6c228Sdrh     pInode->eFileLock = PENDING_LOCK;
1872ecb2a964Sdanielk1977   }
18739a1d0abeSdanielk1977 
18749a1d0abeSdanielk1977 end_lock:
1875da6dc240Sdrh   sqlite3_mutex_leave(pInode->pLockMutex);
1876308c2a5cSdrh   OSTRACE(("LOCK    %d %s %s (unix)\n", pFile->h, azFileLock(eFileLock),
1877308c2a5cSdrh       rc==SQLITE_OK ? "ok" : "failed"));
1878bbd42a6dSdrh   return rc;
1879bbd42a6dSdrh }
1880bbd42a6dSdrh 
1881bbd42a6dSdrh /*
188208da86a6Sdan ** Add the file descriptor used by file handle pFile to the corresponding
1883e946c396Sdan ** pUnused list.
188408da86a6Sdan */
setPendingFd(unixFile * pFile)188508da86a6Sdan static void setPendingFd(unixFile *pFile){
1886d91c68f6Sdrh   unixInodeInfo *pInode = pFile->pInode;
1887c68886bbSdrh   UnixUnusedFd *p = pFile->pPreallocatedUnused;
1888ef52b36aSdrh   assert( unixFileMutexHeld(pFile) );
18898af6c228Sdrh   p->pNext = pInode->pUnused;
18908af6c228Sdrh   pInode->pUnused = p;
189108da86a6Sdan   pFile->h = -1;
1892c68886bbSdrh   pFile->pPreallocatedUnused = 0;
189308da86a6Sdan }
189408da86a6Sdan 
/*
** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
**
** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
** the byte range is divided into 2 parts and the first part is unlocked then
** set to a read lock, then the other part is simply unlocked.  This works
** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to
** remove the write lock on a region when a read lock is set.
**
** Return SQLITE_OK on success.  On failure the result is
** SQLITE_IOERR_UNLOCK or SQLITE_IOERR_RDLOCK (or, in the Apple NFS branch,
** whatever sqliteErrorFromPosixError() derives from errno), and the errno
** value is recorded with storeLastErrno().
**
** All changes to the shared unixInodeInfo are made while holding
** pInode->pLockMutex.  pFile->eFileLock is updated after the mutex has
** been released, and only when the result is SQLITE_OK; the one exception
** is a failed final unlock in the NO_LOCK path, which forces both the
** inode and the file handle to NO_LOCK.
*/
static int posixUnlock(sqlite3_file *id, int eFileLock, int handleNFSUnlock){
  unixFile *pFile = (unixFile*)id;
  unixInodeInfo *pInode;
  struct flock lock;
  int rc = SQLITE_OK;

  assert( pFile );
  OSTRACE(("UNLOCK  %d %d was %d(%d,%d) pid=%d (unix)\n", pFile->h, eFileLock,
      pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
      osGetpid(0)));

  assert( eFileLock<=SHARED_LOCK );
  /* Nothing to do if the handle is already at or below the target level.
  ** This early exit happens before the inode mutex is taken. */
  if( pFile->eFileLock<=eFileLock ){
    return SQLITE_OK;
  }
  pInode = pFile->pInode;
  sqlite3_mutex_enter(pInode->pLockMutex);
  assert( pInode->nShared!=0 );
  if( pFile->eFileLock>SHARED_LOCK ){
    assert( pInode->eFileLock==pFile->eFileLock );

#ifdef SQLITE_DEBUG
    /* When reducing a lock such that other processes can start
    ** reading the database file again, make sure that the
    ** transaction counter was updated if any part of the database
    ** file changed.  If the transaction counter is not updated,
    ** other connections to the same file might not realize that
    ** the file has changed and hence might not know to flush their
    ** cache.  The use of a stale cache can lead to database corruption.
    */
    pFile->inNormalWrite = 0;
#endif

    /* downgrading to a shared lock on NFS involves clearing the write lock
    ** before establishing the readlock - to avoid a race condition we downgrade
    ** the lock in 2 blocks, so that part of the range will be covered by a
    ** write lock until the rest is covered by a read lock:
    **  1:   [WWWWW]
    **  2:   [....W]
    **  3:   [RRRRW]
    **  4:   [RRRR.]
    */
    if( eFileLock==SHARED_LOCK ){
#if !defined(__APPLE__) || !SQLITE_ENABLE_LOCKING_STYLE
      /* The NFS work-around is only compiled for Apple builds with
      ** SQLITE_ENABLE_LOCKING_STYLE; everywhere else the flag must be 0. */
      (void)handleNFSUnlock;
      assert( handleNFSUnlock==0 );
#endif
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
      if( handleNFSUnlock ){
        int tErrno;               /* Error code from system call errors */
        off_t divSize = SHARED_SIZE - 1;

        /* Step 2 of the diagram: unlock all but the last byte of the
        ** shared range. */
        lock.l_type = F_UNLCK;
        lock.l_whence = SEEK_SET;
        lock.l_start = SHARED_FIRST;
        lock.l_len = divSize;
        if( unixFileLock(pFile, &lock)==(-1) ){
          tErrno = errno;
          rc = SQLITE_IOERR_UNLOCK;
          storeLastErrno(pFile, tErrno);
          goto end_unlock;
        }
        /* Step 3: read-lock that same leading part of the range. */
        lock.l_type = F_RDLCK;
        lock.l_whence = SEEK_SET;
        lock.l_start = SHARED_FIRST;
        lock.l_len = divSize;
        if( unixFileLock(pFile, &lock)==(-1) ){
          tErrno = errno;
          rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_RDLOCK);
          if( IS_LOCK_ERROR(rc) ){
            storeLastErrno(pFile, tErrno);
          }
          goto end_unlock;
        }
        /* Step 4: release the remaining tail of the shared range. */
        lock.l_type = F_UNLCK;
        lock.l_whence = SEEK_SET;
        lock.l_start = SHARED_FIRST+divSize;
        lock.l_len = SHARED_SIZE-divSize;
        if( unixFileLock(pFile, &lock)==(-1) ){
          tErrno = errno;
          rc = SQLITE_IOERR_UNLOCK;
          storeLastErrno(pFile, tErrno);
          goto end_unlock;
        }
      }else
#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
      {
        /* Ordinary case: request a read lock over the whole shared range
        ** in a single call. */
        lock.l_type = F_RDLCK;
        lock.l_whence = SEEK_SET;
        lock.l_start = SHARED_FIRST;
        lock.l_len = SHARED_SIZE;
        if( unixFileLock(pFile, &lock) ){
          /* In theory, the call to unixFileLock() cannot fail because another
          ** process is holding an incompatible lock. If it does, this
          ** indicates that the other process is not following the locking
          ** protocol. If this happens, return SQLITE_IOERR_RDLOCK. Returning
          ** SQLITE_BUSY would confuse the upper layer (in practice it causes
          ** an assert to fail). */
          rc = SQLITE_IOERR_RDLOCK;
          storeLastErrno(pFile, errno);
          goto end_unlock;
        }
      }
    }
    /* Release the PENDING and RESERVED bytes.  They are adjacent (see the
    ** assert on the l_len line), so a single 2-byte unlock covers both. */
    lock.l_type = F_UNLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = PENDING_BYTE;
    lock.l_len = 2L;  assert( PENDING_BYTE+1==RESERVED_BYTE );
    if( unixFileLock(pFile, &lock)==0 ){
      pInode->eFileLock = SHARED_LOCK;
    }else{
      rc = SQLITE_IOERR_UNLOCK;
      storeLastErrno(pFile, errno);
      goto end_unlock;
    }
  }
  if( eFileLock==NO_LOCK ){
    /* Decrement the shared lock counter.  Release the lock using an
    ** OS call only when all threads in this same process have released
    ** the lock.
    */
    pInode->nShared--;
    if( pInode->nShared==0 ){
      /* NOTE(review): l_start==0 with l_len==0 presumably addresses the
      ** whole file per fcntl() range semantics - confirm. */
      lock.l_type = F_UNLCK;
      lock.l_whence = SEEK_SET;
      lock.l_start = lock.l_len = 0L;
      if( unixFileLock(pFile, &lock)==0 ){
        pInode->eFileLock = NO_LOCK;
      }else{
        /* The unlock failed, but the bookkeeping is still reset to NO_LOCK
        ** for both the inode and this handle.  The error is reported to
        ** the caller via rc. */
        rc = SQLITE_IOERR_UNLOCK;
        storeLastErrno(pFile, errno);
        pInode->eFileLock = NO_LOCK;
        pFile->eFileLock = NO_LOCK;
      }
    }

    /* Decrement the count of locks against this same file.  When the
    ** count reaches zero, close any other file descriptors whose close
    ** was deferred because of outstanding locks.
    */
    pInode->nLock--;
    assert( pInode->nLock>=0 );
    if( pInode->nLock==0 ) closePendingFds(pFile);
  }

end_unlock:
  sqlite3_mutex_leave(pInode->pLockMutex);
  /* Record the new level on the handle only if every step succeeded */
  if( rc==SQLITE_OK ){
    pFile->eFileLock = eFileLock;
  }
  return rc;
}
2060bbd42a6dSdrh 
2061bbd42a6dSdrh /*
2062308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
20637ed97b9dSdrh ** must be either NO_LOCK or SHARED_LOCK.
20647ed97b9dSdrh **
20657ed97b9dSdrh ** If the locking level of the file descriptor is already at or below
20667ed97b9dSdrh ** the requested locking level, this routine is a no-op.
20677ed97b9dSdrh */
unixUnlock(sqlite3_file * id,int eFileLock)2068308c2a5cSdrh static int unixUnlock(sqlite3_file *id, int eFileLock){
2069f52a4690Sdan #if SQLITE_MAX_MMAP_SIZE>0
2070a1afc742Sdan   assert( eFileLock==SHARED_LOCK || ((unixFile *)id)->nFetchOut==0 );
2071f52a4690Sdan #endif
2072a7e61d8bSdrh   return posixUnlock(id, eFileLock, 0);
20737ed97b9dSdrh }
20747ed97b9dSdrh 
/*
** Forward declarations of the memory-mapping helpers.  They are only
** compiled when SQLITE_MAX_MMAP_SIZE>0.  closeUnixFile() below calls
** unixUnmapfile() and therefore needs the prototype at this point.
*/
#if SQLITE_MAX_MMAP_SIZE>0
static int unixMapfile(unixFile *pFd, i64 nByte);
static void unixUnmapfile(unixFile *pFd);
#endif
2079f23da966Sdan 
20807ed97b9dSdrh /*
2081e339d65aSdanielk1977 ** This function performs the parts of the "close file" operation
2082e339d65aSdanielk1977 ** common to all locking schemes. It closes the directory and file
2083e339d65aSdanielk1977 ** handles, if they are valid, and sets all fields of the unixFile
2084e339d65aSdanielk1977 ** structure to 0.
20859b35ea62Sdrh **
20869b35ea62Sdrh ** It is *not* necessary to hold the mutex when this routine is called,
20879b35ea62Sdrh ** even on VxWorks.  A mutex will be acquired on VxWorks by the
20889b35ea62Sdrh ** vxworksReleaseFileId() routine.
2089e339d65aSdanielk1977 */
closeUnixFile(sqlite3_file * id)2090e339d65aSdanielk1977 static int closeUnixFile(sqlite3_file *id){
2091e339d65aSdanielk1977   unixFile *pFile = (unixFile*)id;
2092e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
2093f23da966Sdan   unixUnmapfile(pFile);
2094e98844f7Smistachkin #endif
2095e339d65aSdanielk1977   if( pFile->h>=0 ){
20960e9365ceSdrh     robust_close(pFile, pFile->h, __LINE__);
20970e9365ceSdrh     pFile->h = -1;
2098e339d65aSdanielk1977   }
20996c7d5c5bSdrh #if OS_VXWORKS
2100107886abSdrh   if( pFile->pId ){
2101c02a43afSdrh     if( pFile->ctrlFlags & UNIXFILE_DELETE ){
2102036ac7faSdrh       osUnlink(pFile->pId->zCanonicalName);
210397185489Schw     }
2104107886abSdrh     vxworksReleaseFileId(pFile->pId);
2105107886abSdrh     pFile->pId = 0;
210697185489Schw   }
210797185489Schw #endif
21080bdbc90dSdrh #ifdef SQLITE_UNLINK_AFTER_CLOSE
21090bdbc90dSdrh   if( pFile->ctrlFlags & UNIXFILE_DELETE ){
21100bdbc90dSdrh     osUnlink(pFile->zPath);
21110bdbc90dSdrh     sqlite3_free(*(char**)&pFile->zPath);
21120bdbc90dSdrh     pFile->zPath = 0;
21130bdbc90dSdrh   }
21140bdbc90dSdrh #endif
2115ff59a114Sdrh   OSTRACE(("CLOSE   %-3d\n", pFile->h));
2116e339d65aSdanielk1977   OpenCounter(-1);
2117c68886bbSdrh   sqlite3_free(pFile->pPreallocatedUnused);
2118ff59a114Sdrh   memset(pFile, 0, sizeof(unixFile));
2119e339d65aSdanielk1977   return SQLITE_OK;
2120e339d65aSdanielk1977 }
2121e339d65aSdanielk1977 
2122e339d65aSdanielk1977 /*
2123e3026636Sdanielk1977 ** Close a file.
2124e3026636Sdanielk1977 */
unixClose(sqlite3_file * id)212562079060Sdanielk1977 static int unixClose(sqlite3_file *id){
2126aebf413dSaswift   int rc = SQLITE_OK;
212762079060Sdanielk1977   unixFile *pFile = (unixFile *)id;
2128ef52b36aSdrh   unixInodeInfo *pInode = pFile->pInode;
2129ef52b36aSdrh 
2130ef52b36aSdrh   assert( pInode!=0 );
2131fbc7e884Sdrh   verifyDbFile(pFile);
213262079060Sdanielk1977   unixUnlock(id, NO_LOCK);
2133095908e1Sdrh   assert( unixFileMutexNotheld(pFile) );
21346c7d5c5bSdrh   unixEnterMutex();
2135661d71afSdan 
2136661d71afSdan   /* unixFile.pInode is always valid here. Otherwise, a different close
2137661d71afSdan   ** routine (e.g. nolockClose()) would be called instead.
2138661d71afSdan   */
2139661d71afSdan   assert( pFile->pInode->nLock>0 || pFile->pInode->bProcessLock==0 );
2140ef52b36aSdrh   sqlite3_mutex_enter(pInode->pLockMutex);
21413fcef1a3Sdrh   if( pInode->nLock ){
2142e3026636Sdanielk1977     /* If there are outstanding locks, do not actually close the file just
2143e3026636Sdanielk1977     ** yet because that would clear those locks.  Instead, add the file
21448af6c228Sdrh     ** descriptor to pInode->pUnused list.  It will be automatically closed
2145e946c396Sdan     ** when the last lock is cleared.
2146e3026636Sdanielk1977     */
214708da86a6Sdan     setPendingFd(pFile);
2148e3026636Sdanielk1977   }
2149ef52b36aSdrh   sqlite3_mutex_leave(pInode->pLockMutex);
2150b0ac3e3aSdan   releaseInodeInfo(pFile);
21512b06b076Sdan   assert( pFile->pShm==0 );
2152aebf413dSaswift   rc = closeUnixFile(id);
21536c7d5c5bSdrh   unixLeaveMutex();
2154aebf413dSaswift   return rc;
2155e3026636Sdanielk1977 }
2156e3026636Sdanielk1977 
2157734c9864Sdrh /************** End of the posix advisory lock implementation *****************
2158734c9864Sdrh ******************************************************************************/
2159bfe6631eSdrh 
2160734c9864Sdrh /******************************************************************************
2161734c9864Sdrh ****************************** No-op Locking **********************************
2162734c9864Sdrh **
2163734c9864Sdrh ** Of the various locking implementations available, this is by far the
2164734c9864Sdrh ** simplest:  locking is ignored.  No attempt is made to lock the database
2165734c9864Sdrh ** file for reading or writing.
2166734c9864Sdrh **
2167734c9864Sdrh ** This locking mode is appropriate for use on read-only databases
2168734c9864Sdrh ** (ex: databases that are burned into CD-ROM, for example.)  It can
2169734c9864Sdrh ** also be used if the application employs some external mechanism to
2170734c9864Sdrh ** prevent simultaneous access of the same database by two or more
2171734c9864Sdrh ** database connections.  But there is a serious risk of database
2172734c9864Sdrh ** corruption if this locking mode is used in situations where multiple
2173734c9864Sdrh ** database connections are accessing the same database file at the same
2174734c9864Sdrh ** time and one or more of those connections are writing.
2175734c9864Sdrh */
2176734c9864Sdrh 
/*
** No-op variation of CheckReservedLock(): the file handle is ignored,
** *pResOut is always set to 0 and SQLITE_OK is always returned.
*/
static int nolockCheckReservedLock(sqlite3_file *NotUsed, int *pResOut){
  UNUSED_PARAMETER(NotUsed);
  *pResOut = 0;
  return SQLITE_OK;
}
/*
** No-op lock routine: both arguments are ignored and SQLITE_OK is
** returned without touching the file.
*/
static int nolockLock(sqlite3_file *NotUsed, int NotUsed2){
  UNUSED_PARAMETER2(NotUsed, NotUsed2);
  return SQLITE_OK;
}
/*
** No-op unlock routine: both arguments are ignored and SQLITE_OK is
** returned without touching the file.
*/
static int nolockUnlock(sqlite3_file *NotUsed, int NotUsed2){
  UNUSED_PARAMETER2(NotUsed, NotUsed2);
  return SQLITE_OK;
}
2190734c9864Sdrh 
/*
** Close the file.  With no-op locking there is no lock state to release,
** so this simply forwards to closeUnixFile() and returns its result.
*/
static int nolockClose(sqlite3_file *id) {
  return closeUnixFile(id);
}
2197734c9864Sdrh 
2198734c9864Sdrh /******************* End of the no-op lock implementation *********************
2199734c9864Sdrh ******************************************************************************/
2200734c9864Sdrh 
2201734c9864Sdrh /******************************************************************************
2202734c9864Sdrh ************************* Begin dot-file Locking ******************************
2203734c9864Sdrh **
220448864df9Smistachkin ** The dotfile locking implementation uses the existence of separate lock
22059ef6bc42Sdrh ** files (really a directory) to control access to the database.  This works
22069ef6bc42Sdrh ** on just about every filesystem imaginable.  But there are serious downsides:
2207734c9864Sdrh **
2208734c9864Sdrh **    (1)  There is zero concurrency.  A single reader blocks all other
2209734c9864Sdrh **         connections from reading or writing the database.
2210734c9864Sdrh **
2211734c9864Sdrh **    (2)  An application crash or power loss can leave stale lock files
2212734c9864Sdrh **         sitting around that need to be cleared manually.
2213734c9864Sdrh **
2214734c9864Sdrh ** Nevertheless, a dotlock is an appropriate locking mode for use if no
2215734c9864Sdrh ** other locking strategy is available.
22167708e972Sdrh **
22179ef6bc42Sdrh ** Dotfile locking works by creating a subdirectory in the same directory as
22189ef6bc42Sdrh ** the database and with the same name but with a ".lock" extension added.
221948864df9Smistachkin ** The existence of a lock directory implies an EXCLUSIVE lock.  All other
22209ef6bc42Sdrh ** lock types (SHARED, RESERVED, PENDING) are mapped into EXCLUSIVE.
2221734c9864Sdrh */
2222734c9864Sdrh 
/*
** The file suffix added to the database filename in order to create the
** lock directory.
*/
#define DOTLOCK_SUFFIX ".lock"
2228734c9864Sdrh 
22297708e972Sdrh /*
22307708e972Sdrh ** This routine checks if there is a RESERVED lock held on the specified
22317708e972Sdrh ** file by this or any other process. If such a lock is held, set *pResOut
22327708e972Sdrh ** to a non-zero value otherwise *pResOut is set to zero.  The return value
22337708e972Sdrh ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
22347708e972Sdrh **
22357708e972Sdrh ** In dotfile locking, either a lock exists or it does not.  So in this
22367708e972Sdrh ** variation of CheckReservedLock(), *pResOut is set to true if any lock
22377708e972Sdrh ** is held on the file and false if the file is unlocked.
22387708e972Sdrh */
dotlockCheckReservedLock(sqlite3_file * id,int * pResOut)2239734c9864Sdrh static int dotlockCheckReservedLock(sqlite3_file *id, int *pResOut) {
2240734c9864Sdrh   int rc = SQLITE_OK;
2241734c9864Sdrh   int reserved = 0;
2242734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2243734c9864Sdrh 
2244734c9864Sdrh   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2245734c9864Sdrh 
2246734c9864Sdrh   assert( pFile );
2247a8de1e1cSdrh   reserved = osAccess((const char*)pFile->lockingContext, 0)==0;
2248308c2a5cSdrh   OSTRACE(("TEST WR-LOCK %d %d %d (dotlock)\n", pFile->h, rc, reserved));
2249734c9864Sdrh   *pResOut = reserved;
2250734c9864Sdrh   return rc;
2251734c9864Sdrh }
2252734c9864Sdrh 
22537708e972Sdrh /*
2254308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
22557708e972Sdrh ** of the following:
22567708e972Sdrh **
22577708e972Sdrh **     (1) SHARED_LOCK
22587708e972Sdrh **     (2) RESERVED_LOCK
22597708e972Sdrh **     (3) PENDING_LOCK
22607708e972Sdrh **     (4) EXCLUSIVE_LOCK
22617708e972Sdrh **
22627708e972Sdrh ** Sometimes when requesting one lock state, additional lock states
22637708e972Sdrh ** are inserted in between.  The locking might fail on one of the later
22647708e972Sdrh ** transitions leaving the lock state different from what it started but
22657708e972Sdrh ** still short of its goal.  The following chart shows the allowed
22667708e972Sdrh ** transitions and the inserted intermediate states:
22677708e972Sdrh **
22687708e972Sdrh **    UNLOCKED -> SHARED
22697708e972Sdrh **    SHARED -> RESERVED
22707708e972Sdrh **    SHARED -> (PENDING) -> EXCLUSIVE
22717708e972Sdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
22727708e972Sdrh **    PENDING -> EXCLUSIVE
22737708e972Sdrh **
22747708e972Sdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
22757708e972Sdrh ** routine to lower a locking level.
22767708e972Sdrh **
22777708e972Sdrh ** With dotfile locking, we really only support state (4): EXCLUSIVE.
22787708e972Sdrh ** But we track the other locking levels internally.
22797708e972Sdrh */
dotlockLock(sqlite3_file * id,int eFileLock)2280308c2a5cSdrh static int dotlockLock(sqlite3_file *id, int eFileLock) {
2281734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2282734c9864Sdrh   char *zLockFile = (char *)pFile->lockingContext;
2283734c9864Sdrh   int rc = SQLITE_OK;
2284734c9864Sdrh 
22857708e972Sdrh 
22867708e972Sdrh   /* If we have any lock, then the lock file already exists.  All we have
22877708e972Sdrh   ** to do is adjust our internal record of the lock level.
22887708e972Sdrh   */
2289308c2a5cSdrh   if( pFile->eFileLock > NO_LOCK ){
2290308c2a5cSdrh     pFile->eFileLock = eFileLock;
2291734c9864Sdrh     /* Always update the timestamp on the old file */
2292dbe4b88aSdrh #ifdef HAVE_UTIME
2293dbe4b88aSdrh     utime(zLockFile, NULL);
2294dbe4b88aSdrh #else
2295734c9864Sdrh     utimes(zLockFile, NULL);
2296734c9864Sdrh #endif
22977708e972Sdrh     return SQLITE_OK;
2298734c9864Sdrh   }
2299734c9864Sdrh 
2300734c9864Sdrh   /* grab an exclusive lock */
23019ef6bc42Sdrh   rc = osMkdir(zLockFile, 0777);
23029ef6bc42Sdrh   if( rc<0 ){
23039ef6bc42Sdrh     /* failed to open/create the lock directory */
2304734c9864Sdrh     int tErrno = errno;
2305734c9864Sdrh     if( EEXIST == tErrno ){
2306734c9864Sdrh       rc = SQLITE_BUSY;
2307734c9864Sdrh     } else {
2308734c9864Sdrh       rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
2309a8de1e1cSdrh       if( rc!=SQLITE_BUSY ){
23104bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
2311734c9864Sdrh       }
2312734c9864Sdrh     }
23137708e972Sdrh     return rc;
2314734c9864Sdrh   }
2315734c9864Sdrh 
2316734c9864Sdrh   /* got it, set the type and return ok */
2317308c2a5cSdrh   pFile->eFileLock = eFileLock;
2318734c9864Sdrh   return rc;
2319734c9864Sdrh }
2320734c9864Sdrh 
23217708e972Sdrh /*
2322308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
23237708e972Sdrh ** must be either NO_LOCK or SHARED_LOCK.
23247708e972Sdrh **
23257708e972Sdrh ** If the locking level of the file descriptor is already at or below
23267708e972Sdrh ** the requested locking level, this routine is a no-op.
23277708e972Sdrh **
23287708e972Sdrh ** When the locking level reaches NO_LOCK, delete the lock file.
23297708e972Sdrh */
dotlockUnlock(sqlite3_file * id,int eFileLock)2330308c2a5cSdrh static int dotlockUnlock(sqlite3_file *id, int eFileLock) {
2331734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2332734c9864Sdrh   char *zLockFile = (char *)pFile->lockingContext;
23339ef6bc42Sdrh   int rc;
2334734c9864Sdrh 
2335734c9864Sdrh   assert( pFile );
2336308c2a5cSdrh   OSTRACE(("UNLOCK  %d %d was %d pid=%d (dotlock)\n", pFile->h, eFileLock,
23375ac93652Sdrh            pFile->eFileLock, osGetpid(0)));
2338308c2a5cSdrh   assert( eFileLock<=SHARED_LOCK );
2339734c9864Sdrh 
2340734c9864Sdrh   /* no-op if possible */
2341308c2a5cSdrh   if( pFile->eFileLock==eFileLock ){
2342734c9864Sdrh     return SQLITE_OK;
2343734c9864Sdrh   }
2344734c9864Sdrh 
23457708e972Sdrh   /* To downgrade to shared, simply update our internal notion of the
23467708e972Sdrh   ** lock state.  No need to mess with the file on disk.
23477708e972Sdrh   */
2348308c2a5cSdrh   if( eFileLock==SHARED_LOCK ){
2349308c2a5cSdrh     pFile->eFileLock = SHARED_LOCK;
2350734c9864Sdrh     return SQLITE_OK;
2351734c9864Sdrh   }
2352734c9864Sdrh 
23537708e972Sdrh   /* To fully unlock the database, delete the lock file */
2354308c2a5cSdrh   assert( eFileLock==NO_LOCK );
23559ef6bc42Sdrh   rc = osRmdir(zLockFile);
23569ef6bc42Sdrh   if( rc<0 ){
23570d588bbcSdrh     int tErrno = errno;
2358a8de1e1cSdrh     if( tErrno==ENOENT ){
2359a8de1e1cSdrh       rc = SQLITE_OK;
2360a8de1e1cSdrh     }else{
2361ea83bc61Sdan       rc = SQLITE_IOERR_UNLOCK;
23624bf66fd6Sdrh       storeLastErrno(pFile, tErrno);
2363734c9864Sdrh     }
2364734c9864Sdrh     return rc;
2365734c9864Sdrh   }
2366308c2a5cSdrh   pFile->eFileLock = NO_LOCK;
2367734c9864Sdrh   return SQLITE_OK;
2368734c9864Sdrh }
2369734c9864Sdrh 
2370734c9864Sdrh /*
23719b35ea62Sdrh ** Close a file.  Make sure the lock has been released before closing.
2372734c9864Sdrh */
dotlockClose(sqlite3_file * id)2373734c9864Sdrh static int dotlockClose(sqlite3_file *id) {
2374734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2375a8de1e1cSdrh   assert( id!=0 );
2376734c9864Sdrh   dotlockUnlock(id, NO_LOCK);
2377734c9864Sdrh   sqlite3_free(pFile->lockingContext);
2378a8de1e1cSdrh   return closeUnixFile(id);
2379734c9864Sdrh }
2380734c9864Sdrh /****************** End of the dot-file lock implementation *******************
2381734c9864Sdrh ******************************************************************************/
2382734c9864Sdrh 
2383734c9864Sdrh /******************************************************************************
2384734c9864Sdrh ************************** Begin flock Locking ********************************
2385734c9864Sdrh **
2386734c9864Sdrh ** Use the flock() system call to do file locking.
2387734c9864Sdrh **
23886b9d6ddcSdrh ** flock() locking is like dot-file locking in that the various
23896b9d6ddcSdrh ** fine-grain locking levels supported by SQLite are collapsed into
23906b9d6ddcSdrh ** a single exclusive lock.  In other words, SHARED, RESERVED, and
23916b9d6ddcSdrh ** PENDING locks are the same thing as an EXCLUSIVE lock.  SQLite
23926b9d6ddcSdrh ** still works when you do this, but concurrency is reduced since
23936b9d6ddcSdrh ** only a single process can be reading the database at a time.
23946b9d6ddcSdrh **
2395e89b2918Sdrh ** Omit this section if SQLITE_ENABLE_LOCKING_STYLE is turned off
2396734c9864Sdrh */
2397e89b2918Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
2398734c9864Sdrh 
/*
** Retry flock() calls that fail with EINTR.
**
** The result of the first flock() call that either succeeds or fails with
** an errno other than EINTR is returned unchanged.  On platforms that do
** not define EINTR, robust_flock() is simply an alias for flock().
*/
#ifdef EINTR
static int robust_flock(int fd, int op){
  for(;;){
    int rc = flock(fd, op);
    if( rc>=0 || errno!=EINTR ) return rc;
  }
}
#else
# define robust_flock(a,b) flock(a,b)
#endif
2411ff81231eSdrh 
2412ff81231eSdrh 
2413ff81231eSdrh /*
24146b9d6ddcSdrh ** This routine checks if there is a RESERVED lock held on the specified
24156b9d6ddcSdrh ** file by this or any other process. If such a lock is held, set *pResOut
24166b9d6ddcSdrh ** to a non-zero value otherwise *pResOut is set to zero.  The return value
24176b9d6ddcSdrh ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
24186b9d6ddcSdrh */
flockCheckReservedLock(sqlite3_file * id,int * pResOut)2419734c9864Sdrh static int flockCheckReservedLock(sqlite3_file *id, int *pResOut){
2420734c9864Sdrh   int rc = SQLITE_OK;
2421734c9864Sdrh   int reserved = 0;
2422734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2423734c9864Sdrh 
2424734c9864Sdrh   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2425734c9864Sdrh 
2426734c9864Sdrh   assert( pFile );
2427734c9864Sdrh 
2428734c9864Sdrh   /* Check if a thread in this process holds such a lock */
2429308c2a5cSdrh   if( pFile->eFileLock>SHARED_LOCK ){
2430734c9864Sdrh     reserved = 1;
2431734c9864Sdrh   }
2432734c9864Sdrh 
2433734c9864Sdrh   /* Otherwise see if some other process holds it. */
2434734c9864Sdrh   if( !reserved ){
2435734c9864Sdrh     /* attempt to get the lock */
2436ff81231eSdrh     int lrc = robust_flock(pFile->h, LOCK_EX | LOCK_NB);
2437734c9864Sdrh     if( !lrc ){
2438734c9864Sdrh       /* got the lock, unlock it */
2439ff81231eSdrh       lrc = robust_flock(pFile->h, LOCK_UN);
2440734c9864Sdrh       if ( lrc ) {
2441734c9864Sdrh         int tErrno = errno;
2442734c9864Sdrh         /* unlock failed with an error */
2443ea83bc61Sdan         lrc = SQLITE_IOERR_UNLOCK;
24444bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
2445734c9864Sdrh         rc = lrc;
2446734c9864Sdrh       }
2447734c9864Sdrh     } else {
2448734c9864Sdrh       int tErrno = errno;
2449734c9864Sdrh       reserved = 1;
2450734c9864Sdrh       /* someone else might have it reserved */
2451734c9864Sdrh       lrc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
2452734c9864Sdrh       if( IS_LOCK_ERROR(lrc) ){
24534bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
2454734c9864Sdrh         rc = lrc;
2455734c9864Sdrh       }
2456734c9864Sdrh     }
2457734c9864Sdrh   }
2458308c2a5cSdrh   OSTRACE(("TEST WR-LOCK %d %d %d (flock)\n", pFile->h, rc, reserved));
2459734c9864Sdrh 
2460734c9864Sdrh #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
24612e233810Sdrh   if( (rc & 0xff) == SQLITE_IOERR ){
2462734c9864Sdrh     rc = SQLITE_OK;
2463734c9864Sdrh     reserved=1;
2464734c9864Sdrh   }
2465734c9864Sdrh #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2466734c9864Sdrh   *pResOut = reserved;
2467734c9864Sdrh   return rc;
2468734c9864Sdrh }
2469734c9864Sdrh 
24706b9d6ddcSdrh /*
2471308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
24726b9d6ddcSdrh ** of the following:
24736b9d6ddcSdrh **
24746b9d6ddcSdrh **     (1) SHARED_LOCK
24756b9d6ddcSdrh **     (2) RESERVED_LOCK
24766b9d6ddcSdrh **     (3) PENDING_LOCK
24776b9d6ddcSdrh **     (4) EXCLUSIVE_LOCK
24786b9d6ddcSdrh **
24796b9d6ddcSdrh ** Sometimes when requesting one lock state, additional lock states
24806b9d6ddcSdrh ** are inserted in between.  The locking might fail on one of the later
24816b9d6ddcSdrh ** transitions leaving the lock state different from what it started but
24826b9d6ddcSdrh ** still short of its goal.  The following chart shows the allowed
24836b9d6ddcSdrh ** transitions and the inserted intermediate states:
24846b9d6ddcSdrh **
24856b9d6ddcSdrh **    UNLOCKED -> SHARED
24866b9d6ddcSdrh **    SHARED -> RESERVED
24876b9d6ddcSdrh **    SHARED -> (PENDING) -> EXCLUSIVE
24886b9d6ddcSdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
24896b9d6ddcSdrh **    PENDING -> EXCLUSIVE
24906b9d6ddcSdrh **
24916b9d6ddcSdrh ** flock() only really support EXCLUSIVE locks.  We track intermediate
24926b9d6ddcSdrh ** lock states in the sqlite3_file structure, but all locks SHARED or
24936b9d6ddcSdrh ** above are really EXCLUSIVE locks and exclude all other processes from
24946b9d6ddcSdrh ** access the file.
24956b9d6ddcSdrh **
24966b9d6ddcSdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
24976b9d6ddcSdrh ** routine to lower a locking level.
24986b9d6ddcSdrh */
flockLock(sqlite3_file * id,int eFileLock)2499308c2a5cSdrh static int flockLock(sqlite3_file *id, int eFileLock) {
2500734c9864Sdrh   int rc = SQLITE_OK;
2501734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2502734c9864Sdrh 
2503734c9864Sdrh   assert( pFile );
2504734c9864Sdrh 
2505734c9864Sdrh   /* if we already have a lock, it is exclusive.
2506734c9864Sdrh   ** Just adjust level and punt on outta here. */
2507308c2a5cSdrh   if (pFile->eFileLock > NO_LOCK) {
2508308c2a5cSdrh     pFile->eFileLock = eFileLock;
2509734c9864Sdrh     return SQLITE_OK;
2510734c9864Sdrh   }
2511734c9864Sdrh 
2512734c9864Sdrh   /* grab an exclusive lock */
2513734c9864Sdrh 
2514ff81231eSdrh   if (robust_flock(pFile->h, LOCK_EX | LOCK_NB)) {
2515734c9864Sdrh     int tErrno = errno;
2516734c9864Sdrh     /* didn't get, must be busy */
2517734c9864Sdrh     rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
2518734c9864Sdrh     if( IS_LOCK_ERROR(rc) ){
25194bf66fd6Sdrh       storeLastErrno(pFile, tErrno);
2520734c9864Sdrh     }
2521734c9864Sdrh   } else {
2522734c9864Sdrh     /* got it, set the type and return ok */
2523308c2a5cSdrh     pFile->eFileLock = eFileLock;
2524734c9864Sdrh   }
2525308c2a5cSdrh   OSTRACE(("LOCK    %d %s %s (flock)\n", pFile->h, azFileLock(eFileLock),
2526308c2a5cSdrh            rc==SQLITE_OK ? "ok" : "failed"));
2527734c9864Sdrh #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
25282e233810Sdrh   if( (rc & 0xff) == SQLITE_IOERR ){
2529734c9864Sdrh     rc = SQLITE_BUSY;
2530734c9864Sdrh   }
2531734c9864Sdrh #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2532734c9864Sdrh   return rc;
2533734c9864Sdrh }
2534734c9864Sdrh 
25356b9d6ddcSdrh 
25366b9d6ddcSdrh /*
2537308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
25386b9d6ddcSdrh ** must be either NO_LOCK or SHARED_LOCK.
25396b9d6ddcSdrh **
25406b9d6ddcSdrh ** If the locking level of the file descriptor is already at or below
25416b9d6ddcSdrh ** the requested locking level, this routine is a no-op.
25426b9d6ddcSdrh */
flockUnlock(sqlite3_file * id,int eFileLock)2543308c2a5cSdrh static int flockUnlock(sqlite3_file *id, int eFileLock) {
2544734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2545734c9864Sdrh 
2546734c9864Sdrh   assert( pFile );
2547308c2a5cSdrh   OSTRACE(("UNLOCK  %d %d was %d pid=%d (flock)\n", pFile->h, eFileLock,
25485ac93652Sdrh            pFile->eFileLock, osGetpid(0)));
2549308c2a5cSdrh   assert( eFileLock<=SHARED_LOCK );
2550734c9864Sdrh 
2551734c9864Sdrh   /* no-op if possible */
2552308c2a5cSdrh   if( pFile->eFileLock==eFileLock ){
2553734c9864Sdrh     return SQLITE_OK;
2554734c9864Sdrh   }
2555734c9864Sdrh 
2556734c9864Sdrh   /* shared can just be set because we always have an exclusive */
2557308c2a5cSdrh   if (eFileLock==SHARED_LOCK) {
2558308c2a5cSdrh     pFile->eFileLock = eFileLock;
2559734c9864Sdrh     return SQLITE_OK;
2560734c9864Sdrh   }
2561734c9864Sdrh 
2562734c9864Sdrh   /* no, really, unlock. */
2563ea83bc61Sdan   if( robust_flock(pFile->h, LOCK_UN) ){
2564734c9864Sdrh #ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
2565ea83bc61Sdan     return SQLITE_OK;
2566734c9864Sdrh #endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
2567ea83bc61Sdan     return SQLITE_IOERR_UNLOCK;
2568734c9864Sdrh   }else{
2569308c2a5cSdrh     pFile->eFileLock = NO_LOCK;
2570734c9864Sdrh     return SQLITE_OK;
2571734c9864Sdrh   }
2572734c9864Sdrh }
2573734c9864Sdrh 
2574734c9864Sdrh /*
2575734c9864Sdrh ** Close a file.
2576734c9864Sdrh */
flockClose(sqlite3_file * id)2577734c9864Sdrh static int flockClose(sqlite3_file *id) {
2578a8de1e1cSdrh   assert( id!=0 );
2579734c9864Sdrh   flockUnlock(id, NO_LOCK);
2580a8de1e1cSdrh   return closeUnixFile(id);
2581734c9864Sdrh }
2582734c9864Sdrh 
2583734c9864Sdrh #endif /* SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORK */
2584734c9864Sdrh 
2585734c9864Sdrh /******************* End of the flock lock implementation *********************
2586734c9864Sdrh ******************************************************************************/
2587734c9864Sdrh 
2588734c9864Sdrh /******************************************************************************
2589734c9864Sdrh ************************ Begin Named Semaphore Locking ************************
2590734c9864Sdrh **
2591734c9864Sdrh ** Named semaphore locking is only supported on VxWorks.
25926b9d6ddcSdrh **
25936b9d6ddcSdrh ** Semaphore locking is like dot-lock and flock in that it really only
25946b9d6ddcSdrh ** supports EXCLUSIVE locking.  Only a single process can read or write
25956b9d6ddcSdrh ** the database file at a time.  This reduces potential concurrency, but
25966b9d6ddcSdrh ** makes the lock implementation much easier.
2597734c9864Sdrh */
2598734c9864Sdrh #if OS_VXWORKS
2599734c9864Sdrh 
26006b9d6ddcSdrh /*
26016b9d6ddcSdrh ** This routine checks if there is a RESERVED lock held on the specified
26026b9d6ddcSdrh ** file by this or any other process. If such a lock is held, set *pResOut
26036b9d6ddcSdrh ** to a non-zero value otherwise *pResOut is set to zero.  The return value
26046b9d6ddcSdrh ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
26056b9d6ddcSdrh */
semXCheckReservedLock(sqlite3_file * id,int * pResOut)26068cd5b254Sdrh static int semXCheckReservedLock(sqlite3_file *id, int *pResOut) {
2607734c9864Sdrh   int rc = SQLITE_OK;
2608734c9864Sdrh   int reserved = 0;
2609734c9864Sdrh   unixFile *pFile = (unixFile*)id;
2610734c9864Sdrh 
2611734c9864Sdrh   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
2612734c9864Sdrh 
2613734c9864Sdrh   assert( pFile );
2614734c9864Sdrh 
2615734c9864Sdrh   /* Check if a thread in this process holds such a lock */
2616308c2a5cSdrh   if( pFile->eFileLock>SHARED_LOCK ){
2617734c9864Sdrh     reserved = 1;
2618734c9864Sdrh   }
2619734c9864Sdrh 
2620734c9864Sdrh   /* Otherwise see if some other process holds it. */
2621734c9864Sdrh   if( !reserved ){
26228af6c228Sdrh     sem_t *pSem = pFile->pInode->pSem;
2623734c9864Sdrh 
2624734c9864Sdrh     if( sem_trywait(pSem)==-1 ){
2625734c9864Sdrh       int tErrno = errno;
2626734c9864Sdrh       if( EAGAIN != tErrno ){
2627734c9864Sdrh         rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_CHECKRESERVEDLOCK);
26284bf66fd6Sdrh         storeLastErrno(pFile, tErrno);
2629734c9864Sdrh       } else {
2630734c9864Sdrh         /* someone else has the lock when we are in NO_LOCK */
2631308c2a5cSdrh         reserved = (pFile->eFileLock < SHARED_LOCK);
2632734c9864Sdrh       }
2633734c9864Sdrh     }else{
2634734c9864Sdrh       /* we could have it if we want it */
2635734c9864Sdrh       sem_post(pSem);
2636734c9864Sdrh     }
2637734c9864Sdrh   }
2638308c2a5cSdrh   OSTRACE(("TEST WR-LOCK %d %d %d (sem)\n", pFile->h, rc, reserved));
2639734c9864Sdrh 
2640734c9864Sdrh   *pResOut = reserved;
2641734c9864Sdrh   return rc;
2642734c9864Sdrh }
2643734c9864Sdrh 
26446b9d6ddcSdrh /*
2645308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
26466b9d6ddcSdrh ** of the following:
26476b9d6ddcSdrh **
26486b9d6ddcSdrh **     (1) SHARED_LOCK
26496b9d6ddcSdrh **     (2) RESERVED_LOCK
26506b9d6ddcSdrh **     (3) PENDING_LOCK
26516b9d6ddcSdrh **     (4) EXCLUSIVE_LOCK
26526b9d6ddcSdrh **
26536b9d6ddcSdrh ** Sometimes when requesting one lock state, additional lock states
26546b9d6ddcSdrh ** are inserted in between.  The locking might fail on one of the later
26556b9d6ddcSdrh ** transitions leaving the lock state different from what it started but
26566b9d6ddcSdrh ** still short of its goal.  The following chart shows the allowed
26576b9d6ddcSdrh ** transitions and the inserted intermediate states:
26586b9d6ddcSdrh **
26596b9d6ddcSdrh **    UNLOCKED -> SHARED
26606b9d6ddcSdrh **    SHARED -> RESERVED
26616b9d6ddcSdrh **    SHARED -> (PENDING) -> EXCLUSIVE
26626b9d6ddcSdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
26636b9d6ddcSdrh **    PENDING -> EXCLUSIVE
26646b9d6ddcSdrh **
26656b9d6ddcSdrh ** Semaphore locks only really support EXCLUSIVE locks.  We track intermediate
26666b9d6ddcSdrh ** lock states in the sqlite3_file structure, but all locks SHARED or
26676b9d6ddcSdrh ** above are really EXCLUSIVE locks and exclude all other processes from
26686b9d6ddcSdrh ** access the file.
26696b9d6ddcSdrh **
26706b9d6ddcSdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
26716b9d6ddcSdrh ** routine to lower a locking level.
26726b9d6ddcSdrh */
semXLock(sqlite3_file * id,int eFileLock)26738cd5b254Sdrh static int semXLock(sqlite3_file *id, int eFileLock) {
2674734c9864Sdrh   unixFile *pFile = (unixFile*)id;
26758af6c228Sdrh   sem_t *pSem = pFile->pInode->pSem;
2676734c9864Sdrh   int rc = SQLITE_OK;
2677734c9864Sdrh 
2678734c9864Sdrh   /* if we already have a lock, it is exclusive.
2679734c9864Sdrh   ** Just adjust level and punt on outta here. */
2680308c2a5cSdrh   if (pFile->eFileLock > NO_LOCK) {
2681308c2a5cSdrh     pFile->eFileLock = eFileLock;
2682734c9864Sdrh     rc = SQLITE_OK;
2683734c9864Sdrh     goto sem_end_lock;
2684734c9864Sdrh   }
2685734c9864Sdrh 
2686734c9864Sdrh   /* lock semaphore now but bail out when already locked. */
2687734c9864Sdrh   if( sem_trywait(pSem)==-1 ){
2688734c9864Sdrh     rc = SQLITE_BUSY;
2689734c9864Sdrh     goto sem_end_lock;
2690734c9864Sdrh   }
2691734c9864Sdrh 
2692734c9864Sdrh   /* got it, set the type and return ok */
2693308c2a5cSdrh   pFile->eFileLock = eFileLock;
2694734c9864Sdrh 
2695734c9864Sdrh  sem_end_lock:
2696734c9864Sdrh   return rc;
2697734c9864Sdrh }
2698734c9864Sdrh 
26996b9d6ddcSdrh /*
2700308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
27016b9d6ddcSdrh ** must be either NO_LOCK or SHARED_LOCK.
27026b9d6ddcSdrh **
27036b9d6ddcSdrh ** If the locking level of the file descriptor is already at or below
27046b9d6ddcSdrh ** the requested locking level, this routine is a no-op.
27056b9d6ddcSdrh */
semXUnlock(sqlite3_file * id,int eFileLock)27068cd5b254Sdrh static int semXUnlock(sqlite3_file *id, int eFileLock) {
2707734c9864Sdrh   unixFile *pFile = (unixFile*)id;
27088af6c228Sdrh   sem_t *pSem = pFile->pInode->pSem;
2709734c9864Sdrh 
2710734c9864Sdrh   assert( pFile );
2711734c9864Sdrh   assert( pSem );
2712308c2a5cSdrh   OSTRACE(("UNLOCK  %d %d was %d pid=%d (sem)\n", pFile->h, eFileLock,
27135ac93652Sdrh            pFile->eFileLock, osGetpid(0)));
2714308c2a5cSdrh   assert( eFileLock<=SHARED_LOCK );
2715734c9864Sdrh 
2716734c9864Sdrh   /* no-op if possible */
2717308c2a5cSdrh   if( pFile->eFileLock==eFileLock ){
2718734c9864Sdrh     return SQLITE_OK;
2719734c9864Sdrh   }
2720734c9864Sdrh 
2721734c9864Sdrh   /* shared can just be set because we always have an exclusive */
2722308c2a5cSdrh   if (eFileLock==SHARED_LOCK) {
2723308c2a5cSdrh     pFile->eFileLock = eFileLock;
2724734c9864Sdrh     return SQLITE_OK;
2725734c9864Sdrh   }
2726734c9864Sdrh 
2727734c9864Sdrh   /* no, really unlock. */
2728734c9864Sdrh   if ( sem_post(pSem)==-1 ) {
2729734c9864Sdrh     int rc, tErrno = errno;
2730734c9864Sdrh     rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_UNLOCK);
2731734c9864Sdrh     if( IS_LOCK_ERROR(rc) ){
27324bf66fd6Sdrh       storeLastErrno(pFile, tErrno);
2733734c9864Sdrh     }
2734734c9864Sdrh     return rc;
2735734c9864Sdrh   }
2736308c2a5cSdrh   pFile->eFileLock = NO_LOCK;
2737734c9864Sdrh   return SQLITE_OK;
2738734c9864Sdrh }
2739734c9864Sdrh 
2740734c9864Sdrh /*
2741734c9864Sdrh  ** Close a file.
2742734c9864Sdrh  */
semXClose(sqlite3_file * id)27438cd5b254Sdrh static int semXClose(sqlite3_file *id) {
2744734c9864Sdrh   if( id ){
2745734c9864Sdrh     unixFile *pFile = (unixFile*)id;
27468cd5b254Sdrh     semXUnlock(id, NO_LOCK);
2747734c9864Sdrh     assert( pFile );
2748095908e1Sdrh     assert( unixFileMutexNotheld(pFile) );
2749734c9864Sdrh     unixEnterMutex();
2750b0ac3e3aSdan     releaseInodeInfo(pFile);
2751734c9864Sdrh     unixLeaveMutex();
275278a1318bSchw     closeUnixFile(id);
2753734c9864Sdrh   }
2754734c9864Sdrh   return SQLITE_OK;
2755734c9864Sdrh }
2756734c9864Sdrh 
2757734c9864Sdrh #endif /* OS_VXWORKS */
2758734c9864Sdrh /*
2759734c9864Sdrh ** Named semaphore locking is only available on VxWorks.
2760734c9864Sdrh **
2761734c9864Sdrh *************** End of the named semaphore lock implementation ****************
2762734c9864Sdrh ******************************************************************************/
2763734c9864Sdrh 
2764734c9864Sdrh 
2765734c9864Sdrh /******************************************************************************
2766734c9864Sdrh *************************** Begin AFP Locking *********************************
2767734c9864Sdrh **
2768734c9864Sdrh ** AFP is the Apple Filing Protocol.  AFP is a network filesystem found
2769734c9864Sdrh ** on Apple Macintosh computers - both OS9 and OSX.
2770734c9864Sdrh **
2771734c9864Sdrh ** Third-party implementations of AFP are available.  But this code here
2772734c9864Sdrh ** only works on OSX.
2773734c9864Sdrh */
2774734c9864Sdrh 
2775d2cb50b7Sdrh #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** The afpLockingContext structure contains all afp lock specific state.
** A pointer to it is kept in the lockingContext field of the unixFile.
*/
typedef struct afpLockingContext afpLockingContext;
struct afpLockingContext {
  int reserved;        /* Set by afpLock() once the RESERVED byte is locked;
                       ** afpCheckReservedLock() answers "yes" when non-zero */
  const char *dbPath;  /* Name of the open file */
};

/*
** Parameter block handed to fsctl() together with the
** afpfsByteRangeLock2FSCTL request in order to lock or unlock a byte
** range on an AFP filesystem.
*/
struct ByteRangeLockPB2 {
  unsigned long long offset;        /* offset to first byte to lock */
  unsigned long long length;        /* nbr of bytes to lock */
  unsigned long long retRangeStart; /* nbr of 1st byte locked if successful */
  unsigned char unLockFlag;         /* 1 = unlock, 0 = lock */
  unsigned char startEndFlag;       /* 1=rel to end of fork, 0=rel to start */
  int fd;                           /* file desc to assoc this lock with */
};

/* fsctl() request code for the byte-range lock operation */
#define afpfsByteRangeLock2FSCTL        _IOWR('z', 23, struct ByteRangeLockPB2)
2796bfe6631eSdrh 
2797ad94b58aSdanielk1977 /*
27986b9d6ddcSdrh ** This is a utility for setting or clearing a bit-range lock on an
27996b9d6ddcSdrh ** AFP filesystem.
28006b9d6ddcSdrh **
28015b1a2566Saswift ** Return SQLITE_OK on success, SQLITE_BUSY on failure.
2802ad94b58aSdanielk1977 */
afpSetLock(const char * path,unixFile * pFile,unsigned long long offset,unsigned long long length,int setLockFlag)28036b9d6ddcSdrh static int afpSetLock(
28046b9d6ddcSdrh   const char *path,              /* Name of the file to be locked or unlocked */
28056b9d6ddcSdrh   unixFile *pFile,               /* Open file descriptor on path */
28066b9d6ddcSdrh   unsigned long long offset,     /* First byte to be locked */
28076b9d6ddcSdrh   unsigned long long length,     /* Number of bytes to lock */
28086b9d6ddcSdrh   int setLockFlag                /* True to set lock.  False to clear lock */
2809ad94b58aSdanielk1977 ){
2810bfe6631eSdrh   struct ByteRangeLockPB2 pb;
2811bfe6631eSdrh   int err;
2812bfe6631eSdrh 
2813bfe6631eSdrh   pb.unLockFlag = setLockFlag ? 0 : 1;
2814bfe6631eSdrh   pb.startEndFlag = 0;
2815bfe6631eSdrh   pb.offset = offset;
2816bfe6631eSdrh   pb.length = length;
28175b1a2566Saswift   pb.fd = pFile->h;
2818aebf413dSaswift 
2819308c2a5cSdrh   OSTRACE(("AFPSETLOCK [%s] for %d%s in range %llx:%llx\n",
2820734c9864Sdrh     (setLockFlag?"ON":"OFF"), pFile->h, (pb.fd==-1?"[testval-1]":""),
2821308c2a5cSdrh     offset, length));
2822bfe6631eSdrh   err = fsctl(path, afpfsByteRangeLock2FSCTL, &pb, 0);
2823bfe6631eSdrh   if ( err==-1 ) {
28245b1a2566Saswift     int rc;
28255b1a2566Saswift     int tErrno = errno;
2826308c2a5cSdrh     OSTRACE(("AFPSETLOCK failed to fsctl() '%s' %d %s\n",
2827308c2a5cSdrh              path, tErrno, strerror(tErrno)));
2828aebf413dSaswift #ifdef SQLITE_IGNORE_AFP_LOCK_ERRORS
2829aebf413dSaswift     rc = SQLITE_BUSY;
2830aebf413dSaswift #else
2831734c9864Sdrh     rc = sqliteErrorFromPosixError(tErrno,
2832734c9864Sdrh                     setLockFlag ? SQLITE_IOERR_LOCK : SQLITE_IOERR_UNLOCK);
2833aebf413dSaswift #endif /* SQLITE_IGNORE_AFP_LOCK_ERRORS */
28345b1a2566Saswift     if( IS_LOCK_ERROR(rc) ){
28354bf66fd6Sdrh       storeLastErrno(pFile, tErrno);
28365b1a2566Saswift     }
28375b1a2566Saswift     return rc;
2838bfe6631eSdrh   } else {
28395b1a2566Saswift     return SQLITE_OK;
2840bfe6631eSdrh   }
2841bfe6631eSdrh }
2842bfe6631eSdrh 
28436b9d6ddcSdrh /*
28446b9d6ddcSdrh ** This routine checks if there is a RESERVED lock held on the specified
28456b9d6ddcSdrh ** file by this or any other process. If such a lock is held, set *pResOut
28466b9d6ddcSdrh ** to a non-zero value otherwise *pResOut is set to zero.  The return value
28476b9d6ddcSdrh ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
28486b9d6ddcSdrh */
afpCheckReservedLock(sqlite3_file * id,int * pResOut)2849e339d65aSdanielk1977 static int afpCheckReservedLock(sqlite3_file *id, int *pResOut){
28505b1a2566Saswift   int rc = SQLITE_OK;
28515b1a2566Saswift   int reserved = 0;
2852bfe6631eSdrh   unixFile *pFile = (unixFile*)id;
28533d4435b2Sdrh   afpLockingContext *context;
2854bfe6631eSdrh 
28555b1a2566Saswift   SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
28565b1a2566Saswift 
2857bfe6631eSdrh   assert( pFile );
28583d4435b2Sdrh   context = (afpLockingContext *) pFile->lockingContext;
28597ed97b9dSdrh   if( context->reserved ){
28607ed97b9dSdrh     *pResOut = 1;
28617ed97b9dSdrh     return SQLITE_OK;
28627ed97b9dSdrh   }
2863da6dc240Sdrh   sqlite3_mutex_enter(pFile->pInode->pLockMutex);
2864bfe6631eSdrh   /* Check if a thread in this process holds such a lock */
28658af6c228Sdrh   if( pFile->pInode->eFileLock>SHARED_LOCK ){
28665b1a2566Saswift     reserved = 1;
2867bfe6631eSdrh   }
2868bfe6631eSdrh 
2869bfe6631eSdrh   /* Otherwise see if some other process holds it.
2870bfe6631eSdrh    */
28715b1a2566Saswift   if( !reserved ){
28725b1a2566Saswift     /* lock the RESERVED byte */
28736b9d6ddcSdrh     int lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
28745b1a2566Saswift     if( SQLITE_OK==lrc ){
2875bfe6631eSdrh       /* if we succeeded in taking the reserved lock, unlock it to restore
2876bfe6631eSdrh       ** the original state */
28776b9d6ddcSdrh       lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
28785b1a2566Saswift     } else {
28795b1a2566Saswift       /* if we failed to get the lock then someone else must have it */
28805b1a2566Saswift       reserved = 1;
28815b1a2566Saswift     }
28825b1a2566Saswift     if( IS_LOCK_ERROR(lrc) ){
28835b1a2566Saswift       rc=lrc;
2884bfe6631eSdrh     }
2885bfe6631eSdrh   }
2886bfe6631eSdrh 
2887da6dc240Sdrh   sqlite3_mutex_leave(pFile->pInode->pLockMutex);
2888308c2a5cSdrh   OSTRACE(("TEST WR-LOCK %d %d %d (afp)\n", pFile->h, rc, reserved));
28895b1a2566Saswift 
28905b1a2566Saswift   *pResOut = reserved;
28915b1a2566Saswift   return rc;
2892bfe6631eSdrh }
2893bfe6631eSdrh 
28946b9d6ddcSdrh /*
2895308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
28966b9d6ddcSdrh ** of the following:
28976b9d6ddcSdrh **
28986b9d6ddcSdrh **     (1) SHARED_LOCK
28996b9d6ddcSdrh **     (2) RESERVED_LOCK
29006b9d6ddcSdrh **     (3) PENDING_LOCK
29016b9d6ddcSdrh **     (4) EXCLUSIVE_LOCK
29026b9d6ddcSdrh **
29036b9d6ddcSdrh ** Sometimes when requesting one lock state, additional lock states
29046b9d6ddcSdrh ** are inserted in between.  The locking might fail on one of the later
29056b9d6ddcSdrh ** transitions leaving the lock state different from what it started but
29066b9d6ddcSdrh ** still short of its goal.  The following chart shows the allowed
29076b9d6ddcSdrh ** transitions and the inserted intermediate states:
29086b9d6ddcSdrh **
29096b9d6ddcSdrh **    UNLOCKED -> SHARED
29106b9d6ddcSdrh **    SHARED -> RESERVED
29116b9d6ddcSdrh **    SHARED -> (PENDING) -> EXCLUSIVE
29126b9d6ddcSdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
29136b9d6ddcSdrh **    PENDING -> EXCLUSIVE
29146b9d6ddcSdrh **
29156b9d6ddcSdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
29166b9d6ddcSdrh ** routine to lower a locking level.
29176b9d6ddcSdrh */
afpLock(sqlite3_file * id,int eFileLock)2918308c2a5cSdrh static int afpLock(sqlite3_file *id, int eFileLock){
2919bfe6631eSdrh   int rc = SQLITE_OK;
2920bfe6631eSdrh   unixFile *pFile = (unixFile*)id;
2921d91c68f6Sdrh   unixInodeInfo *pInode = pFile->pInode;
2922bfe6631eSdrh   afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
2923bfe6631eSdrh 
2924bfe6631eSdrh   assert( pFile );
2925308c2a5cSdrh   OSTRACE(("LOCK    %d %s was %s(%s,%d) pid=%d (afp)\n", pFile->h,
2926308c2a5cSdrh            azFileLock(eFileLock), azFileLock(pFile->eFileLock),
29275ac93652Sdrh            azFileLock(pInode->eFileLock), pInode->nShared , osGetpid(0)));
2928339eb0b8Sdrh 
2929bfe6631eSdrh   /* If there is already a lock of this type or more restrictive on the
2930ad94b58aSdanielk1977   ** unixFile, do nothing. Don't use the afp_end_lock: exit path, as
29316c7d5c5bSdrh   ** unixEnterMutex() hasn't been called yet.
2932bfe6631eSdrh   */
2933308c2a5cSdrh   if( pFile->eFileLock>=eFileLock ){
2934308c2a5cSdrh     OSTRACE(("LOCK    %d %s ok (already held) (afp)\n", pFile->h,
2935308c2a5cSdrh            azFileLock(eFileLock)));
2936bfe6631eSdrh     return SQLITE_OK;
2937bfe6631eSdrh   }
2938bfe6631eSdrh 
2939bfe6631eSdrh   /* Make sure the locking sequence is correct
29407ed97b9dSdrh   **  (1) We never move from unlocked to anything higher than shared lock.
29417ed97b9dSdrh   **  (2) SQLite never explicitly requests a pendig lock.
29427ed97b9dSdrh   **  (3) A shared lock is always held when a reserve lock is requested.
2943bfe6631eSdrh   */
2944308c2a5cSdrh   assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
2945308c2a5cSdrh   assert( eFileLock!=PENDING_LOCK );
2946308c2a5cSdrh   assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );
2947bfe6631eSdrh 
29488af6c228Sdrh   /* This mutex is needed because pFile->pInode is shared across threads
2949bfe6631eSdrh   */
29508af6c228Sdrh   pInode = pFile->pInode;
2951da6dc240Sdrh   sqlite3_mutex_enter(pInode->pLockMutex);
29527ed97b9dSdrh 
29537ed97b9dSdrh   /* If some thread using this PID has a lock via a different unixFile*
29547ed97b9dSdrh   ** handle that precludes the requested lock, return BUSY.
29557ed97b9dSdrh   */
29568af6c228Sdrh   if( (pFile->eFileLock!=pInode->eFileLock &&
29578af6c228Sdrh        (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
29587ed97b9dSdrh      ){
29597ed97b9dSdrh     rc = SQLITE_BUSY;
29607ed97b9dSdrh     goto afp_end_lock;
29617ed97b9dSdrh   }
29627ed97b9dSdrh 
29637ed97b9dSdrh   /* If a SHARED lock is requested, and some thread using this PID already
29647ed97b9dSdrh   ** has a SHARED or RESERVED lock, then increment reference counts and
29657ed97b9dSdrh   ** return SQLITE_OK.
29667ed97b9dSdrh   */
2967308c2a5cSdrh   if( eFileLock==SHARED_LOCK &&
29688af6c228Sdrh      (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
2969308c2a5cSdrh     assert( eFileLock==SHARED_LOCK );
2970308c2a5cSdrh     assert( pFile->eFileLock==0 );
29718af6c228Sdrh     assert( pInode->nShared>0 );
2972308c2a5cSdrh     pFile->eFileLock = SHARED_LOCK;
29738af6c228Sdrh     pInode->nShared++;
29748af6c228Sdrh     pInode->nLock++;
29757ed97b9dSdrh     goto afp_end_lock;
29767ed97b9dSdrh   }
2977bfe6631eSdrh 
2978bfe6631eSdrh   /* A PENDING lock is needed before acquiring a SHARED lock and before
2979bfe6631eSdrh   ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
2980bfe6631eSdrh   ** be released.
2981bfe6631eSdrh   */
2982308c2a5cSdrh   if( eFileLock==SHARED_LOCK
2983308c2a5cSdrh       || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
2984bfe6631eSdrh   ){
2985339eb0b8Sdrh     int failed;
29866b9d6ddcSdrh     failed = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 1);
2987bfe6631eSdrh     if (failed) {
29885b1a2566Saswift       rc = failed;
2989bfe6631eSdrh       goto afp_end_lock;
2990bfe6631eSdrh     }
2991bfe6631eSdrh   }
2992bfe6631eSdrh 
2993bfe6631eSdrh   /* If control gets to this point, then actually go ahead and make
2994bfe6631eSdrh   ** operating system calls for the specified lock.
2995bfe6631eSdrh   */
2996308c2a5cSdrh   if( eFileLock==SHARED_LOCK ){
29973d4435b2Sdrh     int lrc1, lrc2, lrc1Errno = 0;
29987ed97b9dSdrh     long lk, mask;
2999bfe6631eSdrh 
30008af6c228Sdrh     assert( pInode->nShared==0 );
30018af6c228Sdrh     assert( pInode->eFileLock==0 );
30027ed97b9dSdrh 
30037ed97b9dSdrh     mask = (sizeof(long)==8) ? LARGEST_INT64 : 0x7fffffff;
30045b1a2566Saswift     /* Now get the read-lock SHARED_LOCK */
3005bfe6631eSdrh     /* note that the quality of the randomness doesn't matter that much */
3006bfe6631eSdrh     lk = random();
30078af6c228Sdrh     pInode->sharedByte = (lk & mask)%(SHARED_SIZE - 1);
30086b9d6ddcSdrh     lrc1 = afpSetLock(context->dbPath, pFile,
30098af6c228Sdrh           SHARED_FIRST+pInode->sharedByte, 1, 1);
30105b1a2566Saswift     if( IS_LOCK_ERROR(lrc1) ){
30115b1a2566Saswift       lrc1Errno = pFile->lastErrno;
3012bfe6631eSdrh     }
30135b1a2566Saswift     /* Drop the temporary PENDING lock */
30146b9d6ddcSdrh     lrc2 = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);
3015bfe6631eSdrh 
30165b1a2566Saswift     if( IS_LOCK_ERROR(lrc1) ) {
30174bf66fd6Sdrh       storeLastErrno(pFile, lrc1Errno);
30185b1a2566Saswift       rc = lrc1;
30195b1a2566Saswift       goto afp_end_lock;
30205b1a2566Saswift     } else if( IS_LOCK_ERROR(lrc2) ){
30215b1a2566Saswift       rc = lrc2;
30225b1a2566Saswift       goto afp_end_lock;
30235b1a2566Saswift     } else if( lrc1 != SQLITE_OK ) {
30245b1a2566Saswift       rc = lrc1;
3025bfe6631eSdrh     } else {
3026308c2a5cSdrh       pFile->eFileLock = SHARED_LOCK;
30278af6c228Sdrh       pInode->nLock++;
30288af6c228Sdrh       pInode->nShared = 1;
3029bfe6631eSdrh     }
30308af6c228Sdrh   }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
30317ed97b9dSdrh     /* We are trying for an exclusive lock but another thread in this
30327ed97b9dSdrh      ** same process is still holding a shared lock. */
30337ed97b9dSdrh     rc = SQLITE_BUSY;
3034bfe6631eSdrh   }else{
3035bfe6631eSdrh     /* The request was for a RESERVED or EXCLUSIVE lock.  It is
3036bfe6631eSdrh     ** assumed that there is a SHARED or greater lock on the file
3037bfe6631eSdrh     ** already.
3038bfe6631eSdrh     */
3039bfe6631eSdrh     int failed = 0;
3040308c2a5cSdrh     assert( 0!=pFile->eFileLock );
3041308c2a5cSdrh     if (eFileLock >= RESERVED_LOCK && pFile->eFileLock < RESERVED_LOCK) {
3042bfe6631eSdrh         /* Acquire a RESERVED lock */
30436b9d6ddcSdrh         failed = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
30447ed97b9dSdrh       if( !failed ){
30457ed97b9dSdrh         context->reserved = 1;
30467ed97b9dSdrh       }
3047bfe6631eSdrh     }
3048308c2a5cSdrh     if (!failed && eFileLock == EXCLUSIVE_LOCK) {
3049bfe6631eSdrh       /* Acquire an EXCLUSIVE lock */
3050bfe6631eSdrh 
3051bfe6631eSdrh       /* Remove the shared lock before trying the range.  we'll need to
3052e339d65aSdanielk1977       ** reestablish the shared lock if we can't get the  afpUnlock
3053bfe6631eSdrh       */
30546b9d6ddcSdrh       if( !(failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST +
30558af6c228Sdrh                          pInode->sharedByte, 1, 0)) ){
3056aebf413dSaswift         int failed2 = SQLITE_OK;
3057bfe6631eSdrh         /* now attemmpt to get the exclusive lock range */
30586b9d6ddcSdrh         failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST,
3059bfe6631eSdrh                                SHARED_SIZE, 1);
30606b9d6ddcSdrh         if( failed && (failed2 = afpSetLock(context->dbPath, pFile,
30618af6c228Sdrh                        SHARED_FIRST + pInode->sharedByte, 1, 1)) ){
3062aebf413dSaswift           /* Can't reestablish the shared lock.  Sqlite can't deal, this is
3063aebf413dSaswift           ** a critical I/O error
3064aebf413dSaswift           */
30652e233810Sdrh           rc = ((failed & 0xff) == SQLITE_IOERR) ? failed2 :
3066aebf413dSaswift                SQLITE_IOERR_LOCK;
3067aebf413dSaswift           goto afp_end_lock;
3068bfe6631eSdrh         }
3069bfe6631eSdrh       }else{
30705b1a2566Saswift         rc = failed;
3071bfe6631eSdrh       }
3072bfe6631eSdrh     }
30735b1a2566Saswift     if( failed ){
30745b1a2566Saswift       rc = failed;
3075bfe6631eSdrh     }
3076bfe6631eSdrh   }
3077bfe6631eSdrh 
3078bfe6631eSdrh   if( rc==SQLITE_OK ){
3079308c2a5cSdrh     pFile->eFileLock = eFileLock;
30808af6c228Sdrh     pInode->eFileLock = eFileLock;
3081308c2a5cSdrh   }else if( eFileLock==EXCLUSIVE_LOCK ){
3082308c2a5cSdrh     pFile->eFileLock = PENDING_LOCK;
30838af6c228Sdrh     pInode->eFileLock = PENDING_LOCK;
3084bfe6631eSdrh   }
3085bfe6631eSdrh 
3086bfe6631eSdrh afp_end_lock:
3087da6dc240Sdrh   sqlite3_mutex_leave(pInode->pLockMutex);
3088308c2a5cSdrh   OSTRACE(("LOCK    %d %s %s (afp)\n", pFile->h, azFileLock(eFileLock),
3089308c2a5cSdrh          rc==SQLITE_OK ? "ok" : "failed"));
3090bfe6631eSdrh   return rc;
3091bfe6631eSdrh }
3092bfe6631eSdrh 
3093bfe6631eSdrh /*
3094308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
3095bfe6631eSdrh ** must be either NO_LOCK or SHARED_LOCK.
3096bfe6631eSdrh **
3097bfe6631eSdrh ** If the locking level of the file descriptor is already at or below
3098bfe6631eSdrh ** the requested locking level, this routine is a no-op.
3099bfe6631eSdrh */
afpUnlock(sqlite3_file * id,int eFileLock)3100308c2a5cSdrh static int afpUnlock(sqlite3_file *id, int eFileLock) {
3101bfe6631eSdrh   int rc = SQLITE_OK;
3102bfe6631eSdrh   unixFile *pFile = (unixFile*)id;
3103d91c68f6Sdrh   unixInodeInfo *pInode;
31047ed97b9dSdrh   afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
31057ed97b9dSdrh   int skipShared = 0;
31067ed97b9dSdrh #ifdef SQLITE_TEST
31077ed97b9dSdrh   int h = pFile->h;
31087ed97b9dSdrh #endif
3109bfe6631eSdrh 
3110bfe6631eSdrh   assert( pFile );
3111308c2a5cSdrh   OSTRACE(("UNLOCK  %d %d was %d(%d,%d) pid=%d (afp)\n", pFile->h, eFileLock,
31128af6c228Sdrh            pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
31135ac93652Sdrh            osGetpid(0)));
3114bfe6631eSdrh 
3115308c2a5cSdrh   assert( eFileLock<=SHARED_LOCK );
3116308c2a5cSdrh   if( pFile->eFileLock<=eFileLock ){
3117bfe6631eSdrh     return SQLITE_OK;
3118bfe6631eSdrh   }
31198af6c228Sdrh   pInode = pFile->pInode;
3120da6dc240Sdrh   sqlite3_mutex_enter(pInode->pLockMutex);
31218af6c228Sdrh   assert( pInode->nShared!=0 );
3122308c2a5cSdrh   if( pFile->eFileLock>SHARED_LOCK ){
31238af6c228Sdrh     assert( pInode->eFileLock==pFile->eFileLock );
31247ed97b9dSdrh     SimulateIOErrorBenign(1);
31257ed97b9dSdrh     SimulateIOError( h=(-1) )
31267ed97b9dSdrh     SimulateIOErrorBenign(0);
31277ed97b9dSdrh 
3128d3d8c04fSdrh #ifdef SQLITE_DEBUG
31297ed97b9dSdrh     /* When reducing a lock such that other processes can start
31307ed97b9dSdrh     ** reading the database file again, make sure that the
31317ed97b9dSdrh     ** transaction counter was updated if any part of the database
31327ed97b9dSdrh     ** file changed.  If the transaction counter is not updated,
31337ed97b9dSdrh     ** other connections to the same file might not realize that
31347ed97b9dSdrh     ** the file has changed and hence might not know to flush their
31357ed97b9dSdrh     ** cache.  The use of a stale cache can lead to database corruption.
31367ed97b9dSdrh     */
31377ed97b9dSdrh     assert( pFile->inNormalWrite==0
31387ed97b9dSdrh            || pFile->dbUpdate==0
31397ed97b9dSdrh            || pFile->transCntrChng==1 );
31407ed97b9dSdrh     pFile->inNormalWrite = 0;
31417ed97b9dSdrh #endif
3142bfe6631eSdrh 
3143308c2a5cSdrh     if( pFile->eFileLock==EXCLUSIVE_LOCK ){
31447ed97b9dSdrh       rc = afpSetLock(context->dbPath, pFile, SHARED_FIRST, SHARED_SIZE, 0);
31458af6c228Sdrh       if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1) ){
3146aebf413dSaswift         /* only re-establish the shared lock if necessary */
31478af6c228Sdrh         int sharedLockByte = SHARED_FIRST+pInode->sharedByte;
31487ed97b9dSdrh         rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 1);
31497ed97b9dSdrh       } else {
31507ed97b9dSdrh         skipShared = 1;
3151bfe6631eSdrh       }
3152bfe6631eSdrh     }
3153308c2a5cSdrh     if( rc==SQLITE_OK && pFile->eFileLock>=PENDING_LOCK ){
31547ed97b9dSdrh       rc = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);
3155bfe6631eSdrh     }
3156308c2a5cSdrh     if( rc==SQLITE_OK && pFile->eFileLock>=RESERVED_LOCK && context->reserved ){
31577ed97b9dSdrh       rc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
31587ed97b9dSdrh       if( !rc ){
31597ed97b9dSdrh         context->reserved = 0;
3160bfe6631eSdrh       }
3161bfe6631eSdrh     }
31628af6c228Sdrh     if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1)){
31638af6c228Sdrh       pInode->eFileLock = SHARED_LOCK;
31647ed97b9dSdrh     }
31657ed97b9dSdrh   }
3166308c2a5cSdrh   if( rc==SQLITE_OK && eFileLock==NO_LOCK ){
3167aebf413dSaswift 
31687ed97b9dSdrh     /* Decrement the shared lock counter.  Release the lock using an
31697ed97b9dSdrh     ** OS call only when all threads in this same process have released
31707ed97b9dSdrh     ** the lock.
31717ed97b9dSdrh     */
31728af6c228Sdrh     unsigned long long sharedLockByte = SHARED_FIRST+pInode->sharedByte;
31738af6c228Sdrh     pInode->nShared--;
31748af6c228Sdrh     if( pInode->nShared==0 ){
31757ed97b9dSdrh       SimulateIOErrorBenign(1);
31767ed97b9dSdrh       SimulateIOError( h=(-1) )
31777ed97b9dSdrh       SimulateIOErrorBenign(0);
31787ed97b9dSdrh       if( !skipShared ){
31797ed97b9dSdrh         rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 0);
31807ed97b9dSdrh       }
31817ed97b9dSdrh       if( !rc ){
31828af6c228Sdrh         pInode->eFileLock = NO_LOCK;
3183308c2a5cSdrh         pFile->eFileLock = NO_LOCK;
31847ed97b9dSdrh       }
31857ed97b9dSdrh     }
3186aebf413dSaswift     if( rc==SQLITE_OK ){
31878af6c228Sdrh       pInode->nLock--;
31888af6c228Sdrh       assert( pInode->nLock>=0 );
3189ef52b36aSdrh       if( pInode->nLock==0 ) closePendingFds(pFile);
3190aebf413dSaswift     }
3191aebf413dSaswift   }
31927ed97b9dSdrh 
3193da6dc240Sdrh   sqlite3_mutex_leave(pInode->pLockMutex);
3194095908e1Sdrh   if( rc==SQLITE_OK ){
3195095908e1Sdrh     pFile->eFileLock = eFileLock;
3196095908e1Sdrh   }
3197bfe6631eSdrh   return rc;
3198bfe6631eSdrh }
3199bfe6631eSdrh 
3200bfe6631eSdrh /*
3201bfe6631eSdrh ** Close a file & cleanup AFP specific locking context
3202bfe6631eSdrh */
afpClose(sqlite3_file * id)3203e339d65aSdanielk1977 static int afpClose(sqlite3_file *id) {
32047ed97b9dSdrh   int rc = SQLITE_OK;
3205218c5084Sdrh   unixFile *pFile = (unixFile*)id;
3206a8de1e1cSdrh   assert( id!=0 );
3207e339d65aSdanielk1977   afpUnlock(id, NO_LOCK);
3208095908e1Sdrh   assert( unixFileMutexNotheld(pFile) );
32096c7d5c5bSdrh   unixEnterMutex();
3210ef52b36aSdrh   if( pFile->pInode ){
3211ef52b36aSdrh     unixInodeInfo *pInode = pFile->pInode;
3212ef52b36aSdrh     sqlite3_mutex_enter(pInode->pLockMutex);
3213cb4e4b00Sdrh     if( pInode->nLock ){
3214aebf413dSaswift       /* If there are outstanding locks, do not actually close the file just
3215aebf413dSaswift       ** yet because that would clear those locks.  Instead, add the file
32168af6c228Sdrh       ** descriptor to pInode->aPending.  It will be automatically closed when
3217aebf413dSaswift       ** the last lock is cleared.
3218aebf413dSaswift       */
321908da86a6Sdan       setPendingFd(pFile);
3220aebf413dSaswift     }
3221ef52b36aSdrh     sqlite3_mutex_leave(pInode->pLockMutex);
3222ef52b36aSdrh   }
3223b0ac3e3aSdan   releaseInodeInfo(pFile);
3224aebf413dSaswift   sqlite3_free(pFile->lockingContext);
32257ed97b9dSdrh   rc = closeUnixFile(id);
32266c7d5c5bSdrh   unixLeaveMutex();
32277ed97b9dSdrh   return rc;
3228bfe6631eSdrh }
3229bfe6631eSdrh 
3230d2cb50b7Sdrh #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
3231734c9864Sdrh /*
3232734c9864Sdrh ** The code above is the AFP lock implementation.  The code is specific
3233734c9864Sdrh ** to MacOSX and does not work on other unix platforms.  No alternative
3234734c9864Sdrh ** is available.  If you don't compile for a mac, then the "unix-afp"
3235734c9864Sdrh ** VFS is not available.
3236734c9864Sdrh **
3237734c9864Sdrh ********************* End of the AFP lock implementation **********************
3238734c9864Sdrh ******************************************************************************/
3239bfe6631eSdrh 
32407ed97b9dSdrh /******************************************************************************
32417ed97b9dSdrh *************************** Begin NFS Locking ********************************/
32427ed97b9dSdrh 
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** Lower the locking level on file descriptor pFile to eFileLock, which
** must be either NO_LOCK or SHARED_LOCK.
**
** If the descriptor is already at or below the requested locking level,
** this routine is a no-op.  All of the work is delegated to posixUnlock().
*/
static int nfsUnlock(sqlite3_file *id, int eFileLock){
  /* NOTE(review): the third argument presumably selects the NFS-specific
  ** handling inside posixUnlock() - confirm against that routine. */
  const int nfsFlag = 1;
  return posixUnlock(id, eFileLock, nfsFlag);
}

#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
32567ed97b9dSdrh /*
32577ed97b9dSdrh ** The code above is the NFS lock implementation.  The code is specific
32587ed97b9dSdrh ** to MacOSX and does not work on other unix platforms.  No alternative
32597ed97b9dSdrh ** is available.
32607ed97b9dSdrh **
32617ed97b9dSdrh ********************* End of the NFS lock implementation **********************
32627ed97b9dSdrh ******************************************************************************/
3263734c9864Sdrh 
3264734c9864Sdrh /******************************************************************************
3265734c9864Sdrh **************** Non-locking sqlite3_file methods *****************************
3266734c9864Sdrh **
3267734c9864Sdrh ** The next division contains implementations for all methods of the
3268734c9864Sdrh ** sqlite3_file object other than the locking methods.  The locking
3269734c9864Sdrh ** methods were defined in divisions above (one locking method per
3270734c9864Sdrh ** division).  Those methods that are common to all locking modes
3271734c9864Sdrh ** are gathered together into this division.
3272734c9864Sdrh */
3273bfe6631eSdrh 
3274bfe6631eSdrh /*
3275734c9864Sdrh ** Seek to the offset passed as the second argument, then read cnt
3276734c9864Sdrh ** bytes into pBuf. Return the number of bytes actually read.
3277734c9864Sdrh **
3278734c9864Sdrh ** NB:  If you define USE_PREAD or USE_PREAD64, then it might also
3279734c9864Sdrh ** be necessary to define _XOPEN_SOURCE to be 500.  This varies from
3280734c9864Sdrh ** one system to another.  Since SQLite does not define USE_PREAD
328160ec914cSpeter.d.reid ** in any form by default, we will not attempt to define _XOPEN_SOURCE.
3282734c9864Sdrh ** See tickets #2741 and #2681.
3283734c9864Sdrh **
3284734c9864Sdrh ** To avoid stomping the errno value on a failed read the lastErrno value
3285734c9864Sdrh ** is set before returning.
3286bfe6631eSdrh */
seekAndRead(unixFile * id,sqlite3_int64 offset,void * pBuf,int cnt)3287734c9864Sdrh static int seekAndRead(unixFile *id, sqlite3_int64 offset, void *pBuf, int cnt){
3288734c9864Sdrh   int got;
328958024643Sdrh   int prior = 0;
3290a46cadc4Sdrh #if (!defined(USE_PREAD) && !defined(USE_PREAD64))
3291a46cadc4Sdrh   i64 newOffset;
3292a46cadc4Sdrh #endif
3293734c9864Sdrh   TIMER_START;
3294c1fd2cfeSdrh   assert( cnt==(cnt&0x1ffff) );
329535a0379aSdrh   assert( id->h>2 );
329658024643Sdrh   do{
3297734c9864Sdrh #if defined(USE_PREAD)
329858024643Sdrh     got = osPread(id->h, pBuf, cnt, offset);
3299734c9864Sdrh     SimulateIOError( got = -1 );
3300734c9864Sdrh #elif defined(USE_PREAD64)
330158024643Sdrh     got = osPread64(id->h, pBuf, cnt, offset);
3302734c9864Sdrh     SimulateIOError( got = -1 );
3303734c9864Sdrh #else
3304a46cadc4Sdrh     newOffset = lseek(id->h, offset, SEEK_SET);
3305a46cadc4Sdrh     SimulateIOError( newOffset = -1 );
3306a46cadc4Sdrh     if( newOffset<0 ){
33074bf66fd6Sdrh       storeLastErrno((unixFile*)id, errno);
3308734c9864Sdrh       return -1;
3309bfe6631eSdrh     }
331058024643Sdrh     got = osRead(id->h, pBuf, cnt);
3311734c9864Sdrh #endif
331258024643Sdrh     if( got==cnt ) break;
3313734c9864Sdrh     if( got<0 ){
331458024643Sdrh       if( errno==EINTR ){ got = 1; continue; }
331558024643Sdrh       prior = 0;
33164bf66fd6Sdrh       storeLastErrno((unixFile*)id,  errno);
331758024643Sdrh       break;
331858024643Sdrh     }else if( got>0 ){
331958024643Sdrh       cnt -= got;
332058024643Sdrh       offset += got;
332158024643Sdrh       prior += got;
332258024643Sdrh       pBuf = (void*)(got + (char*)pBuf);
3323734c9864Sdrh     }
332458024643Sdrh   }while( got>0 );
332558024643Sdrh   TIMER_END;
332658024643Sdrh   OSTRACE(("READ    %-3d %5d %7lld %llu\n",
332758024643Sdrh             id->h, got+prior, offset-prior, TIMER_ELAPSED));
332858024643Sdrh   return got+prior;
3329bfe6631eSdrh }
3330bfe6631eSdrh 
3331bfe6631eSdrh /*
3332734c9864Sdrh ** Read data from a file into a buffer.  Return SQLITE_OK if all
3333734c9864Sdrh ** bytes were read successfully and SQLITE_IOERR if anything goes
3334734c9864Sdrh ** wrong.
3335bfe6631eSdrh */
unixRead(sqlite3_file * id,void * pBuf,int amt,sqlite3_int64 offset)3336734c9864Sdrh static int unixRead(
3337734c9864Sdrh   sqlite3_file *id,
3338734c9864Sdrh   void *pBuf,
3339734c9864Sdrh   int amt,
3340734c9864Sdrh   sqlite3_int64 offset
3341734c9864Sdrh ){
334208da86a6Sdan   unixFile *pFile = (unixFile *)id;
3343734c9864Sdrh   int got;
3344734c9864Sdrh   assert( id );
33456cf9d8d6Sdrh   assert( offset>=0 );
33466cf9d8d6Sdrh   assert( amt>0 );
334708c6d446Sdrh 
3348067b92baSdrh   /* If this is a database file (not a journal, super-journal or temp
334908da86a6Sdan   ** file), the bytes in the locking range should never be read or written. */
33507c24610eSdan #if 0
3351c68886bbSdrh   assert( pFile->pPreallocatedUnused==0
335208c6d446Sdrh        || offset>=PENDING_BYTE+512
335308da86a6Sdan        || offset+amt<=PENDING_BYTE
335408da86a6Sdan   );
33557c24610eSdan #endif
335608c6d446Sdrh 
33579b4c59faSdrh #if SQLITE_MAX_MMAP_SIZE>0
33586c569638Sdrh   /* Deal with as much of this read request as possible by transfering
33596c569638Sdrh   ** data from the memory mapping using memcpy().  */
3360f23da966Sdan   if( offset<pFile->mmapSize ){
3361f23da966Sdan     if( offset+amt <= pFile->mmapSize ){
3362f23da966Sdan       memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], amt);
3363f23da966Sdan       return SQLITE_OK;
3364f23da966Sdan     }else{
3365f23da966Sdan       int nCopy = pFile->mmapSize - offset;
3366f23da966Sdan       memcpy(pBuf, &((u8 *)(pFile->pMapRegion))[offset], nCopy);
3367f23da966Sdan       pBuf = &((u8 *)pBuf)[nCopy];
3368f23da966Sdan       amt -= nCopy;
3369f23da966Sdan       offset += nCopy;
3370f23da966Sdan     }
3371f23da966Sdan   }
33726e0b6d52Sdrh #endif
3373f23da966Sdan 
337408da86a6Sdan   got = seekAndRead(pFile, offset, pBuf, amt);
3375734c9864Sdrh   if( got==amt ){
3376734c9864Sdrh     return SQLITE_OK;
3377734c9864Sdrh   }else if( got<0 ){
33785a07d10fSdrh     /* pFile->lastErrno has been set by seekAndRead().
33795a07d10fSdrh     ** Usually we return SQLITE_IOERR_READ here, though for some
33805a07d10fSdrh     ** kinds of errors we return SQLITE_IOERR_CORRUPTFS.  The
33815a07d10fSdrh     ** SQLITE_IOERR_CORRUPTFS will be converted into SQLITE_CORRUPT
33825a07d10fSdrh     ** prior to returning to the application by the sqlite3ApiExit()
33835a07d10fSdrh     ** routine.
33845a07d10fSdrh     */
33855a07d10fSdrh     switch( pFile->lastErrno ){
33865a07d10fSdrh       case ERANGE:
33875a07d10fSdrh       case EIO:
33885a07d10fSdrh #ifdef ENXIO
33895a07d10fSdrh       case ENXIO:
33905a07d10fSdrh #endif
33915a07d10fSdrh #ifdef EDEVERR
33925a07d10fSdrh       case EDEVERR:
33935a07d10fSdrh #endif
33945a07d10fSdrh         return SQLITE_IOERR_CORRUPTFS;
33955a07d10fSdrh     }
3396734c9864Sdrh     return SQLITE_IOERR_READ;
3397734c9864Sdrh   }else{
33984bf66fd6Sdrh     storeLastErrno(pFile, 0);   /* not a system error */
3399734c9864Sdrh     /* Unread parts of the buffer must be zero-filled */
3400734c9864Sdrh     memset(&((char*)pBuf)[got], 0, amt-got);
3401734c9864Sdrh     return SQLITE_IOERR_SHORT_READ;
3402734c9864Sdrh   }
3403734c9864Sdrh }
3404734c9864Sdrh 
3405734c9864Sdrh /*
340647a2b4a0Sdan ** Attempt to seek the file-descriptor passed as the first argument to
340747a2b4a0Sdan ** absolute offset iOff, then attempt to write nBuf bytes of data from
340847a2b4a0Sdan ** pBuf to it. If an error occurs, return -1 and set *piErrno. Otherwise,
340947a2b4a0Sdan ** return the actual number of bytes written (which may be less than
341047a2b4a0Sdan ** nBuf).
341147a2b4a0Sdan */
seekAndWriteFd(int fd,i64 iOff,const void * pBuf,int nBuf,int * piErrno)341247a2b4a0Sdan static int seekAndWriteFd(
341347a2b4a0Sdan   int fd,                         /* File descriptor to write to */
341447a2b4a0Sdan   i64 iOff,                       /* File offset to begin writing at */
341547a2b4a0Sdan   const void *pBuf,               /* Copy data from this buffer to the file */
341647a2b4a0Sdan   int nBuf,                       /* Size of buffer pBuf in bytes */
341747a2b4a0Sdan   int *piErrno                    /* OUT: Error number if error occurs */
341847a2b4a0Sdan ){
341947a2b4a0Sdan   int rc = 0;                     /* Value returned by system call */
342047a2b4a0Sdan 
342147a2b4a0Sdan   assert( nBuf==(nBuf&0x1ffff) );
342235a0379aSdrh   assert( fd>2 );
3423e1818ec7Sdrh   assert( piErrno!=0 );
342447a2b4a0Sdan   nBuf &= 0x1ffff;
342547a2b4a0Sdan   TIMER_START;
342647a2b4a0Sdan 
342747a2b4a0Sdan #if defined(USE_PREAD)
34282da47d38Sdrh   do{ rc = (int)osPwrite(fd, pBuf, nBuf, iOff); }while( rc<0 && errno==EINTR );
342947a2b4a0Sdan #elif defined(USE_PREAD64)
34302da47d38Sdrh   do{ rc = (int)osPwrite64(fd, pBuf, nBuf, iOff);}while( rc<0 && errno==EINTR);
343147a2b4a0Sdan #else
343247a2b4a0Sdan   do{
343347a2b4a0Sdan     i64 iSeek = lseek(fd, iOff, SEEK_SET);
3434e1818ec7Sdrh     SimulateIOError( iSeek = -1 );
3435e1818ec7Sdrh     if( iSeek<0 ){
3436e1818ec7Sdrh       rc = -1;
3437e1818ec7Sdrh       break;
343847a2b4a0Sdan     }
343947a2b4a0Sdan     rc = osWrite(fd, pBuf, nBuf);
344047a2b4a0Sdan   }while( rc<0 && errno==EINTR );
344147a2b4a0Sdan #endif
344247a2b4a0Sdan 
344347a2b4a0Sdan   TIMER_END;
344447a2b4a0Sdan   OSTRACE(("WRITE   %-3d %5d %7lld %llu\n", fd, rc, iOff, TIMER_ELAPSED));
344547a2b4a0Sdan 
3446e1818ec7Sdrh   if( rc<0 ) *piErrno = errno;
344747a2b4a0Sdan   return rc;
344847a2b4a0Sdan }
344947a2b4a0Sdan 
345047a2b4a0Sdan 
345147a2b4a0Sdan /*
3452734c9864Sdrh ** Seek to the offset in id->offset then read cnt bytes into pBuf.
3453734c9864Sdrh ** Return the number of bytes actually read.  Update the offset.
3454734c9864Sdrh **
3455734c9864Sdrh ** To avoid stomping the errno value on a failed write the lastErrno value
3456734c9864Sdrh ** is set before returning.
3457734c9864Sdrh */
seekAndWrite(unixFile * id,i64 offset,const void * pBuf,int cnt)3458734c9864Sdrh static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){
345947a2b4a0Sdan   return seekAndWriteFd(id->h, offset, pBuf, cnt, &id->lastErrno);
3460734c9864Sdrh }
3461734c9864Sdrh 
3462734c9864Sdrh 
3463734c9864Sdrh /*
3464734c9864Sdrh ** Write data from a buffer into a file.  Return SQLITE_OK on success
3465734c9864Sdrh ** or some other error code on failure.
3466734c9864Sdrh */
unixWrite(sqlite3_file * id,const void * pBuf,int amt,sqlite3_int64 offset)3467734c9864Sdrh static int unixWrite(
3468734c9864Sdrh   sqlite3_file *id,
3469734c9864Sdrh   const void *pBuf,
3470734c9864Sdrh   int amt,
3471734c9864Sdrh   sqlite3_int64 offset
3472734c9864Sdrh ){
347308da86a6Sdan   unixFile *pFile = (unixFile*)id;
3474734c9864Sdrh   int wrote = 0;
3475734c9864Sdrh   assert( id );
3476734c9864Sdrh   assert( amt>0 );
34778f941bc7Sdrh 
3478067b92baSdrh   /* If this is a database file (not a journal, super-journal or temp
347908da86a6Sdan   ** file), the bytes in the locking range should never be read or written. */
34807c24610eSdan #if 0
3481c68886bbSdrh   assert( pFile->pPreallocatedUnused==0
348208c6d446Sdrh        || offset>=PENDING_BYTE+512
348308da86a6Sdan        || offset+amt<=PENDING_BYTE
348408da86a6Sdan   );
34857c24610eSdan #endif
348608c6d446Sdrh 
3487d3d8c04fSdrh #ifdef SQLITE_DEBUG
34888f941bc7Sdrh   /* If we are doing a normal write to a database file (as opposed to
34898f941bc7Sdrh   ** doing a hot-journal rollback or a write to some file other than a
34908f941bc7Sdrh   ** normal database file) then record the fact that the database
34918f941bc7Sdrh   ** has changed.  If the transaction counter is modified, record that
34928f941bc7Sdrh   ** fact too.
34938f941bc7Sdrh   */
349408da86a6Sdan   if( pFile->inNormalWrite ){
34958f941bc7Sdrh     pFile->dbUpdate = 1;  /* The database has been modified */
34968f941bc7Sdrh     if( offset<=24 && offset+amt>=27 ){
3497a6d90f0dSdrh       int rc;
34988f941bc7Sdrh       char oldCntr[4];
34998f941bc7Sdrh       SimulateIOErrorBenign(1);
3500a6d90f0dSdrh       rc = seekAndRead(pFile, 24, oldCntr, 4);
35018f941bc7Sdrh       SimulateIOErrorBenign(0);
3502a6d90f0dSdrh       if( rc!=4 || memcmp(oldCntr, &((char*)pBuf)[24-offset], 4)!=0 ){
35038f941bc7Sdrh         pFile->transCntrChng = 1;  /* The transaction counter has changed */
35048f941bc7Sdrh       }
35058f941bc7Sdrh     }
35068f941bc7Sdrh   }
35078f941bc7Sdrh #endif
35088f941bc7Sdrh 
3509fe33e39bSdan #if defined(SQLITE_MMAP_READWRITE) && SQLITE_MAX_MMAP_SIZE>0
3510f23da966Sdan   /* Deal with as much of this write request as possible by transfering
3511f23da966Sdan   ** data from the memory mapping using memcpy().  */
3512f23da966Sdan   if( offset<pFile->mmapSize ){
3513f23da966Sdan     if( offset+amt <= pFile->mmapSize ){
3514f23da966Sdan       memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, amt);
3515f23da966Sdan       return SQLITE_OK;
3516f23da966Sdan     }else{
3517f23da966Sdan       int nCopy = pFile->mmapSize - offset;
3518f23da966Sdan       memcpy(&((u8 *)(pFile->pMapRegion))[offset], pBuf, nCopy);
3519f23da966Sdan       pBuf = &((u8 *)pBuf)[nCopy];
3520f23da966Sdan       amt -= nCopy;
3521f23da966Sdan       offset += nCopy;
3522f23da966Sdan     }
3523f23da966Sdan   }
35246e0b6d52Sdrh #endif
3525f23da966Sdan 
352602bf8b45Sdrh   while( (wrote = seekAndWrite(pFile, offset, pBuf, amt))<amt && wrote>0 ){
3527734c9864Sdrh     amt -= wrote;
3528734c9864Sdrh     offset += wrote;
3529734c9864Sdrh     pBuf = &((char*)pBuf)[wrote];
3530734c9864Sdrh   }
3531734c9864Sdrh   SimulateIOError(( wrote=(-1), amt=1 ));
3532734c9864Sdrh   SimulateDiskfullError(( wrote=0, amt=1 ));
35336e09d69cSdan 
353402bf8b45Sdrh   if( amt>wrote ){
3535a21b83baSdrh     if( wrote<0 && pFile->lastErrno!=ENOSPC ){
3536734c9864Sdrh       /* lastErrno set by seekAndWrite */
3537734c9864Sdrh       return SQLITE_IOERR_WRITE;
3538734c9864Sdrh     }else{
35394bf66fd6Sdrh       storeLastErrno(pFile, 0); /* not a system error */
3540734c9864Sdrh       return SQLITE_FULL;
3541734c9864Sdrh     }
3542734c9864Sdrh   }
35436e09d69cSdan 
3544734c9864Sdrh   return SQLITE_OK;
3545734c9864Sdrh }
3546734c9864Sdrh 
3547734c9864Sdrh #ifdef SQLITE_TEST
3548734c9864Sdrh /*
3549734c9864Sdrh ** Count the number of fullsyncs and normal syncs.  This is used to test
35506b9d6ddcSdrh ** that syncs and fullsyncs are occurring at the right times.
3551734c9864Sdrh */
3552734c9864Sdrh int sqlite3_sync_count = 0;
3553734c9864Sdrh int sqlite3_fullsync_count = 0;
3554734c9864Sdrh #endif
3555734c9864Sdrh 
3556734c9864Sdrh /*
355789240432Sdrh ** We do not trust systems to provide a working fdatasync().  Some do.
355820f8e13bSdrh ** Others do not.  To be safe, we will stick with the (slightly slower)
355920f8e13bSdrh ** fsync(). If you know that your system does support fdatasync() correctly,
3560f7a4a1b8Sdrh ** then simply compile with -Dfdatasync=fdatasync or -DHAVE_FDATASYNC.
3561734c9864Sdrh */
3562f7a4a1b8Sdrh #if !defined(fdatasync) && !HAVE_FDATASYNC
3563734c9864Sdrh # define fdatasync fsync
3564734c9864Sdrh #endif
3565734c9864Sdrh 
3566734c9864Sdrh /*
3567734c9864Sdrh ** Define HAVE_FULLFSYNC to 0 or 1 depending on whether or not
3568734c9864Sdrh ** the F_FULLFSYNC macro is defined.  F_FULLFSYNC is currently
3569734c9864Sdrh ** only available on Mac OS X.  But that could change.
3570734c9864Sdrh */
3571734c9864Sdrh #ifdef F_FULLFSYNC
3572734c9864Sdrh # define HAVE_FULLFSYNC 1
3573734c9864Sdrh #else
3574734c9864Sdrh # define HAVE_FULLFSYNC 0
3575734c9864Sdrh #endif
3576734c9864Sdrh 
3577734c9864Sdrh 
3578734c9864Sdrh /*
3579734c9864Sdrh ** The fsync() system call does not work as advertised on many
3580734c9864Sdrh ** unix systems.  The following procedure is an attempt to make
3581734c9864Sdrh ** it work better.
3582734c9864Sdrh **
3583734c9864Sdrh ** The SQLITE_NO_SYNC macro disables all fsync()s.  This is useful
3584734c9864Sdrh ** for testing when we want to run through the test suite quickly.
3585734c9864Sdrh ** You are strongly advised *not* to deploy with SQLITE_NO_SYNC
3586734c9864Sdrh ** enabled, however, since with SQLITE_NO_SYNC enabled, an OS crash
3587734c9864Sdrh ** or power failure will likely corrupt the database file.
35880b647ffdSdrh **
35890b647ffdSdrh ** SQLite sets the dataOnly flag if the size of the file is unchanged.
35900b647ffdSdrh ** The idea behind dataOnly is that it should only write the file content
35910b647ffdSdrh ** to disk, not the inode.  We only set dataOnly if the file size is
35920b647ffdSdrh ** unchanged since the file size is part of the inode.  However,
35930b647ffdSdrh ** Ted Ts'o tells us that fdatasync() will also write the inode if the
35940b647ffdSdrh ** file size has changed.  The only real difference between fdatasync()
35950b647ffdSdrh ** and fsync(), Ted tells us, is that fdatasync() will not flush the
35960b647ffdSdrh ** inode if the mtime or owner or other inode attributes have changed.
35970b647ffdSdrh ** We only care about the file size, not the other file attributes, so
35980b647ffdSdrh ** as far as SQLite is concerned, an fdatasync() is always adequate.
35990b647ffdSdrh ** So, we always use fdatasync() if it is available, regardless of
36000b647ffdSdrh ** the value of the dataOnly flag.
3601734c9864Sdrh */
full_fsync(int fd,int fullSync,int dataOnly)3602734c9864Sdrh static int full_fsync(int fd, int fullSync, int dataOnly){
360397185489Schw   int rc;
3604734c9864Sdrh 
3605734c9864Sdrh   /* The following "ifdef/elif/else/" block has the same structure as
3606734c9864Sdrh   ** the one below. It is replicated here solely to avoid cluttering
3607734c9864Sdrh   ** up the real code with the UNUSED_PARAMETER() macros.
3608734c9864Sdrh   */
3609734c9864Sdrh #ifdef SQLITE_NO_SYNC
3610734c9864Sdrh   UNUSED_PARAMETER(fd);
3611734c9864Sdrh   UNUSED_PARAMETER(fullSync);
3612734c9864Sdrh   UNUSED_PARAMETER(dataOnly);
3613734c9864Sdrh #elif HAVE_FULLFSYNC
3614734c9864Sdrh   UNUSED_PARAMETER(dataOnly);
3615734c9864Sdrh #else
3616734c9864Sdrh   UNUSED_PARAMETER(fullSync);
36170b647ffdSdrh   UNUSED_PARAMETER(dataOnly);
3618734c9864Sdrh #endif
3619734c9864Sdrh 
3620734c9864Sdrh   /* Record the number of times that we do a normal fsync() and
3621734c9864Sdrh   ** FULLSYNC.  This is used during testing to verify that this procedure
3622734c9864Sdrh   ** gets called with the correct arguments.
3623734c9864Sdrh   */
3624734c9864Sdrh #ifdef SQLITE_TEST
3625734c9864Sdrh   if( fullSync ) sqlite3_fullsync_count++;
3626734c9864Sdrh   sqlite3_sync_count++;
3627734c9864Sdrh #endif
3628734c9864Sdrh 
3629734c9864Sdrh   /* If we compiled with the SQLITE_NO_SYNC flag, then syncing is a
36302c8fd12fSdrh   ** no-op.  But go ahead and call fstat() to validate the file
36312c8fd12fSdrh   ** descriptor as we need a method to provoke a failure during
36322c8fd12fSdrh   ** coverate testing.
3633734c9864Sdrh   */
3634734c9864Sdrh #ifdef SQLITE_NO_SYNC
36352c8fd12fSdrh   {
36362c8fd12fSdrh     struct stat buf;
36372c8fd12fSdrh     rc = osFstat(fd, &buf);
36382c8fd12fSdrh   }
3639734c9864Sdrh #elif HAVE_FULLFSYNC
3640734c9864Sdrh   if( fullSync ){
364199ab3b12Sdrh     rc = osFcntl(fd, F_FULLFSYNC, 0);
3642734c9864Sdrh   }else{
3643734c9864Sdrh     rc = 1;
3644734c9864Sdrh   }
3645734c9864Sdrh   /* If the FULLFSYNC failed, fall back to attempting an fsync().
36466b9d6ddcSdrh   ** It shouldn't be possible for fullfsync to fail on the local
36476b9d6ddcSdrh   ** file system (on OSX), so failure indicates that FULLFSYNC
36486b9d6ddcSdrh   ** isn't supported for this file system. So, attempt an fsync
36496b9d6ddcSdrh   ** and (for now) ignore the overhead of a superfluous fcntl call.
36506b9d6ddcSdrh   ** It'd be better to detect fullfsync support once and avoid
36516b9d6ddcSdrh   ** the fcntl call every time sync is called.
3652734c9864Sdrh   */
3653734c9864Sdrh   if( rc ) rc = fsync(fd);
3654734c9864Sdrh 
36557ed97b9dSdrh #elif defined(__APPLE__)
36567ed97b9dSdrh   /* fdatasync() on HFS+ doesn't yet flush the file size if it changed correctly
36577ed97b9dSdrh   ** so currently we default to the macro that redefines fdatasync to fsync
36587ed97b9dSdrh   */
36597ed97b9dSdrh   rc = fsync(fd);
3660734c9864Sdrh #else
3661734c9864Sdrh   rc = fdatasync(fd);
3662c7288ee0Sdrh #if OS_VXWORKS
3663c7288ee0Sdrh   if( rc==-1 && errno==ENOTSUP ){
3664734c9864Sdrh     rc = fsync(fd);
3665734c9864Sdrh   }
36660b647ffdSdrh #endif /* OS_VXWORKS */
3667734c9864Sdrh #endif /* ifdef SQLITE_NO_SYNC elif HAVE_FULLFSYNC */
3668734c9864Sdrh 
3669734c9864Sdrh   if( OS_VXWORKS && rc!= -1 ){
3670734c9864Sdrh     rc = 0;
3671734c9864Sdrh   }
367297185489Schw   return rc;
3673bfe6631eSdrh }
3674bfe6631eSdrh 
3675734c9864Sdrh /*
36760059eae3Sdrh ** Open a file descriptor to the directory containing file zFilename.
36770059eae3Sdrh ** If successful, *pFd is set to the opened file descriptor and
36780059eae3Sdrh ** SQLITE_OK is returned. If an error occurs, either SQLITE_NOMEM
36790059eae3Sdrh ** or SQLITE_CANTOPEN is returned and *pFd is set to an undefined
36800059eae3Sdrh ** value.
36810059eae3Sdrh **
368290315a24Sdrh ** The directory file descriptor is used for only one thing - to
368390315a24Sdrh ** fsync() a directory to make sure file creation and deletion events
368490315a24Sdrh ** are flushed to disk.  Such fsyncs are not needed on newer
368590315a24Sdrh ** journaling filesystems, but are required on older filesystems.
368690315a24Sdrh **
368790315a24Sdrh ** This routine can be overridden using the xSetSysCall interface.
368890315a24Sdrh ** The ability to override this routine was added in support of the
368990315a24Sdrh ** chromium sandbox.  Opening a directory is a security risk (we are
369090315a24Sdrh ** told) so making it overrideable allows the chromium sandbox to
369190315a24Sdrh ** replace this routine with a harmless no-op.  To make this routine
369290315a24Sdrh ** a no-op, replace it with a stub that returns SQLITE_OK but leaves
369390315a24Sdrh ** *pFd set to a negative number.
369490315a24Sdrh **
36950059eae3Sdrh ** If SQLITE_OK is returned, the caller is responsible for closing
36960059eae3Sdrh ** the file descriptor *pFd using close().
36970059eae3Sdrh */
openDirectory(const char * zFilename,int * pFd)36980059eae3Sdrh static int openDirectory(const char *zFilename, int *pFd){
36990059eae3Sdrh   int ii;
37000059eae3Sdrh   int fd = -1;
37010059eae3Sdrh   char zDirname[MAX_PATHNAME+1];
37020059eae3Sdrh 
37030059eae3Sdrh   sqlite3_snprintf(MAX_PATHNAME, zDirname, "%s", zFilename);
3704dc27851eSdrh   for(ii=(int)strlen(zDirname); ii>0 && zDirname[ii]!='/'; ii--);
3705dc27851eSdrh   if( ii>0 ){
37060059eae3Sdrh     zDirname[ii] = '\0';
3707dc27851eSdrh   }else{
3708dc27851eSdrh     if( zDirname[0]!='/' ) zDirname[0] = '.';
3709dc27851eSdrh     zDirname[1] = 0;
3710dc27851eSdrh   }
37113b9f154bSdrh   fd = robust_open(zDirname, O_RDONLY|O_BINARY, 0);
37120059eae3Sdrh   if( fd>=0 ){
37130059eae3Sdrh     OSTRACE(("OPENDIR %-3d %s\n", fd, zDirname));
37140059eae3Sdrh   }
37150059eae3Sdrh   *pFd = fd;
3716acb6b28dSdrh   if( fd>=0 ) return SQLITE_OK;
3717acb6b28dSdrh   return unixLogError(SQLITE_CANTOPEN_BKPT, "openDirectory", zDirname);
37180059eae3Sdrh }
37190059eae3Sdrh 
37200059eae3Sdrh /*
3721734c9864Sdrh ** Make sure all writes to a particular file are committed to disk.
3722734c9864Sdrh **
3723734c9864Sdrh ** If dataOnly==0 then both the file itself and its metadata (file
3724734c9864Sdrh ** size, access time, etc) are synced.  If dataOnly!=0 then only the
3725734c9864Sdrh ** file data is synced.
3726734c9864Sdrh **
3727734c9864Sdrh ** Under Unix, also make sure that the directory entry for the file
3728734c9864Sdrh ** has been created by fsync-ing the directory that contains the file.
3729734c9864Sdrh ** If we do not do this and we encounter a power failure, the directory
3730734c9864Sdrh ** entry for the journal might not exist after we reboot.  The next
3731734c9864Sdrh ** SQLite to access the file will not know that the journal exists (because
3732734c9864Sdrh ** the directory entry for the journal was never created) and the transaction
3733734c9864Sdrh ** will not roll back - possibly leading to database corruption.
3734734c9864Sdrh */
unixSync(sqlite3_file * id,int flags)3735734c9864Sdrh static int unixSync(sqlite3_file *id, int flags){
3736734c9864Sdrh   int rc;
3737734c9864Sdrh   unixFile *pFile = (unixFile*)id;
3738734c9864Sdrh 
3739734c9864Sdrh   int isDataOnly = (flags&SQLITE_SYNC_DATAONLY);
3740734c9864Sdrh   int isFullsync = (flags&0x0F)==SQLITE_SYNC_FULL;
3741734c9864Sdrh 
3742734c9864Sdrh   /* Check that one of SQLITE_SYNC_NORMAL or FULL was passed */
3743734c9864Sdrh   assert((flags&0x0F)==SQLITE_SYNC_NORMAL
3744734c9864Sdrh       || (flags&0x0F)==SQLITE_SYNC_FULL
3745734c9864Sdrh   );
3746734c9864Sdrh 
3747734c9864Sdrh   /* Unix cannot, but some systems may return SQLITE_FULL from here. This
3748734c9864Sdrh   ** line is to test that doing so does not cause any problems.
3749734c9864Sdrh   */
3750734c9864Sdrh   SimulateDiskfullError( return SQLITE_FULL );
3751734c9864Sdrh 
3752734c9864Sdrh   assert( pFile );
3753308c2a5cSdrh   OSTRACE(("SYNC    %-3d\n", pFile->h));
3754734c9864Sdrh   rc = full_fsync(pFile->h, isFullsync, isDataOnly);
3755734c9864Sdrh   SimulateIOError( rc=1 );
3756734c9864Sdrh   if( rc ){
37574bf66fd6Sdrh     storeLastErrno(pFile, errno);
3758e18d4953Sdan     return unixLogError(SQLITE_IOERR_FSYNC, "full_fsync", pFile->zPath);
3759734c9864Sdrh   }
37600059eae3Sdrh 
37610059eae3Sdrh   /* Also fsync the directory containing the file if the DIRSYNC flag
376248864df9Smistachkin   ** is set.  This is a one-time occurrence.  Many systems (examples: AIX)
376390315a24Sdrh   ** are unable to fsync a directory, so ignore errors on the fsync.
37640059eae3Sdrh   */
37650059eae3Sdrh   if( pFile->ctrlFlags & UNIXFILE_DIRSYNC ){
37660059eae3Sdrh     int dirfd;
37670059eae3Sdrh     OSTRACE(("DIRSYNC %s (have_fullfsync=%d fullsync=%d)\n", pFile->zPath,
3768308c2a5cSdrh             HAVE_FULLFSYNC, isFullsync));
376990315a24Sdrh     rc = osOpenDirectory(pFile->zPath, &dirfd);
3770acb6b28dSdrh     if( rc==SQLITE_OK ){
37710059eae3Sdrh       full_fsync(dirfd, 0, 0);
37720059eae3Sdrh       robust_close(pFile, dirfd, __LINE__);
3773acb6b28dSdrh     }else{
3774acb6b28dSdrh       assert( rc==SQLITE_CANTOPEN );
37751ee6f740Sdrh       rc = SQLITE_OK;
3776734c9864Sdrh     }
37770059eae3Sdrh     pFile->ctrlFlags &= ~UNIXFILE_DIRSYNC;
3778734c9864Sdrh   }
3779734c9864Sdrh   return rc;
3780734c9864Sdrh }
3781734c9864Sdrh 
3782734c9864Sdrh /*
3783734c9864Sdrh ** Truncate an open file to a specified size
3784734c9864Sdrh */
unixTruncate(sqlite3_file * id,i64 nByte)3785734c9864Sdrh static int unixTruncate(sqlite3_file *id, i64 nByte){
37866e09d69cSdan   unixFile *pFile = (unixFile *)id;
3787734c9864Sdrh   int rc;
37886e09d69cSdan   assert( pFile );
3789734c9864Sdrh   SimulateIOError( return SQLITE_IOERR_TRUNCATE );
37906e09d69cSdan 
37916e09d69cSdan   /* If the user has configured a chunk-size for this file, truncate the
37926e09d69cSdan   ** file so that it consists of an integer number of chunks (i.e. the
37936e09d69cSdan   ** actual file size after the operation may be larger than the requested
37946e09d69cSdan   ** size).
37956e09d69cSdan   */
3796b8af4b79Sdrh   if( pFile->szChunk>0 ){
37976e09d69cSdan     nByte = ((nByte + pFile->szChunk - 1)/pFile->szChunk) * pFile->szChunk;
37986e09d69cSdan   }
37996e09d69cSdan 
38002ee53412Sdan   rc = robust_ftruncate(pFile->h, nByte);
3801734c9864Sdrh   if( rc ){
38024bf66fd6Sdrh     storeLastErrno(pFile, errno);
3803e18d4953Sdan     return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath);
3804734c9864Sdrh   }else{
3805d3d8c04fSdrh #ifdef SQLITE_DEBUG
38063313b14fSdrh     /* If we are doing a normal write to a database file (as opposed to
38073313b14fSdrh     ** doing a hot-journal rollback or a write to some file other than a
38083313b14fSdrh     ** normal database file) and we truncate the file to zero length,
38093313b14fSdrh     ** that effectively updates the change counter.  This might happen
38103313b14fSdrh     ** when restoring a database using the backup API from a zero-length
38113313b14fSdrh     ** source.
38123313b14fSdrh     */
38136e09d69cSdan     if( pFile->inNormalWrite && nByte==0 ){
38146e09d69cSdan       pFile->transCntrChng = 1;
38153313b14fSdrh     }
3816f23da966Sdan #endif
3817c0003312Sdan 
3818e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
3819c0003312Sdan     /* If the file was just truncated to a size smaller than the currently
3820c0003312Sdan     ** mapped region, reduce the effective mapping size as well. SQLite will
3821c0003312Sdan     ** use read() and write() to access data beyond this point from now on.
3822c0003312Sdan     */
3823c0003312Sdan     if( nByte<pFile->mmapSize ){
3824c0003312Sdan       pFile->mmapSize = nByte;
3825c0003312Sdan     }
3826e98844f7Smistachkin #endif
38273313b14fSdrh 
3828734c9864Sdrh     return SQLITE_OK;
3829734c9864Sdrh   }
3830734c9864Sdrh }
3831734c9864Sdrh 
3832734c9864Sdrh /*
3833734c9864Sdrh ** Determine the current size of a file in bytes
3834734c9864Sdrh */
unixFileSize(sqlite3_file * id,i64 * pSize)3835734c9864Sdrh static int unixFileSize(sqlite3_file *id, i64 *pSize){
3836734c9864Sdrh   int rc;
3837734c9864Sdrh   struct stat buf;
38383044b51dSdrh   assert( id );
38393044b51dSdrh   rc = osFstat(((unixFile*)id)->h, &buf);
3840734c9864Sdrh   SimulateIOError( rc=1 );
3841734c9864Sdrh   if( rc!=0 ){
38424bf66fd6Sdrh     storeLastErrno((unixFile*)id, errno);
3843734c9864Sdrh     return SQLITE_IOERR_FSTAT;
3844734c9864Sdrh   }
3845734c9864Sdrh   *pSize = buf.st_size;
3846734c9864Sdrh 
38478af6c228Sdrh   /* When opening a zero-size database, the findInodeInfo() procedure
3848734c9864Sdrh   ** writes a single byte into that file in order to work around a bug
3849734c9864Sdrh   ** in the OS-X msdos filesystem.  In order to avoid problems with upper
3850734c9864Sdrh   ** layers, we need to report this file size as zero even though it is
3851734c9864Sdrh   ** really 1.   Ticket #3260.
3852734c9864Sdrh   */
3853734c9864Sdrh   if( *pSize==1 ) *pSize = 0;
3854734c9864Sdrh 
3855734c9864Sdrh 
3856734c9864Sdrh   return SQLITE_OK;
3857734c9864Sdrh }
3858734c9864Sdrh 
3859d2cb50b7Sdrh #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
3860715ff30eSdrh /*
** Handler for proxy-locking file-control verbs.  Defined below in the
** proxy locking division.
3863715ff30eSdrh */
3864715ff30eSdrh static int proxyFileControl(sqlite3_file*,int,void*);
3865947bd809Sdrh #endif
3866715ff30eSdrh 
3867502019c8Sdan /*
3868502019c8Sdan ** This function is called to handle the SQLITE_FCNTL_SIZE_HINT
38693d4435b2Sdrh ** file-control operation.  Enlarge the database to nBytes in size
38703d4435b2Sdrh ** (rounded up to the next chunk-size).  If the database is already
38713d4435b2Sdrh ** nBytes or larger, this routine is a no-op.
3872502019c8Sdan */
fcntlSizeHint(unixFile * pFile,i64 nByte)3873502019c8Sdan static int fcntlSizeHint(unixFile *pFile, i64 nByte){
3874d589a544Smistachkin   if( pFile->szChunk>0 ){
3875502019c8Sdan     i64 nSize;                    /* Required file size */
3876502019c8Sdan     struct stat buf;              /* Used to hold return values of fstat() */
3877502019c8Sdan 
38784bf66fd6Sdrh     if( osFstat(pFile->h, &buf) ){
38794bf66fd6Sdrh       return SQLITE_IOERR_FSTAT;
38804bf66fd6Sdrh     }
3881502019c8Sdan 
3882502019c8Sdan     nSize = ((nByte+pFile->szChunk-1) / pFile->szChunk) * pFile->szChunk;
3883502019c8Sdan     if( nSize>(i64)buf.st_size ){
3884661d71afSdan 
3885502019c8Sdan #if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
3886661d71afSdan       /* The code below is handling the return value of osFallocate()
3887661d71afSdan       ** correctly. posix_fallocate() is defined to "returns zero on success,
3888661d71afSdan       ** or an error number on  failure". See the manpage for details. */
3889661d71afSdan       int err;
3890ff81231eSdrh       do{
3891661d71afSdan         err = osFallocate(pFile->h, buf.st_size, nSize-buf.st_size);
3892661d71afSdan       }while( err==EINTR );
3893789df14bSdrh       if( err && err!=EINVAL ) return SQLITE_IOERR_WRITE;
3894502019c8Sdan #else
3895592bf7faSdan       /* If the OS does not have posix_fallocate(), fake it. Write a
3896592bf7faSdan       ** single byte to the last byte in each block that falls entirely
3897592bf7faSdan       ** within the extended region. Then, if required, a single byte
3898592bf7faSdan       ** at offset (nSize-1), to set the size of the file correctly.
3899592bf7faSdan       ** This is a similar technique to that used by glibc on systems
3900592bf7faSdan       ** that do not have a real fallocate() call.
3901502019c8Sdan       */
3902502019c8Sdan       int nBlk = buf.st_blksize;  /* File-system block size */
3903ef3d66cbSdan       int nWrite = 0;             /* Number of bytes written by seekAndWrite */
3904502019c8Sdan       i64 iWrite;                 /* Next offset to write to */
3905502019c8Sdan 
3906053378dfSdrh       iWrite = (buf.st_size/nBlk)*nBlk + nBlk - 1;
3907592bf7faSdan       assert( iWrite>=buf.st_size );
3908592bf7faSdan       assert( ((iWrite+1)%nBlk)==0 );
3909053378dfSdrh       for(/*no-op*/; iWrite<nSize+nBlk-1; iWrite+=nBlk ){
3910053378dfSdrh         if( iWrite>=nSize ) iWrite = nSize - 1;
3911ef3d66cbSdan         nWrite = seekAndWrite(pFile, iWrite, "", 1);
3912502019c8Sdan         if( nWrite!=1 ) return SQLITE_IOERR_WRITE;
3913dc5df0f8Sdan       }
3914502019c8Sdan #endif
3915502019c8Sdan     }
3916502019c8Sdan   }
3917502019c8Sdan 
3918e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
39199b4c59faSdrh   if( pFile->mmapSizeMax>0 && nByte>pFile->mmapSize ){
3920f23da966Sdan     int rc;
3921f23da966Sdan     if( pFile->szChunk<=0 ){
3922f23da966Sdan       if( robust_ftruncate(pFile->h, nByte) ){
39234bf66fd6Sdrh         storeLastErrno(pFile, errno);
3924f23da966Sdan         return unixLogError(SQLITE_IOERR_TRUNCATE, "ftruncate", pFile->zPath);
3925f23da966Sdan       }
3926f23da966Sdan     }
3927f23da966Sdan 
3928f23da966Sdan     rc = unixMapfile(pFile, nByte);
3929f23da966Sdan     return rc;
3930f23da966Sdan   }
3931e98844f7Smistachkin #endif
3932f23da966Sdan 
3933502019c8Sdan   return SQLITE_OK;
3934502019c8Sdan }
3935ad94b58aSdanielk1977 
3936e3026636Sdanielk1977 /*
393760ec914cSpeter.d.reid ** If *pArg is initially negative then this is a query.  Set *pArg to
3938f12b3f60Sdrh ** 1 or 0 depending on whether or not bit mask of pFile->ctrlFlags is set.
3939f12b3f60Sdrh **
3940f12b3f60Sdrh ** If *pArg is 0 or 1, then clear or set the mask bit of pFile->ctrlFlags.
3941f12b3f60Sdrh */
unixModeBit(unixFile * pFile,unsigned char mask,int * pArg)3942f12b3f60Sdrh static void unixModeBit(unixFile *pFile, unsigned char mask, int *pArg){
3943f12b3f60Sdrh   if( *pArg<0 ){
3944f12b3f60Sdrh     *pArg = (pFile->ctrlFlags & mask)!=0;
3945f12b3f60Sdrh   }else if( (*pArg)==0 ){
3946f12b3f60Sdrh     pFile->ctrlFlags &= ~mask;
3947f12b3f60Sdrh   }else{
3948f12b3f60Sdrh     pFile->ctrlFlags |= mask;
3949f12b3f60Sdrh   }
3950f12b3f60Sdrh }
3951f12b3f60Sdrh 
3952696b33e6Sdrh /* Forward declaration */
3953696b33e6Sdrh static int unixGetTempname(int nBuf, char *zBuf);
3954a12a40c3Sdan #ifndef SQLITE_OMIT_WAL
3955aecc04d6Sdan  static int unixFcntlExternalReader(unixFile*, int*);
3956a12a40c3Sdan #endif
3957696b33e6Sdrh 
3958f12b3f60Sdrh /*
39599e33c2c1Sdrh ** Information and control of an open file handle.
396018839217Sdrh */
unixFileControl(sqlite3_file * id,int op,void * pArg)3961cc6bb3eaSdrh static int unixFileControl(sqlite3_file *id, int op, void *pArg){
3962f0b190d9Sdrh   unixFile *pFile = (unixFile*)id;
39639e33c2c1Sdrh   switch( op ){
3964d76dba7eSdrh #if defined(__linux__) && defined(SQLITE_ENABLE_BATCH_ATOMIC_WRITE)
3965efe16971Sdan     case SQLITE_FCNTL_BEGIN_ATOMIC_WRITE: {
3966efe16971Sdan       int rc = osIoctl(pFile->h, F2FS_IOC_START_ATOMIC_WRITE);
3967344f763fSdrh       return rc ? SQLITE_IOERR_BEGIN_ATOMIC : SQLITE_OK;
3968efe16971Sdan     }
3969efe16971Sdan     case SQLITE_FCNTL_COMMIT_ATOMIC_WRITE: {
3970efe16971Sdan       int rc = osIoctl(pFile->h, F2FS_IOC_COMMIT_ATOMIC_WRITE);
3971344f763fSdrh       return rc ? SQLITE_IOERR_COMMIT_ATOMIC : SQLITE_OK;
3972efe16971Sdan     }
3973efe16971Sdan     case SQLITE_FCNTL_ROLLBACK_ATOMIC_WRITE: {
3974efe16971Sdan       int rc = osIoctl(pFile->h, F2FS_IOC_ABORT_VOLATILE_WRITE);
3975344f763fSdrh       return rc ? SQLITE_IOERR_ROLLBACK_ATOMIC : SQLITE_OK;
3976efe16971Sdan     }
3977d76dba7eSdrh #endif /* __linux__ && SQLITE_ENABLE_BATCH_ATOMIC_WRITE */
3978efe16971Sdan 
39799e33c2c1Sdrh     case SQLITE_FCNTL_LOCKSTATE: {
3980f0b190d9Sdrh       *(int*)pArg = pFile->eFileLock;
39819e33c2c1Sdrh       return SQLITE_OK;
39829cbe6352Sdrh     }
39834bf66fd6Sdrh     case SQLITE_FCNTL_LAST_ERRNO: {
3984f0b190d9Sdrh       *(int*)pArg = pFile->lastErrno;
39857708e972Sdrh       return SQLITE_OK;
39867708e972Sdrh     }
39876e09d69cSdan     case SQLITE_FCNTL_CHUNK_SIZE: {
3988f0b190d9Sdrh       pFile->szChunk = *(int *)pArg;
39896e09d69cSdan       return SQLITE_OK;
39906e09d69cSdan     }
39919ff27ecdSdrh     case SQLITE_FCNTL_SIZE_HINT: {
3992da04ea4fSdan       int rc;
3993da04ea4fSdan       SimulateIOErrorBenign(1);
3994da04ea4fSdan       rc = fcntlSizeHint(pFile, *(i64 *)pArg);
3995da04ea4fSdan       SimulateIOErrorBenign(0);
3996da04ea4fSdan       return rc;
3997f0b190d9Sdrh     }
3998f0b190d9Sdrh     case SQLITE_FCNTL_PERSIST_WAL: {
3999f12b3f60Sdrh       unixModeBit(pFile, UNIXFILE_PERSIST_WAL, (int*)pArg);
4000f12b3f60Sdrh       return SQLITE_OK;
4001f0b190d9Sdrh     }
4002cb15f35fSdrh     case SQLITE_FCNTL_POWERSAFE_OVERWRITE: {
4003cb15f35fSdrh       unixModeBit(pFile, UNIXFILE_PSOW, (int*)pArg);
4004f0b190d9Sdrh       return SQLITE_OK;
40059ff27ecdSdrh     }
4006de60fc2dSdrh     case SQLITE_FCNTL_VFSNAME: {
4007de60fc2dSdrh       *(char**)pArg = sqlite3_mprintf("%s", pFile->pVfs->zName);
4008de60fc2dSdrh       return SQLITE_OK;
4009de60fc2dSdrh     }
4010696b33e6Sdrh     case SQLITE_FCNTL_TEMPFILENAME: {
4011f3cdcdccSdrh       char *zTFile = sqlite3_malloc64( pFile->pVfs->mxPathname );
4012696b33e6Sdrh       if( zTFile ){
4013696b33e6Sdrh         unixGetTempname(pFile->pVfs->mxPathname, zTFile);
4014696b33e6Sdrh         *(char**)pArg = zTFile;
4015696b33e6Sdrh       }
4016696b33e6Sdrh       return SQLITE_OK;
4017696b33e6Sdrh     }
4018b959a017Sdrh     case SQLITE_FCNTL_HAS_MOVED: {
4019b959a017Sdrh       *(int*)pArg = fileHasMoved(pFile);
4020b959a017Sdrh       return SQLITE_OK;
4021b959a017Sdrh     }
4022f0119b2eSdrh #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
4023f0119b2eSdrh     case SQLITE_FCNTL_LOCK_TIMEOUT: {
402497ccc1bdSdan       int iOld = pFile->iBusyTimeout;
4025f0119b2eSdrh       pFile->iBusyTimeout = *(int*)pArg;
402697ccc1bdSdan       *(int*)pArg = iOld;
4027f0119b2eSdrh       return SQLITE_OK;
4028f0119b2eSdrh     }
4029f0119b2eSdrh #endif
4030e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
40319b4c59faSdrh     case SQLITE_FCNTL_MMAP_SIZE: {
403234f74903Sdrh       i64 newLimit = *(i64*)pArg;
403334e258c9Sdrh       int rc = SQLITE_OK;
40349b4c59faSdrh       if( newLimit>sqlite3GlobalConfig.mxMmap ){
40359b4c59faSdrh         newLimit = sqlite3GlobalConfig.mxMmap;
40369b4c59faSdrh       }
403743c1e622Sdan 
403843c1e622Sdan       /* The value of newLimit may be eventually cast to (size_t) and passed
4039e35395a4Smistachkin       ** to mmap(). Restrict its value to 2GB if (size_t) is not at least a
4040e35395a4Smistachkin       ** 64-bit type. */
4041089df506Sdan       if( newLimit>0 && sizeof(size_t)<8 ){
404243c1e622Sdan         newLimit = (newLimit & 0x7FFFFFFF);
404343c1e622Sdan       }
404443c1e622Sdan 
40459b4c59faSdrh       *(i64*)pArg = pFile->mmapSizeMax;
404634e258c9Sdrh       if( newLimit>=0 && newLimit!=pFile->mmapSizeMax && pFile->nFetchOut==0 ){
40479b4c59faSdrh         pFile->mmapSizeMax = newLimit;
404834e258c9Sdrh         if( pFile->mmapSize>0 ){
404934e258c9Sdrh           unixUnmapfile(pFile);
405034e258c9Sdrh           rc = unixMapfile(pFile, -1);
4051bcb8a868Sdan         }
405234e258c9Sdrh       }
405334e258c9Sdrh       return rc;
4054b2d3de3bSdan     }
4055e98844f7Smistachkin #endif
4056d3d8c04fSdrh #ifdef SQLITE_DEBUG
40578f941bc7Sdrh     /* The pager calls this method to signal that it has done
40588f941bc7Sdrh     ** a rollback and that the database is therefore unchanged and
40598f941bc7Sdrh     ** it hence it is OK for the transaction change counter to be
40608f941bc7Sdrh     ** unchanged.
40618f941bc7Sdrh     */
40628f941bc7Sdrh     case SQLITE_FCNTL_DB_UNCHANGED: {
40638f941bc7Sdrh       ((unixFile*)id)->dbUpdate = 0;
40648f941bc7Sdrh       return SQLITE_OK;
40658f941bc7Sdrh     }
40668f941bc7Sdrh #endif
4067d2cb50b7Sdrh #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
40684bf66fd6Sdrh     case SQLITE_FCNTL_SET_LOCKPROXYFILE:
40694bf66fd6Sdrh     case SQLITE_FCNTL_GET_LOCKPROXYFILE: {
4070715ff30eSdrh       return proxyFileControl(id,op,pArg);
40717708e972Sdrh     }
4072d2cb50b7Sdrh #endif /* SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__) */
4073aecc04d6Sdan 
4074aecc04d6Sdan     case SQLITE_FCNTL_EXTERNAL_READER: {
4075a12a40c3Sdan #ifndef SQLITE_OMIT_WAL
4076aecc04d6Sdan       return unixFcntlExternalReader((unixFile*)id, (int*)pArg);
4077a12a40c3Sdan #else
4078a12a40c3Sdan       *(int*)pArg = 0;
4079a12a40c3Sdan       return SQLITE_OK;
4080a12a40c3Sdan #endif
4081aecc04d6Sdan     }
40820b52b7d0Sdrh   }
40830b52b7d0Sdrh   return SQLITE_NOTFOUND;
40849cbe6352Sdrh }
40859cbe6352Sdrh 
40869c06c953Sdrh /*
4087efe16971Sdan ** If pFd->sectorSize is non-zero when this function is called, it is a
4088efe16971Sdan ** no-op. Otherwise, the values of pFd->sectorSize and
4089efe16971Sdan ** pFd->deviceCharacteristics are set according to the file-system
4090efe16971Sdan ** characteristics.
4091a3d4c887Sdanielk1977 **
4092efe16971Sdan ** There are two versions of this function. One for QNX and one for all
4093efe16971Sdan ** other systems.
4094a3d4c887Sdanielk1977 */
4095537dddf0Sdrh #ifndef __QNXNTO__
setDeviceCharacteristics(unixFile * pFd)4096efe16971Sdan static void setDeviceCharacteristics(unixFile *pFd){
4097d76dba7eSdrh   assert( pFd->deviceCharacteristics==0 || pFd->sectorSize!=0 );
4098efe16971Sdan   if( pFd->sectorSize==0 ){
4099d76dba7eSdrh #if defined(__linux__) && defined(SQLITE_ENABLE_BATCH_ATOMIC_WRITE)
4100efe16971Sdan     int res;
41019d709540Sdan     u32 f = 0;
4102537dddf0Sdrh 
4103efe16971Sdan     /* Check for support for F2FS atomic batch writes. */
41049d709540Sdan     res = osIoctl(pFd->h, F2FS_IOC_GET_FEATURES, &f);
41059d709540Sdan     if( res==0 && (f & F2FS_FEATURE_ATOMIC_WRITE) ){
410677b4f528Sdan       pFd->deviceCharacteristics = SQLITE_IOCAP_BATCH_ATOMIC;
4107efe16971Sdan     }
4108d76dba7eSdrh #endif /* __linux__ && SQLITE_ENABLE_BATCH_ATOMIC_WRITE */
4109efe16971Sdan 
4110efe16971Sdan     /* Set the POWERSAFE_OVERWRITE flag if requested. */
4111efe16971Sdan     if( pFd->ctrlFlags & UNIXFILE_PSOW ){
4112efe16971Sdan       pFd->deviceCharacteristics |= SQLITE_IOCAP_POWERSAFE_OVERWRITE;
4113efe16971Sdan     }
4114efe16971Sdan 
4115efe16971Sdan     pFd->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE;
4116efe16971Sdan   }
4117efe16971Sdan }
4118efe16971Sdan #else
4119537dddf0Sdrh #include <sys/dcmd_blk.h>
4120537dddf0Sdrh #include <sys/statvfs.h>
/*
** QNX implementation.  If pFile->sectorSize is zero, derive the sector
** size and device characteristics from the file-system type name that
** fstatvfs() reports in f_basetype.  Defaults are kept when fstatvfs()
** fails or the type is not recognized.
**
** On every call the sector size is finally validated: a value that is
** not a multiple of 512 is replaced by the defaults.
*/
static void setDeviceCharacteristics(unixFile *pFile){
  if( pFile->sectorSize==0 ){
    struct statvfs sFs;          /* File-system information */
    const char *zFsType;         /* Name of the file-system type */
    int mAtomic;                 /* IOCAP_ATOMICnnn bit for the sector size */

    /* Defaults, used for file-systems that are not handled below */
    pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE;
    pFile->deviceCharacteristics = 0;
    if( fstatvfs(pFile->h, &sFs)==-1 ){
      return;
    }
    zFsType = sFs.f_basetype;

    if( strcmp(zFsType, "tmp")==0 ){
      /* Atomic 4K writes, safe append, sequential ordering */
      pFile->sectorSize = sFs.f_bsize;
      pFile->deviceCharacteristics =
          SQLITE_IOCAP_ATOMIC4K
        | SQLITE_IOCAP_SAFE_APPEND
        | SQLITE_IOCAP_SEQUENTIAL;
    }else if( strstr(zFsType, "etfs")!=0 ){
      /* Writes of the etfs cluster size are atomic */
      pFile->sectorSize = sFs.f_bsize;
      mAtomic = pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512;
      pFile->deviceCharacteristics =
          mAtomic
        | SQLITE_IOCAP_SAFE_APPEND
        | SQLITE_IOCAP_SEQUENTIAL;
    }else if( strcmp(zFsType, "qnx6")==0 ){
      /* All writes are atomic */
      pFile->sectorSize = sFs.f_bsize;
      pFile->deviceCharacteristics =
          SQLITE_IOCAP_ATOMIC
        | SQLITE_IOCAP_SAFE_APPEND
        | SQLITE_IOCAP_SEQUENTIAL;
    }else if( strcmp(zFsType, "qnx4")==0 || strstr(zFsType, "dos")!=0 ){
      /* Full bitset of atomic sizes from the sector size on down.  These
      ** two file-system families get identical treatment. */
      pFile->sectorSize = sFs.f_bsize;
      mAtomic = pFile->sectorSize / 512 * SQLITE_IOCAP_ATOMIC512;
      pFile->deviceCharacteristics =
          ((mAtomic << 1) - 2)
        | SQLITE_IOCAP_SEQUENTIAL;
    }else{
      /* Anything else: 512-byte blocks are atomic, append is safe.  The
      ** default sector size set above is kept. */
      pFile->deviceCharacteristics =
          SQLITE_IOCAP_ATOMIC512
        | SQLITE_IOCAP_SAFE_APPEND;
    }
  }

  /* Last chance verification.  A sector size that is not a multiple of
  ** 512 is not valid, so revert to the defaults. */
  if( (pFile->sectorSize % 512)!=0 ){
    pFile->deviceCharacteristics = 0;
    pFile->sectorSize = SQLITE_DEFAULT_SECTOR_SIZE;
  }
}
4191efe16971Sdan #endif
4192efe16971Sdan 
4193efe16971Sdan /*
4194efe16971Sdan ** Return the sector size in bytes of the underlying block device for
4195efe16971Sdan ** the specified file. This is almost always 512 bytes, but may be
4196efe16971Sdan ** larger for some devices.
4197efe16971Sdan **
4198efe16971Sdan ** SQLite code assumes this function cannot fail. It also assumes that
4199efe16971Sdan ** if two files are created in the same file-system directory (i.e.
4200efe16971Sdan ** a database and its journal file) that the sector size will be the
4201efe16971Sdan ** same for both.
4202efe16971Sdan */
unixSectorSize(sqlite3_file * id)4203efe16971Sdan static int unixSectorSize(sqlite3_file *id){
4204efe16971Sdan   unixFile *pFd = (unixFile*)id;
4205efe16971Sdan   setDeviceCharacteristics(pFd);
4206efe16971Sdan   return pFd->sectorSize;
4207efe16971Sdan }
4208a3d4c887Sdanielk1977 
420990949c20Sdanielk1977 /*
4210f12b3f60Sdrh ** Return the device characteristics for the file.
4211f12b3f60Sdrh **
4212cb15f35fSdrh ** This VFS is set up to return SQLITE_IOCAP_POWERSAFE_OVERWRITE by default.
421360ec914cSpeter.d.reid ** However, that choice is controversial since technically the underlying
4214cb15f35fSdrh ** file system does not always provide powersafe overwrites.  (In other
4215cb15f35fSdrh ** words, after a power-loss event, parts of the file that were never
4216cb15f35fSdrh ** written might end up being altered.)  However, non-PSOW behavior is very,
4217cb15f35fSdrh ** very rare.  And asserting PSOW makes a large reduction in the amount
4218cb15f35fSdrh ** of required I/O for journaling, since a lot of padding is eliminated.
4219cb15f35fSdrh **  Hence, while POWERSAFE_OVERWRITE is on by default, there is a file-control
4220cb15f35fSdrh ** available to turn it off and URI query parameter available to turn it off.
422190949c20Sdanielk1977 */
unixDeviceCharacteristics(sqlite3_file * id)4222f12b3f60Sdrh static int unixDeviceCharacteristics(sqlite3_file *id){
4223efe16971Sdan   unixFile *pFd = (unixFile*)id;
4224efe16971Sdan   setDeviceCharacteristics(pFd);
4225efe16971Sdan   return pFd->deviceCharacteristics;
422662079060Sdanielk1977 }
422762079060Sdanielk1977 
4228702eec1cSdan #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0
4229d9e5c4f6Sdrh 
/*
** Return the page size of the system, in bytes.
**
** Other code in this file must not call this function directly.  It is
** meant to be reached through the osGetpagesize() macro only.
*/
static int unixGetpagesize(void){
  int szPage;
#if OS_VXWORKS
  szPage = 1024;
#elif defined(_BSD_SOURCE)
  szPage = getpagesize();
#else
  szPage = (int)sysconf(_SC_PAGESIZE);
#endif
  return szPage;
}
4245702eec1cSdan 
4246702eec1cSdan #endif /* !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0 */
4247702eec1cSdan 
4248702eec1cSdan #ifndef SQLITE_OMIT_WAL
4249d9e5c4f6Sdrh 
4250d9e5c4f6Sdrh /*
4251d91c68f6Sdrh ** Object used to represent an shared memory buffer.
4252d91c68f6Sdrh **
4253d91c68f6Sdrh ** When multiple threads all reference the same wal-index, each thread
4254d91c68f6Sdrh ** has its own unixShm object, but they all point to a single instance
4255d91c68f6Sdrh ** of this unixShmNode object.  In other words, each wal-index is opened
4256d91c68f6Sdrh ** only once per process.
4257d91c68f6Sdrh **
4258d91c68f6Sdrh ** Each unixShmNode object is connected to a single unixInodeInfo object.
4259d91c68f6Sdrh ** We could coalesce this object into unixInodeInfo, but that would mean
4260d91c68f6Sdrh ** every open file that does not use shared memory (in other words, most
4261d91c68f6Sdrh ** open files) would have to carry around this extra information.  So
4262d91c68f6Sdrh ** the unixInodeInfo object contains a pointer to this unixShmNode object
4263d91c68f6Sdrh ** and the unixShmNode object is created only when needed.
4264d9e5c4f6Sdrh **
4265d9e5c4f6Sdrh ** unixMutexHeld() must be true when creating or destroying
4266d9e5c4f6Sdrh ** this object or while reading or writing the following fields:
4267d9e5c4f6Sdrh **
4268d9e5c4f6Sdrh **      nRef
4269d9e5c4f6Sdrh **
4270d9e5c4f6Sdrh ** The following fields are read-only after the object is created:
4271d9e5c4f6Sdrh **
42728820c8d3Sdrh **      hShm
4273d9e5c4f6Sdrh **      zFilename
4274d9e5c4f6Sdrh **
42758820c8d3Sdrh ** Either unixShmNode.pShmMutex must be held or unixShmNode.nRef==0 and
4276d9e5c4f6Sdrh ** unixMutexHeld() is true when reading or writing any other field
4277d9e5c4f6Sdrh ** in this structure.
4278d9e5c4f6Sdrh */
4279d91c68f6Sdrh struct unixShmNode {
4280d91c68f6Sdrh   unixInodeInfo *pInode;     /* unixInodeInfo that owns this SHM node */
428124efa544Sdrh   sqlite3_mutex *pShmMutex;  /* Mutex to access this object */
4282d9e5c4f6Sdrh   char *zFilename;           /* Name of the mmapped file */
42838820c8d3Sdrh   int hShm;                  /* Open file descriptor */
428418801915Sdan   int szRegion;              /* Size of shared-memory regions */
428566dfec8bSdrh   u16 nRegion;               /* Size of array apRegion */
428666dfec8bSdrh   u8 isReadonly;             /* True if read-only */
428792c02da3Sdan   u8 isUnlocked;             /* True if no DMS lock held */
428818801915Sdan   char **apRegion;           /* Array of mapped shared-memory regions */
4289d9e5c4f6Sdrh   int nRef;                  /* Number of unixShm objects pointing to this */
4290d9e5c4f6Sdrh   unixShm *pFirst;           /* All unixShm objects pointing to this */
42918337da66Sdan   int aLock[SQLITE_SHM_NLOCK];  /* # shared locks on slot, -1==excl lock */
4292d9e5c4f6Sdrh #ifdef SQLITE_DEBUG
4293d9e5c4f6Sdrh   u8 exclMask;               /* Mask of exclusive locks held */
4294d9e5c4f6Sdrh   u8 sharedMask;             /* Mask of shared locks held */
4295d9e5c4f6Sdrh   u8 nextShmId;              /* Next available unixShm.id value */
4296d9e5c4f6Sdrh #endif
4297d9e5c4f6Sdrh };
4298d9e5c4f6Sdrh 
4299d9e5c4f6Sdrh /*
4300d9e5c4f6Sdrh ** Structure used internally by this VFS to record the state of an
4301d9e5c4f6Sdrh ** open shared memory connection.
4302d9e5c4f6Sdrh **
4303d91c68f6Sdrh ** The following fields are initialized when this object is created and
4304d91c68f6Sdrh ** are read-only thereafter:
4305d9e5c4f6Sdrh **
430624efa544Sdrh **    unixShm.pShmNode
4307d91c68f6Sdrh **    unixShm.id
4308d91c68f6Sdrh **
430924efa544Sdrh ** All other fields are read/write.  The unixShm.pShmNode->pShmMutex must
431024efa544Sdrh ** be held while accessing any read/write fields.
4311d9e5c4f6Sdrh */
4312d9e5c4f6Sdrh struct unixShm {
4313d91c68f6Sdrh   unixShmNode *pShmNode;     /* The underlying unixShmNode object */
4314d91c68f6Sdrh   unixShm *pNext;            /* Next unixShm with the same unixShmNode */
431524efa544Sdrh   u8 hasMutex;               /* True if holding the unixShmNode->pShmMutex */
4316fd53231cSdrh   u8 id;                     /* Id of this connection within its unixShmNode */
431773b64e4dSdrh   u16 sharedMask;            /* Mask of shared locks held */
431873b64e4dSdrh   u16 exclMask;              /* Mask of exclusive locks held */
4319d9e5c4f6Sdrh };
4320d9e5c4f6Sdrh 
/*
** Byte offsets used for shared-memory locking.  UNIX_SHM_BASE is the
** first lock byte; the deadman-switch byte UNIX_SHM_DMS comes
** SQLITE_SHM_NLOCK bytes after it.
*/
#define UNIX_SHM_BASE   (4*(SQLITE_SHM_NLOCK+22))         /* first lock byte */
#define UNIX_SHM_DMS    (SQLITE_SHM_NLOCK+UNIX_SHM_BASE)  /* deadman switch */
4326d9e5c4f6Sdrh 
4327d9e5c4f6Sdrh /*
4328aecc04d6Sdan ** Use F_GETLK to check whether or not there are any readers with open
4329aecc04d6Sdan ** wal-mode transactions in other processes on database file pFile. If
4330aecc04d6Sdan ** no error occurs, return SQLITE_OK and set (*piOut) to 1 if there are
4331aecc04d6Sdan ** such transactions, or 0 otherwise. If an error occurs, return an
4332aecc04d6Sdan ** SQLite error code. The final value of *piOut is undefined in this
4333aecc04d6Sdan ** case.
4334aecc04d6Sdan */
unixFcntlExternalReader(unixFile * pFile,int * piOut)4335aecc04d6Sdan static int unixFcntlExternalReader(unixFile *pFile, int *piOut){
4336aecc04d6Sdan   int rc = SQLITE_OK;
4337aecc04d6Sdan   *piOut = 0;
4338aecc04d6Sdan   if( pFile->pShm){
4339aecc04d6Sdan     unixShmNode *pShmNode = pFile->pShm->pShmNode;
4340aecc04d6Sdan     struct flock f;
4341aecc04d6Sdan 
4342aecc04d6Sdan     memset(&f, 0, sizeof(f));
4343aecc04d6Sdan     f.l_type = F_WRLCK;
4344aecc04d6Sdan     f.l_whence = SEEK_SET;
4345aecc04d6Sdan     f.l_start = UNIX_SHM_BASE + 3;
4346aecc04d6Sdan     f.l_len = SQLITE_SHM_NLOCK - 3;
4347aecc04d6Sdan 
4348aecc04d6Sdan     sqlite3_mutex_enter(pShmNode->pShmMutex);
4349aecc04d6Sdan     if( osFcntl(pShmNode->hShm, F_GETLK, &f)<0 ){
4350aecc04d6Sdan       rc = SQLITE_IOERR_LOCK;
4351aecc04d6Sdan     }else{
4352aecc04d6Sdan       *piOut = (f.l_type!=F_UNLCK);
4353aecc04d6Sdan     }
4354aecc04d6Sdan     sqlite3_mutex_leave(pShmNode->pShmMutex);
4355aecc04d6Sdan   }
4356aecc04d6Sdan 
4357aecc04d6Sdan   return rc;
4358aecc04d6Sdan }
4359aecc04d6Sdan 
4360aecc04d6Sdan 
4361aecc04d6Sdan /*
436273b64e4dSdrh ** Apply posix advisory locks for all bytes from ofst through ofst+n-1.
4363d9e5c4f6Sdrh **
4364d9e5c4f6Sdrh ** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking
4365d9e5c4f6Sdrh ** otherwise.
4366d9e5c4f6Sdrh */
unixShmSystemLock(unixFile * pFile,int lockType,int ofst,int n)4367d9e5c4f6Sdrh static int unixShmSystemLock(
4368bbf76eecSdrh   unixFile *pFile,       /* Open connection to the WAL file */
4369d9e5c4f6Sdrh   int lockType,          /* F_UNLCK, F_RDLCK, or F_WRLCK */
437073b64e4dSdrh   int ofst,              /* First byte of the locking range */
437173b64e4dSdrh   int n                  /* Number of bytes to lock */
4372d9e5c4f6Sdrh ){
4373bbf76eecSdrh   unixShmNode *pShmNode; /* Apply locks to this open shared-memory segment */
4374d9e5c4f6Sdrh   struct flock f;        /* The posix advisory locking structure */
437573b64e4dSdrh   int rc = SQLITE_OK;    /* Result code form fcntl() */
4376d9e5c4f6Sdrh 
4377d91c68f6Sdrh   /* Access to the unixShmNode object is serialized by the caller */
4378bbf76eecSdrh   pShmNode = pFile->pInode->pShmNode;
437924efa544Sdrh   assert( pShmNode->nRef==0 || sqlite3_mutex_held(pShmNode->pShmMutex) );
43809b7e8e10Sdrh   assert( pShmNode->nRef>0 || unixMutexHeld() );
4381d9e5c4f6Sdrh 
43829181ae99Sdan   /* Shared locks never span more than one byte */
43839181ae99Sdan   assert( n==1 || lockType!=F_RDLCK );
43849181ae99Sdan 
43859181ae99Sdan   /* Locks are within range */
43869181ae99Sdan   assert( n>=1 && n<=SQLITE_SHM_NLOCK );
43879181ae99Sdan 
43888820c8d3Sdrh   if( pShmNode->hShm>=0 ){
43897bb8b8a4Sdan     int res;
4390d9e5c4f6Sdrh     /* Initialize the locking parameters */
4391d9e5c4f6Sdrh     f.l_type = lockType;
4392d9e5c4f6Sdrh     f.l_whence = SEEK_SET;
4393c99597caSdrh     f.l_start = ofst;
439473b64e4dSdrh     f.l_len = n;
43957bb8b8a4Sdan     res = osSetPosixAdvisoryLock(pShmNode->hShm, &f, pFile);
43967bb8b8a4Sdan     if( res==-1 ){
43977a623e1dSdan #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
43987bb8b8a4Sdan       rc = (pFile->iBusyTimeout ? SQLITE_BUSY_TIMEOUT : SQLITE_BUSY);
43997a623e1dSdan #else
44007a623e1dSdan       rc = SQLITE_BUSY;
44017a623e1dSdan #endif
44027bb8b8a4Sdan     }
44033cb9339aSdrh   }
4404d9e5c4f6Sdrh 
4405d9e5c4f6Sdrh   /* Update the global lock state and do debug tracing */
4406d9e5c4f6Sdrh #ifdef SQLITE_DEBUG
44079181ae99Sdan   { u16 mask;
4408d9e5c4f6Sdrh   OSTRACE(("SHM-LOCK "));
4409693e6719Sdrh   mask = ofst>31 ? 0xffff : (1<<(ofst+n)) - (1<<ofst);
4410d9e5c4f6Sdrh   if( rc==SQLITE_OK ){
4411d9e5c4f6Sdrh     if( lockType==F_UNLCK ){
441273b64e4dSdrh       OSTRACE(("unlock %d ok", ofst));
441373b64e4dSdrh       pShmNode->exclMask &= ~mask;
441473b64e4dSdrh       pShmNode->sharedMask &= ~mask;
4415d9e5c4f6Sdrh     }else if( lockType==F_RDLCK ){
441673b64e4dSdrh       OSTRACE(("read-lock %d ok", ofst));
441773b64e4dSdrh       pShmNode->exclMask &= ~mask;
441873b64e4dSdrh       pShmNode->sharedMask |= mask;
4419d9e5c4f6Sdrh     }else{
4420d9e5c4f6Sdrh       assert( lockType==F_WRLCK );
442173b64e4dSdrh       OSTRACE(("write-lock %d ok", ofst));
442273b64e4dSdrh       pShmNode->exclMask |= mask;
442373b64e4dSdrh       pShmNode->sharedMask &= ~mask;
4424d9e5c4f6Sdrh     }
4425d9e5c4f6Sdrh   }else{
4426d9e5c4f6Sdrh     if( lockType==F_UNLCK ){
442773b64e4dSdrh       OSTRACE(("unlock %d failed", ofst));
4428d9e5c4f6Sdrh     }else if( lockType==F_RDLCK ){
4429d9e5c4f6Sdrh       OSTRACE(("read-lock failed"));
4430d9e5c4f6Sdrh     }else{
4431d9e5c4f6Sdrh       assert( lockType==F_WRLCK );
443273b64e4dSdrh       OSTRACE(("write-lock %d failed", ofst));
4433d9e5c4f6Sdrh     }
4434d9e5c4f6Sdrh   }
443520e1f08eSdrh   OSTRACE((" - afterwards %03x,%03x\n",
443620e1f08eSdrh            pShmNode->sharedMask, pShmNode->exclMask));
443773b64e4dSdrh   }
4438d9e5c4f6Sdrh #endif
4439d9e5c4f6Sdrh 
4440d9e5c4f6Sdrh   return rc;
4441d9e5c4f6Sdrh }
4442d9e5c4f6Sdrh 
/*
** Return the minimum number of 32KB shm regions that must be mapped
** together, given that every mapping has to be an integer multiple of the
** system page-size.
**
** The result is 1 whenever the page size is smaller than 32KB.  Otherwise
** it is the page size divided by 32KB - for example 2 on a system that is
** configured to use 64KB pages.
*/
static int unixShmRegionPerMap(void){
  const int szShmRegion = 32*1024;      /* Size of one shm region */
  int szPage = osGetpagesize();         /* System page size */
  assert( (szPage & (szPage-1))==0 );   /* Page size must be a power of 2 */
  return szPage<szShmRegion ? 1 : szPage/szShmRegion;
}
4459d9e5c4f6Sdrh 
4460d9e5c4f6Sdrh /*
4461d91c68f6Sdrh ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
4462d9e5c4f6Sdrh **
4463d9e5c4f6Sdrh ** This is not a VFS shared-memory method; it is a utility function called
4464d9e5c4f6Sdrh ** by VFS shared-memory methods.
4465d9e5c4f6Sdrh */
unixShmPurge(unixFile * pFd)4466d91c68f6Sdrh static void unixShmPurge(unixFile *pFd){
4467d91c68f6Sdrh   unixShmNode *p = pFd->pInode->pShmNode;
4468d9e5c4f6Sdrh   assert( unixMutexHeld() );
4469f3b1ed0fSdrh   if( p && ALWAYS(p->nRef==0) ){
4470781e34cdSdan     int nShmPerMap = unixShmRegionPerMap();
447113a3cb82Sdan     int i;
4472d91c68f6Sdrh     assert( p->pInode==pFd->pInode );
447324efa544Sdrh     sqlite3_mutex_free(p->pShmMutex);
4474781e34cdSdan     for(i=0; i<p->nRegion; i+=nShmPerMap){
44758820c8d3Sdrh       if( p->hShm>=0 ){
4476d1ab8065Sdrh         osMunmap(p->apRegion[i], p->szRegion);
44773cb9339aSdrh       }else{
44783cb9339aSdrh         sqlite3_free(p->apRegion[i]);
44793cb9339aSdrh       }
448013a3cb82Sdan     }
448118801915Sdan     sqlite3_free(p->apRegion);
44828820c8d3Sdrh     if( p->hShm>=0 ){
44838820c8d3Sdrh       robust_close(pFd, p->hShm, __LINE__);
44848820c8d3Sdrh       p->hShm = -1;
44850e9365ceSdrh     }
4486d91c68f6Sdrh     p->pInode->pShmNode = 0;
4487d9e5c4f6Sdrh     sqlite3_free(p);
4488d9e5c4f6Sdrh   }
4489d9e5c4f6Sdrh }
4490d9e5c4f6Sdrh 
4491d9e5c4f6Sdrh /*
449292c02da3Sdan ** The DMS lock has not yet been taken on shm file pShmNode. Attempt to
449392c02da3Sdan ** take it now. Return SQLITE_OK if successful, or an SQLite error
449492c02da3Sdan ** code otherwise.
449592c02da3Sdan **
449692c02da3Sdan ** If the DMS cannot be locked because this is a readonly_shm=1
449792c02da3Sdan ** connection and no other process already holds a lock, return
44987e45e3a5Sdrh ** SQLITE_READONLY_CANTINIT and set pShmNode->isUnlocked=1.
449992c02da3Sdan */
unixLockSharedMemory(unixFile * pDbFd,unixShmNode * pShmNode)450092c02da3Sdan static int unixLockSharedMemory(unixFile *pDbFd, unixShmNode *pShmNode){
450192c02da3Sdan   struct flock lock;
450292c02da3Sdan   int rc = SQLITE_OK;
450392c02da3Sdan 
450492c02da3Sdan   /* Use F_GETLK to determine the locks other processes are holding
450592c02da3Sdan   ** on the DMS byte. If it indicates that another process is holding
450692c02da3Sdan   ** a SHARED lock, then this process may also take a SHARED lock
450792c02da3Sdan   ** and proceed with opening the *-shm file.
450892c02da3Sdan   **
450992c02da3Sdan   ** Or, if no other process is holding any lock, then this process
451092c02da3Sdan   ** is the first to open it. In this case take an EXCLUSIVE lock on the
451192c02da3Sdan   ** DMS byte and truncate the *-shm file to zero bytes in size. Then
451292c02da3Sdan   ** downgrade to a SHARED lock on the DMS byte.
451392c02da3Sdan   **
451492c02da3Sdan   ** If another process is holding an EXCLUSIVE lock on the DMS byte,
451592c02da3Sdan   ** return SQLITE_BUSY to the caller (it will try again). An earlier
451692c02da3Sdan   ** version of this code attempted the SHARED lock at this point. But
451792c02da3Sdan   ** this introduced a subtle race condition: if the process holding
451892c02da3Sdan   ** EXCLUSIVE failed just before truncating the *-shm file, then this
451992c02da3Sdan   ** process might open and use the *-shm file without truncating it.
452092c02da3Sdan   ** And if the *-shm file has been corrupted by a power failure or
452192c02da3Sdan   ** system crash, the database itself may also become corrupt.  */
452292c02da3Sdan   lock.l_whence = SEEK_SET;
452392c02da3Sdan   lock.l_start = UNIX_SHM_DMS;
452492c02da3Sdan   lock.l_len = 1;
452592c02da3Sdan   lock.l_type = F_WRLCK;
45268820c8d3Sdrh   if( osFcntl(pShmNode->hShm, F_GETLK, &lock)!=0 ) {
452792c02da3Sdan     rc = SQLITE_IOERR_LOCK;
452892c02da3Sdan   }else if( lock.l_type==F_UNLCK ){
452992c02da3Sdan     if( pShmNode->isReadonly ){
453092c02da3Sdan       pShmNode->isUnlocked = 1;
45317e45e3a5Sdrh       rc = SQLITE_READONLY_CANTINIT;
453292c02da3Sdan     }else{
453392c02da3Sdan       rc = unixShmSystemLock(pDbFd, F_WRLCK, UNIX_SHM_DMS, 1);
4534f7f2a82aSdrh       /* The first connection to attach must truncate the -shm file.  We
4535f7f2a82aSdrh       ** truncate to 3 bytes (an arbitrary small number, less than the
4536f7f2a82aSdrh       ** -shm header size) rather than 0 as a system debugging aid, to
4537f7f2a82aSdrh       ** help detect if a -shm file truncation is legitimate or is the work
4538f7f2a82aSdrh       ** or a rogue process. */
4539f7f2a82aSdrh       if( rc==SQLITE_OK && robust_ftruncate(pShmNode->hShm, 3) ){
454092c02da3Sdan         rc = unixLogError(SQLITE_IOERR_SHMOPEN,"ftruncate",pShmNode->zFilename);
454192c02da3Sdan       }
454292c02da3Sdan     }
454392c02da3Sdan   }else if( lock.l_type==F_WRLCK ){
454492c02da3Sdan     rc = SQLITE_BUSY;
454592c02da3Sdan   }
454692c02da3Sdan 
454792c02da3Sdan   if( rc==SQLITE_OK ){
454892c02da3Sdan     assert( lock.l_type==F_UNLCK || lock.l_type==F_RDLCK );
454992c02da3Sdan     rc = unixShmSystemLock(pDbFd, F_RDLCK, UNIX_SHM_DMS, 1);
455092c02da3Sdan   }
455192c02da3Sdan   return rc;
455292c02da3Sdan }
455392c02da3Sdan 
455492c02da3Sdan /*
4555da9fe0c3Sdan ** Open a shared-memory area associated with open database file pDbFd.
45567234c6d6Sdrh ** This particular implementation uses mmapped files.
4557d9e5c4f6Sdrh **
45587234c6d6Sdrh ** The file used to implement shared-memory is in the same directory
45597234c6d6Sdrh ** as the open database file and has the same name as the open database
45607234c6d6Sdrh ** file with the "-shm" suffix added.  For example, if the database file
45617234c6d6Sdrh ** is "/home/user1/config.db" then the file that is created and mmapped
4562a4ced195Sdrh ** for shared memory will be called "/home/user1/config.db-shm".
4563a4ced195Sdrh **
4564a4ced195Sdrh ** Another approach to is to use files in /dev/shm or /dev/tmp or an
4565a4ced195Sdrh ** some other tmpfs mount. But if a file in a different directory
4566a4ced195Sdrh ** from the database file is used, then differing access permissions
4567a4ced195Sdrh ** or a chroot() might cause two different processes on the same
4568a4ced195Sdrh ** database to end up using different files for shared memory -
4569a4ced195Sdrh ** meaning that their memory would not really be shared - resulting
4570a4ced195Sdrh ** in database corruption.  Nevertheless, this tmpfs file usage
4571a4ced195Sdrh ** can be enabled at compile-time using -DSQLITE_SHM_DIRECTORY="/dev/shm"
4572a4ced195Sdrh ** or the equivalent.  The use of the SQLITE_SHM_DIRECTORY compile-time
4573a4ced195Sdrh ** option results in an incompatible build of SQLite;  builds of SQLite
4574a4ced195Sdrh ** that with differing SQLITE_SHM_DIRECTORY settings attempt to use the
4575a4ced195Sdrh ** same database file at the same time, database corruption will likely
4576a4ced195Sdrh ** result. The SQLITE_SHM_DIRECTORY compile-time option is considered
4577a4ced195Sdrh ** "unsupported" and may go away in a future SQLite release.
4578d9e5c4f6Sdrh **
4579d9e5c4f6Sdrh ** When opening a new shared-memory file, if no other instances of that
4580d9e5c4f6Sdrh ** file are currently open, in this process or in other processes, then
4581d9e5c4f6Sdrh ** the file must be truncated to zero length or have its header cleared.
45823cb9339aSdrh **
45833cb9339aSdrh ** If the original database file (pDbFd) is using the "unix-excl" VFS
45843cb9339aSdrh ** that means that an exclusive lock is held on the database file and
45853cb9339aSdrh ** that no other processes are able to read or write the database.  In
45863cb9339aSdrh ** that case, we do not really need shared memory.  No shared memory
45873cb9339aSdrh ** file is created.  The shared memory will be simulated with heap memory.
4588d9e5c4f6Sdrh */
unixOpenSharedMemory(unixFile * pDbFd)4589da9fe0c3Sdan static int unixOpenSharedMemory(unixFile *pDbFd){
4590d9e5c4f6Sdrh   struct unixShm *p = 0;          /* The connection to be opened */
4591da9fe0c3Sdan   struct unixShmNode *pShmNode;   /* The underlying mmapped file */
459292c02da3Sdan   int rc = SQLITE_OK;             /* Result code */
45938b3cf82dSdrh   unixInodeInfo *pInode;          /* The inode of fd */
4594f12ba66cSdan   char *zShm;             /* Name of the file used for SHM */
45957234c6d6Sdrh   int nShmFilename;               /* Size of the SHM filename in bytes */
4596d9e5c4f6Sdrh 
4597da9fe0c3Sdan   /* Allocate space for the new unixShm object. */
4598f3cdcdccSdrh   p = sqlite3_malloc64( sizeof(*p) );
4599fad3039cSmistachkin   if( p==0 ) return SQLITE_NOMEM_BKPT;
4600d9e5c4f6Sdrh   memset(p, 0, sizeof(*p));
4601d9e5c4f6Sdrh   assert( pDbFd->pShm==0 );
4602d9e5c4f6Sdrh 
4603d91c68f6Sdrh   /* Check to see if a unixShmNode object already exists. Reuse an existing
4604d91c68f6Sdrh   ** one if present. Create a new one if necessary.
4605d9e5c4f6Sdrh   */
4606095908e1Sdrh   assert( unixFileMutexNotheld(pDbFd) );
4607d9e5c4f6Sdrh   unixEnterMutex();
46088b3cf82dSdrh   pInode = pDbFd->pInode;
46098b3cf82dSdrh   pShmNode = pInode->pShmNode;
4610d91c68f6Sdrh   if( pShmNode==0 ){
4611ddb0ac4bSdan     struct stat sStat;                 /* fstat() info for database file */
46124bf66fd6Sdrh #ifndef SQLITE_SHM_DIRECTORY
46134bf66fd6Sdrh     const char *zBasePath = pDbFd->zPath;
46144bf66fd6Sdrh #endif
4615ddb0ac4bSdan 
4616ddb0ac4bSdan     /* Call fstat() to figure out the permissions on the database file. If
4617ddb0ac4bSdan     ** a new *-shm file is created, an attempt will be made to create it
46188c815d14Sdrh     ** with the same permissions.
4619ddb0ac4bSdan     */
4620f3b1ed0fSdrh     if( osFstat(pDbFd->h, &sStat) ){
4621ddb0ac4bSdan       rc = SQLITE_IOERR_FSTAT;
4622ddb0ac4bSdan       goto shm_open_err;
4623ddb0ac4bSdan     }
4624ddb0ac4bSdan 
4625a4ced195Sdrh #ifdef SQLITE_SHM_DIRECTORY
462652bcde0eSdrh     nShmFilename = sizeof(SQLITE_SHM_DIRECTORY) + 31;
4627a4ced195Sdrh #else
46284bf66fd6Sdrh     nShmFilename = 6 + (int)strlen(zBasePath);
4629a4ced195Sdrh #endif
4630f3cdcdccSdrh     pShmNode = sqlite3_malloc64( sizeof(*pShmNode) + nShmFilename );
4631d91c68f6Sdrh     if( pShmNode==0 ){
4632fad3039cSmistachkin       rc = SQLITE_NOMEM_BKPT;
4633d9e5c4f6Sdrh       goto shm_open_err;
4634d9e5c4f6Sdrh     }
46359cb5a0d9Sdrh     memset(pShmNode, 0, sizeof(*pShmNode)+nShmFilename);
4636f12ba66cSdan     zShm = pShmNode->zFilename = (char*)&pShmNode[1];
4637a4ced195Sdrh #ifdef SQLITE_SHM_DIRECTORY
4638f12ba66cSdan     sqlite3_snprintf(nShmFilename, zShm,
4639a4ced195Sdrh                      SQLITE_SHM_DIRECTORY "/sqlite-shm-%x-%x",
4640a4ced195Sdrh                      (u32)sStat.st_ino, (u32)sStat.st_dev);
4641a4ced195Sdrh #else
4642f12ba66cSdan     sqlite3_snprintf(nShmFilename, zShm, "%s-shm", zBasePath);
4643f12ba66cSdan     sqlite3FileSuffix3(pDbFd->zPath, zShm);
4644a4ced195Sdrh #endif
46458820c8d3Sdrh     pShmNode->hShm = -1;
4646d91c68f6Sdrh     pDbFd->pInode->pShmNode = pShmNode;
4647d91c68f6Sdrh     pShmNode->pInode = pDbFd->pInode;
464897a7e5e6Sdrh     if( sqlite3GlobalConfig.bCoreMutex ){
464924efa544Sdrh       pShmNode->pShmMutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
465024efa544Sdrh       if( pShmNode->pShmMutex==0 ){
4651fad3039cSmistachkin         rc = SQLITE_NOMEM_BKPT;
4652d91c68f6Sdrh         goto shm_open_err;
4653d91c68f6Sdrh       }
465497a7e5e6Sdrh     }
4655d9e5c4f6Sdrh 
46563cb9339aSdrh     if( pInode->bProcessLock==0 ){
4657f12ba66cSdan       if( 0==sqlite3_uri_boolean(pDbFd->zPath, "readonly_shm", 0) ){
4658c398c65bSdrh         pShmNode->hShm = robust_open(zShm, O_RDWR|O_CREAT|O_NOFOLLOW,
4659c398c65bSdrh                                      (sStat.st_mode&0777));
466066dfec8bSdrh       }
46618820c8d3Sdrh       if( pShmNode->hShm<0 ){
4662c398c65bSdrh         pShmNode->hShm = robust_open(zShm, O_RDONLY|O_NOFOLLOW,
4663c398c65bSdrh                                      (sStat.st_mode&0777));
46648820c8d3Sdrh         if( pShmNode->hShm<0 ){
4665f12ba66cSdan           rc = unixLogError(SQLITE_CANTOPEN_BKPT, "open", zShm);
4666d9e5c4f6Sdrh           goto shm_open_err;
4667d9e5c4f6Sdrh         }
4668f12ba66cSdan         pShmNode->isReadonly = 1;
4669f12ba66cSdan       }
4670d9e5c4f6Sdrh 
4671ac7c3ac1Sdrh       /* If this process is running as root, make sure that the SHM file
4672ac7c3ac1Sdrh       ** is owned by the same user that owns the original database.  Otherwise,
4673ed466827Sdrh       ** the original owner will not be able to connect.
4674ac7c3ac1Sdrh       */
46758820c8d3Sdrh       robustFchown(pShmNode->hShm, sStat.st_uid, sStat.st_gid);
4676ac7c3ac1Sdrh 
467792c02da3Sdan       rc = unixLockSharedMemory(pDbFd, pShmNode);
46787e45e3a5Sdrh       if( rc!=SQLITE_OK && rc!=SQLITE_READONLY_CANTINIT ) goto shm_open_err;
4679d9e5c4f6Sdrh     }
46803cb9339aSdrh   }
4681d9e5c4f6Sdrh 
4682d91c68f6Sdrh   /* Make the new connection a child of the unixShmNode */
4683d91c68f6Sdrh   p->pShmNode = pShmNode;
4684d9e5c4f6Sdrh #ifdef SQLITE_DEBUG
4685d91c68f6Sdrh   p->id = pShmNode->nextShmId++;
4686d9e5c4f6Sdrh #endif
4687d91c68f6Sdrh   pShmNode->nRef++;
4688d9e5c4f6Sdrh   pDbFd->pShm = p;
4689d9e5c4f6Sdrh   unixLeaveMutex();
46900668f591Sdan 
46910668f591Sdan   /* The reference count on pShmNode has already been incremented under
46920668f591Sdan   ** the cover of the unixEnterMutex() mutex and the pointer from the
46930668f591Sdan   ** new (struct unixShm) object to the pShmNode has been set. All that is
46940668f591Sdan   ** left to do is to link the new object into the linked list starting
469524efa544Sdrh   ** at pShmNode->pFirst. This must be done while holding the
469624efa544Sdrh   ** pShmNode->pShmMutex.
46970668f591Sdan   */
469824efa544Sdrh   sqlite3_mutex_enter(pShmNode->pShmMutex);
46990668f591Sdan   p->pNext = pShmNode->pFirst;
47000668f591Sdan   pShmNode->pFirst = p;
470124efa544Sdrh   sqlite3_mutex_leave(pShmNode->pShmMutex);
470292c02da3Sdan   return rc;
4703d9e5c4f6Sdrh 
4704d9e5c4f6Sdrh   /* Jump here on any error */
4705d9e5c4f6Sdrh shm_open_err:
4706d91c68f6Sdrh   unixShmPurge(pDbFd);       /* This call frees pShmNode if required */
4707d9e5c4f6Sdrh   sqlite3_free(p);
4708d9e5c4f6Sdrh   unixLeaveMutex();
4709d9e5c4f6Sdrh   return rc;
4710d9e5c4f6Sdrh }
4711d9e5c4f6Sdrh 
4712d9e5c4f6Sdrh /*
4713da9fe0c3Sdan ** This function is called to obtain a pointer to region iRegion of the
4714da9fe0c3Sdan ** shared-memory associated with the database file fd. Shared-memory regions
4715da9fe0c3Sdan ** are numbered starting from zero. Each shared-memory region is szRegion
4716da9fe0c3Sdan ** bytes in size.
4717da9fe0c3Sdan **
4718da9fe0c3Sdan ** If an error occurs, an error code is returned and *pp is set to NULL.
4719da9fe0c3Sdan **
4720da9fe0c3Sdan ** Otherwise, if the bExtend parameter is 0 and the requested shared-memory
4721da9fe0c3Sdan ** region has not been allocated (by any client, including one running in a
4722da9fe0c3Sdan ** separate process), then *pp is set to NULL and SQLITE_OK returned. If
4723da9fe0c3Sdan ** bExtend is non-zero and the requested shared-memory region has not yet
4724da9fe0c3Sdan ** been allocated, it is allocated by this function.
4725da9fe0c3Sdan **
4726da9fe0c3Sdan ** If the shared-memory region has already been allocated or is allocated by
4727da9fe0c3Sdan ** this call as described above, then it is mapped into this processes
4728da9fe0c3Sdan ** address space (if it is not already), *pp is set to point to the mapped
4729da9fe0c3Sdan ** memory and SQLITE_OK returned.
4730d9e5c4f6Sdrh */
unixShmMap(sqlite3_file * fd,int iRegion,int szRegion,int bExtend,void volatile ** pp)4731da9fe0c3Sdan static int unixShmMap(
4732da9fe0c3Sdan   sqlite3_file *fd,               /* Handle open on database file */
4733da9fe0c3Sdan   int iRegion,                    /* Region to retrieve */
4734da9fe0c3Sdan   int szRegion,                   /* Size of regions */
4735da9fe0c3Sdan   int bExtend,                    /* True to extend file if necessary */
4736da9fe0c3Sdan   void volatile **pp              /* OUT: Mapped memory */
4737d9e5c4f6Sdrh ){
4738da9fe0c3Sdan   unixFile *pDbFd = (unixFile*)fd;
4739da9fe0c3Sdan   unixShm *p;
4740da9fe0c3Sdan   unixShmNode *pShmNode;
4741da9fe0c3Sdan   int rc = SQLITE_OK;
4742781e34cdSdan   int nShmPerMap = unixShmRegionPerMap();
4743781e34cdSdan   int nReqRegion;
4744d9e5c4f6Sdrh 
4745da9fe0c3Sdan   /* If the shared-memory file has not yet been opened, open it now. */
4746da9fe0c3Sdan   if( pDbFd->pShm==0 ){
4747da9fe0c3Sdan     rc = unixOpenSharedMemory(pDbFd);
4748da9fe0c3Sdan     if( rc!=SQLITE_OK ) return rc;
4749d9e5c4f6Sdrh   }
4750d9e5c4f6Sdrh 
4751da9fe0c3Sdan   p = pDbFd->pShm;
4752da9fe0c3Sdan   pShmNode = p->pShmNode;
475324efa544Sdrh   sqlite3_mutex_enter(pShmNode->pShmMutex);
475492c02da3Sdan   if( pShmNode->isUnlocked ){
475592c02da3Sdan     rc = unixLockSharedMemory(pDbFd, pShmNode);
475692c02da3Sdan     if( rc!=SQLITE_OK ) goto shmpage_out;
475792c02da3Sdan     pShmNode->isUnlocked = 0;
475892c02da3Sdan   }
4759da9fe0c3Sdan   assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
47603cb9339aSdrh   assert( pShmNode->pInode==pDbFd->pInode );
47618820c8d3Sdrh   assert( pShmNode->hShm>=0 || pDbFd->pInode->bProcessLock==1 );
47628820c8d3Sdrh   assert( pShmNode->hShm<0 || pDbFd->pInode->bProcessLock==0 );
4763da9fe0c3Sdan 
4764781e34cdSdan   /* Minimum number of regions required to be mapped. */
4765781e34cdSdan   nReqRegion = ((iRegion+nShmPerMap) / nShmPerMap) * nShmPerMap;
4766781e34cdSdan 
4767781e34cdSdan   if( pShmNode->nRegion<nReqRegion ){
4768da9fe0c3Sdan     char **apNew;                      /* New apRegion[] array */
4769781e34cdSdan     int nByte = nReqRegion*szRegion;   /* Minimum required file size */
4770da9fe0c3Sdan     struct stat sStat;                 /* Used by fstat() */
4771da9fe0c3Sdan 
4772da9fe0c3Sdan     pShmNode->szRegion = szRegion;
4773da9fe0c3Sdan 
47748820c8d3Sdrh     if( pShmNode->hShm>=0 ){
4775da9fe0c3Sdan       /* The requested region is not mapped into this processes address space.
4776da9fe0c3Sdan       ** Check to see if it has been allocated (i.e. if the wal-index file is
4777da9fe0c3Sdan       ** large enough to contain the requested region).
4778da9fe0c3Sdan       */
47798820c8d3Sdrh       if( osFstat(pShmNode->hShm, &sStat) ){
4780da9fe0c3Sdan         rc = SQLITE_IOERR_SHMSIZE;
4781da9fe0c3Sdan         goto shmpage_out;
4782da9fe0c3Sdan       }
4783da9fe0c3Sdan 
4784da9fe0c3Sdan       if( sStat.st_size<nByte ){
4785da9fe0c3Sdan         /* The requested memory region does not exist. If bExtend is set to
4786da9fe0c3Sdan         ** false, exit early. *pp will be set to NULL and SQLITE_OK returned.
4787da9fe0c3Sdan         */
478847a2b4a0Sdan         if( !bExtend ){
47890fbb50eeSdrh           goto shmpage_out;
47900fbb50eeSdrh         }
479147a2b4a0Sdan 
479247a2b4a0Sdan         /* Alternatively, if bExtend is true, extend the file. Do this by
479347a2b4a0Sdan         ** writing a single byte to the end of each (OS) page being
479447a2b4a0Sdan         ** allocated or extended. Technically, we need only write to the
479547a2b4a0Sdan         ** last page in order to extend the file. But writing to all new
479647a2b4a0Sdan         ** pages forces the OS to allocate them immediately, which reduces
479747a2b4a0Sdan         ** the chances of SIGBUS while accessing the mapped region later on.
479847a2b4a0Sdan         */
479947a2b4a0Sdan         else{
480047a2b4a0Sdan           static const int pgsz = 4096;
480147a2b4a0Sdan           int iPg;
480247a2b4a0Sdan 
480347a2b4a0Sdan           /* Write to the last byte of each newly allocated or extended page */
480447a2b4a0Sdan           assert( (nByte % pgsz)==0 );
480547a2b4a0Sdan           for(iPg=(sStat.st_size/pgsz); iPg<(nByte/pgsz); iPg++){
4806e1818ec7Sdrh             int x = 0;
48078820c8d3Sdrh             if( seekAndWriteFd(pShmNode->hShm, iPg*pgsz + pgsz-1,"",1,&x)!=1 ){
480847a2b4a0Sdan               const char *zFile = pShmNode->zFilename;
480947a2b4a0Sdan               rc = unixLogError(SQLITE_IOERR_SHMSIZE, "write", zFile);
4810da9fe0c3Sdan               goto shmpage_out;
4811da9fe0c3Sdan             }
481247a2b4a0Sdan           }
481347a2b4a0Sdan         }
4814da9fe0c3Sdan       }
48153cb9339aSdrh     }
4816da9fe0c3Sdan 
4817da9fe0c3Sdan     /* Map the requested memory region into this processes address space. */
4818da9fe0c3Sdan     apNew = (char **)sqlite3_realloc(
4819781e34cdSdan         pShmNode->apRegion, nReqRegion*sizeof(char *)
4820da9fe0c3Sdan     );
4821da9fe0c3Sdan     if( !apNew ){
4822fad3039cSmistachkin       rc = SQLITE_IOERR_NOMEM_BKPT;
4823da9fe0c3Sdan       goto shmpage_out;
4824da9fe0c3Sdan     }
4825da9fe0c3Sdan     pShmNode->apRegion = apNew;
4826781e34cdSdan     while( pShmNode->nRegion<nReqRegion ){
4827781e34cdSdan       int nMap = szRegion*nShmPerMap;
4828781e34cdSdan       int i;
48293cb9339aSdrh       void *pMem;
48308820c8d3Sdrh       if( pShmNode->hShm>=0 ){
4831781e34cdSdan         pMem = osMmap(0, nMap,
483266dfec8bSdrh             pShmNode->isReadonly ? PROT_READ : PROT_READ|PROT_WRITE,
48338820c8d3Sdrh             MAP_SHARED, pShmNode->hShm, szRegion*(i64)pShmNode->nRegion
4834da9fe0c3Sdan         );
4835da9fe0c3Sdan         if( pMem==MAP_FAILED ){
483650990dbbSdrh           rc = unixLogError(SQLITE_IOERR_SHMMAP, "mmap", pShmNode->zFilename);
4837da9fe0c3Sdan           goto shmpage_out;
4838da9fe0c3Sdan         }
48393cb9339aSdrh       }else{
4840b6c4d59eSdrh         pMem = sqlite3_malloc64(nMap);
48413cb9339aSdrh         if( pMem==0 ){
4842fad3039cSmistachkin           rc = SQLITE_NOMEM_BKPT;
48433cb9339aSdrh           goto shmpage_out;
48443cb9339aSdrh         }
4845b6c4d59eSdrh         memset(pMem, 0, nMap);
48463cb9339aSdrh       }
4847781e34cdSdan 
4848781e34cdSdan       for(i=0; i<nShmPerMap; i++){
4849781e34cdSdan         pShmNode->apRegion[pShmNode->nRegion+i] = &((char*)pMem)[szRegion*i];
4850781e34cdSdan       }
4851781e34cdSdan       pShmNode->nRegion += nShmPerMap;
4852da9fe0c3Sdan     }
4853da9fe0c3Sdan   }
4854da9fe0c3Sdan 
4855da9fe0c3Sdan shmpage_out:
4856da9fe0c3Sdan   if( pShmNode->nRegion>iRegion ){
4857da9fe0c3Sdan     *pp = pShmNode->apRegion[iRegion];
4858da9fe0c3Sdan   }else{
4859da9fe0c3Sdan     *pp = 0;
4860da9fe0c3Sdan   }
486166dfec8bSdrh   if( pShmNode->isReadonly && rc==SQLITE_OK ) rc = SQLITE_READONLY;
486224efa544Sdrh   sqlite3_mutex_leave(pShmNode->pShmMutex);
4863da9fe0c3Sdan   return rc;
4864d9e5c4f6Sdrh }
4865d9e5c4f6Sdrh 
/*
** Recompute, from the sharedMask and exclMask of every connection linked
** to pShmNode, what the pShmNode->aLock[] array ought to contain, and
** compare that against the actual array.  In the expected array, an entry
** is -1 for a slot that is exclusively locked by some connection, or else
** the number of connections that hold a shared lock on the slot.
**
** Return true if the two arrays match, or false otherwise.  This is to be
** used in an assert().  e.g.
**
**     assert( assertLockingArrayOk(pShmNode) );
**
** The caller must hold pShmNode->pShmMutex.
*/
#ifdef SQLITE_DEBUG
static int assertLockingArrayOk(unixShmNode *pShmNode){
  int aExpect[SQLITE_SHM_NLOCK];   /* Lock state derived from the masks */
  unixShm *pConn;                  /* For iterating through connections */
  int bMatch;                      /* True if aExpect[] matches aLock[] */

  assert( sqlite3_mutex_held(pShmNode->pShmMutex) );
  memset(aExpect, 0, sizeof(aExpect));

  for(pConn=pShmNode->pFirst; pConn; pConn=pConn->pNext){
    int iSlot;
    for(iSlot=0; iSlot<SQLITE_SHM_NLOCK; iSlot++){
      const int bit = 1<<iSlot;
      if( pConn->exclMask & bit ){
        /* An exclusive lock excludes every other lock on the slot */
        assert( aExpect[iSlot]==0 );
        aExpect[iSlot] = -1;
      }else if( pConn->sharedMask & bit ){
        /* A shared lock may not coexist with an exclusive lock */
        assert( aExpect[iSlot]>=0 );
        aExpect[iSlot]++;
      }
    }
  }

  bMatch = (memcmp(pShmNode->aLock, aExpect, sizeof(aExpect))==0);
  assert( bMatch );
  return bMatch;
}
#endif
48978337da66Sdan 
48988337da66Sdan /*
4899d9e5c4f6Sdrh ** Change the lock state for a shared-memory segment.
490015d68092Sdrh **
490115d68092Sdrh ** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
490215d68092Sdrh ** different here than in posix.  In xShmLock(), one can go from unlocked
490315d68092Sdrh ** to shared and back or from unlocked to exclusive and back.  But one may
490415d68092Sdrh ** not go from shared to exclusive or from exclusive to shared.
4905d9e5c4f6Sdrh */
unixShmLock(sqlite3_file * fd,int ofst,int n,int flags)4906d9e5c4f6Sdrh static int unixShmLock(
4907d9e5c4f6Sdrh   sqlite3_file *fd,          /* Database file holding the shared memory */
490873b64e4dSdrh   int ofst,                  /* First lock to acquire or release */
490973b64e4dSdrh   int n,                     /* Number of locks to acquire or release */
491073b64e4dSdrh   int flags                  /* What to do with the lock */
4911d9e5c4f6Sdrh ){
491273b64e4dSdrh   unixFile *pDbFd = (unixFile*)fd;      /* Connection holding shared memory */
491356d88aadSdrh   unixShm *p;                           /* The shared memory being locked */
491456d88aadSdrh   unixShmNode *pShmNode;                /* The underlying file iNode */
491573b64e4dSdrh   int rc = SQLITE_OK;                   /* Result code */
491673b64e4dSdrh   u16 mask;                             /* Mask of locks to take or release */
491756d88aadSdrh   int *aLock;
491856d88aadSdrh 
491956d88aadSdrh   p = pDbFd->pShm;
492056d88aadSdrh   if( p==0 ) return SQLITE_IOERR_SHMLOCK;
492156d88aadSdrh   pShmNode = p->pShmNode;
492256d88aadSdrh   if( NEVER(pShmNode==0) ) return SQLITE_IOERR_SHMLOCK;
492356d88aadSdrh   aLock = pShmNode->aLock;
4924d9e5c4f6Sdrh 
4925d91c68f6Sdrh   assert( pShmNode==pDbFd->pInode->pShmNode );
4926d91c68f6Sdrh   assert( pShmNode->pInode==pDbFd->pInode );
4927c99597caSdrh   assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK );
492873b64e4dSdrh   assert( n>=1 );
492973b64e4dSdrh   assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
493073b64e4dSdrh        || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
493173b64e4dSdrh        || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
493273b64e4dSdrh        || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
493373b64e4dSdrh   assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
49348820c8d3Sdrh   assert( pShmNode->hShm>=0 || pDbFd->pInode->bProcessLock==1 );
49358820c8d3Sdrh   assert( pShmNode->hShm<0 || pDbFd->pInode->bProcessLock==0 );
4936d91c68f6Sdrh 
  /* Check that, if this is to be a blocking lock, no locks that occur later
  ** in the following list than the lock being obtained are already held:
  **
  **   1. Checkpointer lock (ofst==1).
  **   2. Write lock (ofst==0).
  **   3. Read locks (ofst>=3 && ofst<SQLITE_SHM_NLOCK).
  **
  ** In other words, if this is a blocking lock, none of the locks that
  ** occur later in the above list than the lock being obtained may be
  ** held.
  **
  ** It is not permitted to block on the RECOVER lock.
  */
495097ccc1bdSdan #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
495158021b23Sdan   assert( (flags & SQLITE_SHM_UNLOCK) || pDbFd->iBusyTimeout==0 || (
495258021b23Sdan          (ofst!=2)                                   /* not RECOVER */
495358021b23Sdan       && (ofst!=1 || (p->exclMask|p->sharedMask)==0)
495458021b23Sdan       && (ofst!=0 || (p->exclMask|p->sharedMask)<3)
495558021b23Sdan       && (ofst<3  || (p->exclMask|p->sharedMask)<(1<<ofst))
495658021b23Sdan   ));
495797ccc1bdSdan #endif
495897ccc1bdSdan 
4959c99597caSdrh   mask = (1<<(ofst+n)) - (1<<ofst);
496073b64e4dSdrh   assert( n>1 || mask==(1<<ofst) );
496124efa544Sdrh   sqlite3_mutex_enter(pShmNode->pShmMutex);
49628337da66Sdan   assert( assertLockingArrayOk(pShmNode) );
496373b64e4dSdrh   if( flags & SQLITE_SHM_UNLOCK ){
49646acdee67Sdan     if( (p->exclMask|p->sharedMask) & mask ){
49658337da66Sdan       int ii;
49668337da66Sdan       int bUnlock = 1;
496773b64e4dSdrh 
49688337da66Sdan       for(ii=ofst; ii<ofst+n; ii++){
49698337da66Sdan         if( aLock[ii]>((p->sharedMask & (1<<ii)) ? 1 : 0) ){
49708337da66Sdan           bUnlock = 0;
49718337da66Sdan         }
497273b64e4dSdrh       }
497373b64e4dSdrh 
49748337da66Sdan       if( bUnlock ){
4975bbf76eecSdrh         rc = unixShmSystemLock(pDbFd, F_UNLCK, ofst+UNIX_SHM_BASE, n);
49768337da66Sdan         if( rc==SQLITE_OK ){
49778337da66Sdan           memset(&aLock[ofst], 0, sizeof(int)*n);
49788337da66Sdan         }
497978043e89Sdrh       }else if( ALWAYS(p->sharedMask & (1<<ofst)) ){
49808337da66Sdan         assert( n==1 && aLock[ofst]>1 );
49818337da66Sdan         aLock[ofst]--;
4982d9e5c4f6Sdrh       }
498373b64e4dSdrh 
498473b64e4dSdrh       /* Undo the local locks */
498573b64e4dSdrh       if( rc==SQLITE_OK ){
498673b64e4dSdrh         p->exclMask &= ~mask;
498773b64e4dSdrh         p->sharedMask &= ~mask;
498873b64e4dSdrh       }
49896acdee67Sdan     }
499073b64e4dSdrh   }else if( flags & SQLITE_SHM_SHARED ){
49918337da66Sdan     assert( n==1 );
49928337da66Sdan     assert( (p->exclMask & (1<<ofst))==0 );
49938337da66Sdan     if( (p->sharedMask & mask)==0 ){
49948337da66Sdan       if( aLock[ofst]<0 ){
4995d9e5c4f6Sdrh         rc = SQLITE_BUSY;
49968337da66Sdan       }else if( aLock[ofst]==0 ){
4997bbf76eecSdrh         rc = unixShmSystemLock(pDbFd, F_RDLCK, ofst+UNIX_SHM_BASE, n);
499873b64e4dSdrh       }
499973b64e4dSdrh 
500073b64e4dSdrh       /* Get the local shared locks */
500173b64e4dSdrh       if( rc==SQLITE_OK ){
500273b64e4dSdrh         p->sharedMask |= mask;
50038337da66Sdan         aLock[ofst]++;
50048337da66Sdan       }
5005d9e5c4f6Sdrh     }
5006d9e5c4f6Sdrh   }else{
500773b64e4dSdrh     /* Make sure no sibling connections hold locks that will block this
50088337da66Sdan     ** lock.  If any do, return SQLITE_BUSY right away.  */
50098337da66Sdan     int ii;
50108337da66Sdan     for(ii=ofst; ii<ofst+n; ii++){
50118337da66Sdan       assert( (p->sharedMask & mask)==0 );
501278043e89Sdrh       if( ALWAYS((p->exclMask & (1<<ii))==0) && aLock[ii] ){
501373b64e4dSdrh         rc = SQLITE_BUSY;
5014d9e5c4f6Sdrh         break;
5015d9e5c4f6Sdrh       }
501673b64e4dSdrh     }
501773b64e4dSdrh 
501873b64e4dSdrh     /* Get the exclusive locks at the system level. Then if successful
50198337da66Sdan     ** also update the in-memory values. */
5020d9e5c4f6Sdrh     if( rc==SQLITE_OK ){
5021bbf76eecSdrh       rc = unixShmSystemLock(pDbFd, F_WRLCK, ofst+UNIX_SHM_BASE, n);
5022d9e5c4f6Sdrh       if( rc==SQLITE_OK ){
502315d68092Sdrh         assert( (p->sharedMask & mask)==0 );
502473b64e4dSdrh         p->exclMask |= mask;
50258337da66Sdan         for(ii=ofst; ii<ofst+n; ii++){
50268337da66Sdan           aLock[ii] = -1;
5027d9e5c4f6Sdrh         }
5028d9e5c4f6Sdrh       }
5029d9e5c4f6Sdrh     }
50308337da66Sdan   }
50318337da66Sdan   assert( assertLockingArrayOk(pShmNode) );
503224efa544Sdrh   sqlite3_mutex_leave(pShmNode->pShmMutex);
503320e1f08eSdrh   OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %03x,%03x\n",
50345ac93652Sdrh            p->id, osGetpid(0), p->sharedMask, p->exclMask));
5035d9e5c4f6Sdrh   return rc;
5036d9e5c4f6Sdrh }
5037d9e5c4f6Sdrh 
5038286a2884Sdrh /*
5039286a2884Sdrh ** Implement a memory barrier or memory fence on shared memory.
5040286a2884Sdrh **
5041286a2884Sdrh ** All loads and stores begun before the barrier must complete before
5042286a2884Sdrh ** any load or store begun after the barrier.
5043286a2884Sdrh */
unixShmBarrier(sqlite3_file * fd)5044286a2884Sdrh static void unixShmBarrier(
5045286a2884Sdrh   sqlite3_file *fd                /* Database file holding the shared memory */
5046286a2884Sdrh ){
5047ff82894fSdrh   UNUSED_PARAMETER(fd);
504822c733daSdrh   sqlite3MemoryBarrier();         /* compiler-defined memory barrier */
5049a86acc21Sdan   assert( fd->pMethods->xLock==nolockLock
5050a86acc21Sdan        || unixFileMutexNotheld((unixFile*)fd)
5051a86acc21Sdan   );
505222c733daSdrh   unixEnterMutex();               /* Also mutex, for redundancy */
5053b29ad850Sdrh   unixLeaveMutex();
5054286a2884Sdrh }
5055286a2884Sdrh 
505618801915Sdan /*
5057da9fe0c3Sdan ** Close a connection to shared-memory.  Delete the underlying
5058da9fe0c3Sdan ** storage if deleteFlag is true.
5059e11fedc5Sdrh **
5060e11fedc5Sdrh ** If there is no shared memory associated with the connection then this
5061e11fedc5Sdrh ** routine is a harmless no-op.
506218801915Sdan */
unixShmUnmap(sqlite3_file * fd,int deleteFlag)5063da9fe0c3Sdan static int unixShmUnmap(
5064da9fe0c3Sdan   sqlite3_file *fd,               /* The underlying database file */
5065da9fe0c3Sdan   int deleteFlag                  /* Delete shared-memory if true */
506613a3cb82Sdan ){
5067da9fe0c3Sdan   unixShm *p;                     /* The connection to be closed */
5068da9fe0c3Sdan   unixShmNode *pShmNode;          /* The underlying shared-memory file */
5069da9fe0c3Sdan   unixShm **pp;                   /* For looping over sibling connections */
5070da9fe0c3Sdan   unixFile *pDbFd;                /* The underlying database file */
507113a3cb82Sdan 
5072da9fe0c3Sdan   pDbFd = (unixFile*)fd;
5073da9fe0c3Sdan   p = pDbFd->pShm;
5074da9fe0c3Sdan   if( p==0 ) return SQLITE_OK;
5075da9fe0c3Sdan   pShmNode = p->pShmNode;
5076da9fe0c3Sdan 
5077da9fe0c3Sdan   assert( pShmNode==pDbFd->pInode->pShmNode );
5078da9fe0c3Sdan   assert( pShmNode->pInode==pDbFd->pInode );
5079da9fe0c3Sdan 
5080da9fe0c3Sdan   /* Remove connection p from the set of connections associated
5081da9fe0c3Sdan   ** with pShmNode */
508224efa544Sdrh   sqlite3_mutex_enter(pShmNode->pShmMutex);
5083da9fe0c3Sdan   for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){}
5084da9fe0c3Sdan   *pp = p->pNext;
508513a3cb82Sdan 
5086da9fe0c3Sdan   /* Free the connection p */
5087da9fe0c3Sdan   sqlite3_free(p);
5088da9fe0c3Sdan   pDbFd->pShm = 0;
508924efa544Sdrh   sqlite3_mutex_leave(pShmNode->pShmMutex);
5090da9fe0c3Sdan 
5091da9fe0c3Sdan   /* If pShmNode->nRef has reached 0, then close the underlying
5092da9fe0c3Sdan   ** shared-memory file, too */
5093095908e1Sdrh   assert( unixFileMutexNotheld(pDbFd) );
5094da9fe0c3Sdan   unixEnterMutex();
5095da9fe0c3Sdan   assert( pShmNode->nRef>0 );
5096da9fe0c3Sdan   pShmNode->nRef--;
5097da9fe0c3Sdan   if( pShmNode->nRef==0 ){
50988820c8d3Sdrh     if( deleteFlag && pShmNode->hShm>=0 ){
50994bf66fd6Sdrh       osUnlink(pShmNode->zFilename);
51004bf66fd6Sdrh     }
5101da9fe0c3Sdan     unixShmPurge(pDbFd);
510213a3cb82Sdan   }
5103da9fe0c3Sdan   unixLeaveMutex();
5104da9fe0c3Sdan 
5105da9fe0c3Sdan   return SQLITE_OK;
5106da9fe0c3Sdan }
5107da9fe0c3Sdan 
5108286a2884Sdrh 
5109d9e5c4f6Sdrh #else
51106b017cc6Sdrh # define unixShmMap     0
5111da9fe0c3Sdan # define unixShmLock    0
5112286a2884Sdrh # define unixShmBarrier 0
5113da9fe0c3Sdan # define unixShmUnmap   0
5114d9e5c4f6Sdrh #endif /* #ifndef SQLITE_OMIT_WAL */
5115d9e5c4f6Sdrh 
5116e98844f7Smistachkin #if SQLITE_MAX_MMAP_SIZE>0
5117734c9864Sdrh /*
5118aef49d71Sdan ** If it is currently memory mapped, unmap file pFd.
5119d306e1a3Sdan */
unixUnmapfile(unixFile * pFd)5120f23da966Sdan static void unixUnmapfile(unixFile *pFd){
5121f23da966Sdan   assert( pFd->nFetchOut==0 );
5122f23da966Sdan   if( pFd->pMapRegion ){
51239b4c59faSdrh     osMunmap(pFd->pMapRegion, pFd->mmapSizeActual);
5124f23da966Sdan     pFd->pMapRegion = 0;
5125f23da966Sdan     pFd->mmapSize = 0;
51269b4c59faSdrh     pFd->mmapSizeActual = 0;
5127f23da966Sdan   }
5128f23da966Sdan }
51295d8a1372Sdan 
5130aef49d71Sdan /*
5131e6ecd663Sdan ** Attempt to set the size of the memory mapping maintained by file
5132e6ecd663Sdan ** descriptor pFd to nNew bytes. Any existing mapping is discarded.
5133e6ecd663Sdan **
5134e6ecd663Sdan ** If successful, this function sets the following variables:
5135e6ecd663Sdan **
5136e6ecd663Sdan **       unixFile.pMapRegion
5137e6ecd663Sdan **       unixFile.mmapSize
51389b4c59faSdrh **       unixFile.mmapSizeActual
5139e6ecd663Sdan **
5140e6ecd663Sdan ** If unsuccessful, an error message is logged via sqlite3_log() and
5141e6ecd663Sdan ** the three variables above are zeroed. In this case SQLite should
5142e6ecd663Sdan ** continue accessing the database using the xRead() and xWrite()
5143e6ecd663Sdan ** methods.
5144e6ecd663Sdan */
unixRemapfile(unixFile * pFd,i64 nNew)5145e6ecd663Sdan static void unixRemapfile(
5146e6ecd663Sdan   unixFile *pFd,                  /* File descriptor object */
5147e6ecd663Sdan   i64 nNew                        /* Required mapping size */
5148e6ecd663Sdan ){
51494ff7bc45Sdan   const char *zErr = "mmap";
5150e6ecd663Sdan   int h = pFd->h;                      /* File descriptor open on db file */
5151e6ecd663Sdan   u8 *pOrig = (u8 *)pFd->pMapRegion;   /* Pointer to current file mapping */
51529b4c59faSdrh   i64 nOrig = pFd->mmapSizeActual;     /* Size of pOrig region in bytes */
5153e6ecd663Sdan   u8 *pNew = 0;                        /* Location of new mapping */
5154e6ecd663Sdan   int flags = PROT_READ;               /* Flags to pass to mmap() */
5155e6ecd663Sdan 
5156e6ecd663Sdan   assert( pFd->nFetchOut==0 );
5157e6ecd663Sdan   assert( nNew>pFd->mmapSize );
51589b4c59faSdrh   assert( nNew<=pFd->mmapSizeMax );
5159e6ecd663Sdan   assert( nNew>0 );
51609b4c59faSdrh   assert( pFd->mmapSizeActual>=pFd->mmapSize );
51614ff7bc45Sdan   assert( MAP_FAILED!=0 );
5162e6ecd663Sdan 
5163fe33e39bSdan #ifdef SQLITE_MMAP_READWRITE
5164e6ecd663Sdan   if( (pFd->ctrlFlags & UNIXFILE_RDONLY)==0 ) flags |= PROT_WRITE;
5165fe33e39bSdan #endif
5166e6ecd663Sdan 
5167e6ecd663Sdan   if( pOrig ){
5168781e34cdSdan #if HAVE_MREMAP
5169781e34cdSdan     i64 nReuse = pFd->mmapSize;
5170781e34cdSdan #else
5171bc76063cSdan     const int szSyspage = osGetpagesize();
5172e6ecd663Sdan     i64 nReuse = (pFd->mmapSize & ~(szSyspage-1));
5173781e34cdSdan #endif
5174e6ecd663Sdan     u8 *pReq = &pOrig[nReuse];
5175e6ecd663Sdan 
5176e6ecd663Sdan     /* Unmap any pages of the existing mapping that cannot be reused. */
5177e6ecd663Sdan     if( nReuse!=nOrig ){
5178e6ecd663Sdan       osMunmap(pReq, nOrig-nReuse);
5179e6ecd663Sdan     }
5180e6ecd663Sdan 
5181e6ecd663Sdan #if HAVE_MREMAP
5182e6ecd663Sdan     pNew = osMremap(pOrig, nReuse, nNew, MREMAP_MAYMOVE);
51834ff7bc45Sdan     zErr = "mremap";
5184e6ecd663Sdan #else
5185e6ecd663Sdan     pNew = osMmap(pReq, nNew-nReuse, flags, MAP_SHARED, h, nReuse);
5186e6ecd663Sdan     if( pNew!=MAP_FAILED ){
5187e6ecd663Sdan       if( pNew!=pReq ){
5188e6ecd663Sdan         osMunmap(pNew, nNew - nReuse);
51894ff7bc45Sdan         pNew = 0;
5190e6ecd663Sdan       }else{
5191e6ecd663Sdan         pNew = pOrig;
5192e6ecd663Sdan       }
5193e6ecd663Sdan     }
5194e6ecd663Sdan #endif
5195e6ecd663Sdan 
519648ccef80Sdan     /* The attempt to extend the existing mapping failed. Free it. */
519748ccef80Sdan     if( pNew==MAP_FAILED || pNew==0 ){
5198e6ecd663Sdan       osMunmap(pOrig, nReuse);
5199e6ecd663Sdan     }
5200e6ecd663Sdan   }
5201e6ecd663Sdan 
5202e6ecd663Sdan   /* If pNew is still NULL, try to create an entirely new mapping. */
5203e6ecd663Sdan   if( pNew==0 ){
5204e6ecd663Sdan     pNew = osMmap(0, nNew, flags, MAP_SHARED, h, 0);
52054ff7bc45Sdan   }
52064ff7bc45Sdan 
5207e6ecd663Sdan   if( pNew==MAP_FAILED ){
5208e6ecd663Sdan     pNew = 0;
5209e6ecd663Sdan     nNew = 0;
52104ff7bc45Sdan     unixLogError(SQLITE_OK, zErr, pFd->zPath);
5211e6ecd663Sdan 
5212e6ecd663Sdan     /* If the mmap() above failed, assume that all subsequent mmap() calls
5213e6ecd663Sdan     ** will probably fail too. Fall back to using xRead/xWrite exclusively
5214e6ecd663Sdan     ** in this case.  */
52159b4c59faSdrh     pFd->mmapSizeMax = 0;
5216e6ecd663Sdan   }
5217e6ecd663Sdan   pFd->pMapRegion = (void *)pNew;
52189b4c59faSdrh   pFd->mmapSize = pFd->mmapSizeActual = nNew;
5219e6ecd663Sdan }
5220e6ecd663Sdan 
5221e6ecd663Sdan /*
5222aef49d71Sdan ** Memory map or remap the file opened by file-descriptor pFd (if the file
5223aef49d71Sdan ** is already mapped, the existing mapping is replaced by the new). Or, if
5224aef49d71Sdan ** there already exists a mapping for this file, and there are still
5225aef49d71Sdan ** outstanding xFetch() references to it, this function is a no-op.
5226aef49d71Sdan **
5227aef49d71Sdan ** If parameter nByte is non-negative, then it is the requested size of
5228aef49d71Sdan ** the mapping to create. Otherwise, if nByte is less than zero, then the
5229aef49d71Sdan ** requested size is the size of the file on disk. The actual size of the
5230aef49d71Sdan ** created mapping is either the requested size or the value configured
52310d0614bdSdrh ** using SQLITE_FCNTL_MMAP_LIMIT, whichever is smaller.
5232aef49d71Sdan **
5233aef49d71Sdan ** SQLITE_OK is returned if no error occurs (even if the mapping is not
5234aef49d71Sdan ** recreated as a result of outstanding references) or an SQLite error
5235aef49d71Sdan ** code otherwise.
5236aef49d71Sdan */
unixMapfile(unixFile * pFd,i64 nMap)5237f3b1ed0fSdrh static int unixMapfile(unixFile *pFd, i64 nMap){
5238f23da966Sdan   assert( nMap>=0 || pFd->nFetchOut==0 );
5239333e6ca9Sdrh   assert( nMap>0 || (pFd->mmapSize==0 && pFd->pMapRegion==0) );
5240f23da966Sdan   if( pFd->nFetchOut>0 ) return SQLITE_OK;
5241f23da966Sdan 
5242f23da966Sdan   if( nMap<0 ){
52433044b51dSdrh     struct stat statbuf;          /* Low-level file information */
5244f3b1ed0fSdrh     if( osFstat(pFd->h, &statbuf) ){
5245f23da966Sdan       return SQLITE_IOERR_FSTAT;
5246eb97b293Sdan     }
52473044b51dSdrh     nMap = statbuf.st_size;
5248f23da966Sdan   }
52499b4c59faSdrh   if( nMap>pFd->mmapSizeMax ){
52509b4c59faSdrh     nMap = pFd->mmapSizeMax;
5251eb97b293Sdan   }
5252eb97b293Sdan 
5253333e6ca9Sdrh   assert( nMap>0 || (pFd->mmapSize==0 && pFd->pMapRegion==0) );
5254f23da966Sdan   if( nMap!=pFd->mmapSize ){
5255e6ecd663Sdan     unixRemapfile(pFd, nMap);
5256b7e3a326Sdan   }
5257b7e3a326Sdan 
5258d306e1a3Sdan   return SQLITE_OK;
5259d306e1a3Sdan }
5260e98844f7Smistachkin #endif /* SQLITE_MAX_MMAP_SIZE>0 */
5261d306e1a3Sdan 
5262aef49d71Sdan /*
5263aef49d71Sdan ** If possible, return a pointer to a mapping of file fd starting at offset
5264aef49d71Sdan ** iOff. The mapping must be valid for at least nAmt bytes.
5265aef49d71Sdan **
5266aef49d71Sdan ** If such a pointer can be obtained, store it in *pp and return SQLITE_OK.
5267aef49d71Sdan ** Or, if one cannot but no error occurs, set *pp to 0 and return SQLITE_OK.
5268aef49d71Sdan ** Finally, if an error does occur, return an SQLite error code. The final
5269aef49d71Sdan ** value of *pp is undefined in this case.
5270aef49d71Sdan **
5271aef49d71Sdan ** If this function does return a pointer, the caller must eventually
5272aef49d71Sdan ** release the reference by calling unixUnfetch().
5273aef49d71Sdan */
unixFetch(sqlite3_file * fd,i64 iOff,int nAmt,void ** pp)5274f23da966Sdan static int unixFetch(sqlite3_file *fd, i64 iOff, int nAmt, void **pp){
52759b4c59faSdrh #if SQLITE_MAX_MMAP_SIZE>0
5276f23da966Sdan   unixFile *pFd = (unixFile *)fd;   /* The underlying database file */
5277fbc7e884Sdrh #endif
5278f23da966Sdan   *pp = 0;
5279f23da966Sdan 
52809b4c59faSdrh #if SQLITE_MAX_MMAP_SIZE>0
52819b4c59faSdrh   if( pFd->mmapSizeMax>0 ){
5282f23da966Sdan     if( pFd->pMapRegion==0 ){
5283f23da966Sdan       int rc = unixMapfile(pFd, -1);
5284f23da966Sdan       if( rc!=SQLITE_OK ) return rc;
5285f23da966Sdan     }
5286f23da966Sdan     if( pFd->mmapSize >= iOff+nAmt ){
5287f23da966Sdan       *pp = &((u8 *)pFd->pMapRegion)[iOff];
5288f23da966Sdan       pFd->nFetchOut++;
5289f23da966Sdan     }
5290f23da966Sdan   }
52916e0b6d52Sdrh #endif
5292f23da966Sdan   return SQLITE_OK;
52935d8a1372Sdan }
52945d8a1372Sdan 
5295aef49d71Sdan /*
5296df737fe6Sdan ** If the third argument is non-NULL, then this function releases a
5297df737fe6Sdan ** reference obtained by an earlier call to unixFetch(). The second
5298df737fe6Sdan ** argument passed to this function must be the same as the corresponding
5299df737fe6Sdan ** argument that was passed to the unixFetch() invocation.
5300df737fe6Sdan **
5301df737fe6Sdan ** Or, if the third argument is NULL, then this function is being called
5302df737fe6Sdan ** to inform the VFS layer that, according to POSIX, any existing mapping
5303df737fe6Sdan ** may now be invalid and should be unmapped.
5304aef49d71Sdan */
unixUnfetch(sqlite3_file * fd,i64 iOff,void * p)5305df737fe6Sdan static int unixUnfetch(sqlite3_file *fd, i64 iOff, void *p){
5306b5ca3cbcSmistachkin #if SQLITE_MAX_MMAP_SIZE>0
53071bcbc621Sdrh   unixFile *pFd = (unixFile *)fd;   /* The underlying database file */
53089871c59aSdan   UNUSED_PARAMETER(iOff);
53091bcbc621Sdrh 
5310aef49d71Sdan   /* If p==0 (unmap the entire file) then there must be no outstanding
5311aef49d71Sdan   ** xFetch references. Or, if p!=0 (meaning it is an xFetch reference),
5312aef49d71Sdan   ** then there must be at least one outstanding.  */
5313f23da966Sdan   assert( (p==0)==(pFd->nFetchOut==0) );
5314f23da966Sdan 
5315df737fe6Sdan   /* If p!=0, it must match the iOff value. */
5316df737fe6Sdan   assert( p==0 || p==&((u8 *)pFd->pMapRegion)[iOff] );
5317df737fe6Sdan 
5318f23da966Sdan   if( p ){
5319f23da966Sdan     pFd->nFetchOut--;
53206101d504Sdan   }else{
5321f23da966Sdan     unixUnmapfile(pFd);
53225d8a1372Sdan   }
53235d8a1372Sdan 
5324f23da966Sdan   assert( pFd->nFetchOut>=0 );
53251bcbc621Sdrh #else
53261bcbc621Sdrh   UNUSED_PARAMETER(fd);
53271bcbc621Sdrh   UNUSED_PARAMETER(p);
53289871c59aSdan   UNUSED_PARAMETER(iOff);
5329b5ca3cbcSmistachkin #endif
5330f23da966Sdan   return SQLITE_OK;
53315d8a1372Sdan }
53325d8a1372Sdan 
53335d8a1372Sdan /*
5334734c9864Sdrh ** Here ends the implementation of all sqlite3_file methods.
5335734c9864Sdrh **
5336734c9864Sdrh ********************** End sqlite3_file Methods *******************************
5337734c9864Sdrh ******************************************************************************/
5338734c9864Sdrh 
/*
** This division contains definitions of sqlite3_io_methods objects that
** implement various file locking strategies.  It also contains definitions
** of "finder" functions.  A finder-function is used to locate the appropriate
** sqlite3_io_methods object for a particular database file.  The pAppData
** field of each sqlite3_vfs VFS object is initialized to be a pointer to
** the correct finder-function for that VFS.
**
** Most finder functions return a pointer to a fixed sqlite3_io_methods
** object.  The only interesting finder-function is autolockIoFinder, which
** looks at the filesystem type and tries to guess the best locking
** strategy from that.
**
** For finder-function F, two objects are created:
**
**    (1) The real finder-function named "FImpl()".
**
**    (2) A constant pointer to this function named just "F".
**
**
** A pointer to the F pointer is used as the pAppData value for VFS
** objects.  We have to do this instead of letting pAppData point
** directly at the finder-function since C90 rules prevent a void*
** from being cast into a function pointer.
**
**
** Each instance of the IOMETHODS macro generates two objects:
**
**   *  A constant sqlite3_io_methods object called METHOD that has
**      version number VERSION and methods CLOSE, LOCK, UNLOCK, CKLOCK
**      and SHMMAP.
**
**   *  An I/O method finder function called FINDER##Impl, together with
**      a constant pointer to it called FINDER, that returns a pointer
**      to the METHOD object in the previous bullet.
*/
/* Expands to a constant sqlite3_io_methods object named METHOD and a finder
** for it: the function FINDER##Impl(), which ignores its arguments and
** returns &METHOD, plus a constant function pointer named FINDER that
** refers to that function. */
#define IOMETHODS(FINDER,METHOD,VERSION,CLOSE,LOCK,UNLOCK,CKLOCK,SHMMAP)     \
static const sqlite3_io_methods METHOD = {                                   \
   VERSION,                    /* iVersion */                                \
   CLOSE,                      /* xClose */                                  \
   unixRead,                   /* xRead */                                   \
   unixWrite,                  /* xWrite */                                  \
   unixTruncate,               /* xTruncate */                               \
   unixSync,                   /* xSync */                                   \
   unixFileSize,               /* xFileSize */                               \
   LOCK,                       /* xLock */                                   \
   UNLOCK,                     /* xUnlock */                                 \
   CKLOCK,                     /* xCheckReservedLock */                      \
   unixFileControl,            /* xFileControl */                            \
   unixSectorSize,             /* xSectorSize */                             \
   unixDeviceCharacteristics,  /* xDeviceCharacteristics */                  \
   SHMMAP,                     /* xShmMap */                                 \
   unixShmLock,                /* xShmLock */                                \
   unixShmBarrier,             /* xShmBarrier */                             \
   unixShmUnmap,               /* xShmUnmap */                               \
   unixFetch,                  /* xFetch */                                  \
   unixUnfetch,                /* xUnfetch */                                \
};                                                                           \
static const sqlite3_io_methods *FINDER##Impl(                               \
  const char *zPath,                                                         \
  unixFile *pFile                                                            \
){                                                                           \
  UNUSED_PARAMETER(zPath);                                                   \
  UNUSED_PARAMETER(pFile);                                                   \
  return &METHOD;                                                            \
}                                                                            \
static const sqlite3_io_methods *(*const FINDER)(const char*,unixFile*)      \
    = FINDER##Impl;
54017708e972Sdrh 
54027708e972Sdrh /*
54037708e972Sdrh ** Here are all of the sqlite3_io_methods objects for each of the
54047708e972Sdrh ** locking strategies.  Functions that return pointers to these methods
54057708e972Sdrh ** are also created.
5406da0e768bSdrh */
54077708e972Sdrh IOMETHODS(
54087708e972Sdrh   posixIoFinder,            /* Finder function name */
54097708e972Sdrh   posixIoMethods,           /* sqlite3_io_methods object name */
54105d8a1372Sdan   3,                        /* shared memory and mmap are enabled */
54117708e972Sdrh   unixClose,                /* xClose method */
54127708e972Sdrh   unixLock,                 /* xLock method */
54137708e972Sdrh   unixUnlock,               /* xUnlock method */
5414d9f9441dSdrh   unixCheckReservedLock,    /* xCheckReservedLock method */
5415d9f9441dSdrh   unixShmMap                /* xShmMap method */
54161875f7a3Sdrh )
54177708e972Sdrh IOMETHODS(
54187708e972Sdrh   nolockIoFinder,           /* Finder function name */
54197708e972Sdrh   nolockIoMethods,          /* sqlite3_io_methods object name */
54203e2c842eSdrh   3,                        /* shared memory and mmap are enabled */
54217708e972Sdrh   nolockClose,              /* xClose method */
54227708e972Sdrh   nolockLock,               /* xLock method */
54237708e972Sdrh   nolockUnlock,             /* xUnlock method */
5424d9f9441dSdrh   nolockCheckReservedLock,  /* xCheckReservedLock method */
5425d9f9441dSdrh   0                         /* xShmMap method */
54261875f7a3Sdrh )
54277708e972Sdrh IOMETHODS(
54287708e972Sdrh   dotlockIoFinder,          /* Finder function name */
54297708e972Sdrh   dotlockIoMethods,         /* sqlite3_io_methods object name */
54306e1f4828Sdrh   1,                        /* shared memory is disabled */
54317708e972Sdrh   dotlockClose,             /* xClose method */
54327708e972Sdrh   dotlockLock,              /* xLock method */
54337708e972Sdrh   dotlockUnlock,            /* xUnlock method */
5434d9f9441dSdrh   dotlockCheckReservedLock, /* xCheckReservedLock method */
5435d9f9441dSdrh   0                         /* xShmMap method */
54361875f7a3Sdrh )
54377708e972Sdrh 
#if SQLITE_ENABLE_LOCKING_STYLE
/* I/O methods and finder for the flock() locking strategy */
IOMETHODS(
  flockIoFinder,            /* FINDER:  finder function name */
  flockIoMethods,           /* METHOD:  sqlite3_io_methods object name */
  1,                        /* VERSION: shared memory is disabled */
  flockClose,               /* CLOSE:   xClose method */
  flockLock,                /* LOCK:    xLock method */
  flockUnlock,              /* UNLOCK:  xUnlock method */
  flockCheckReservedLock,   /* CKLOCK:  xCheckReservedLock method */
  0                         /* SHMMAP:  no xShmMap method */
)
#endif
54507708e972Sdrh 
#if OS_VXWORKS
/* I/O methods and finder built from the semX* locking methods.  Only
** compiled when OS_VXWORKS is set. */
IOMETHODS(
  semIoFinder,              /* FINDER:  finder function name */
  semIoMethods,             /* METHOD:  sqlite3_io_methods object name */
  1,                        /* VERSION: shared memory is disabled */
  semXClose,                /* CLOSE:   xClose method */
  semXLock,                 /* LOCK:    xLock method */
  semXUnlock,               /* UNLOCK:  xUnlock method */
  semXCheckReservedLock,    /* CKLOCK:  xCheckReservedLock method */
  0                         /* SHMMAP:  no xShmMap method */
)
#endif
54637708e972Sdrh 
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/* I/O methods and finder for the AFP locking strategy (Apple builds only) */
IOMETHODS(
  afpIoFinder,              /* FINDER:  finder function name */
  afpIoMethods,             /* METHOD:  sqlite3_io_methods object name */
  1,                        /* VERSION: shared memory is disabled */
  afpClose,                 /* CLOSE:   xClose method */
  afpLock,                  /* LOCK:    xLock method */
  afpUnlock,                /* UNLOCK:  xUnlock method */
  afpCheckReservedLock,     /* CKLOCK:  xCheckReservedLock method */
  0                         /* SHMMAP:  no xShmMap method */
)
#endif
5476715ff30eSdrh 
/*
** Proxy locking is a "super-method": it opens secondary file descriptors
** for the conch and lock files, and applies proxy, dot-file, AFP, and
** flock() locking methods to those secondary files.  Because of that, the
** division implementing proxy locking sits much further down in this
** file.  Its sqlite3_io_methods object and finder function are
** nevertheless needed here, so the I/O methods are forward declared
** first.
*/
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
static int proxyClose(sqlite3_file*);
static int proxyLock(sqlite3_file*, int);
static int proxyUnlock(sqlite3_file*, int);
static int proxyCheckReservedLock(sqlite3_file*, int*);
IOMETHODS(
  proxyIoFinder,            /* FINDER:  finder function name */
  proxyIoMethods,           /* METHOD:  sqlite3_io_methods object name */
  1,                        /* VERSION: shared memory is disabled */
  proxyClose,               /* CLOSE:   xClose method */
  proxyLock,                /* LOCK:    xLock method */
  proxyUnlock,              /* UNLOCK:  xUnlock method */
  proxyCheckReservedLock,   /* CKLOCK:  xCheckReservedLock method */
  0                         /* SHMMAP:  no xShmMap method */
)
#endif
55027708e972Sdrh 
/* nfs lockd on OSX 10.3+ doesn't clear write locks when a read lock is set.
** This method set is the same as the posix one except for xUnlock, the
** version number and the missing xShmMap. */
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
IOMETHODS(
  nfsIoFinder,               /* FINDER:  finder function name */
  nfsIoMethods,              /* METHOD:  sqlite3_io_methods object name */
  1,                         /* VERSION: shared memory is disabled */
  unixClose,                 /* CLOSE:   xClose method */
  unixLock,                  /* LOCK:    xLock method */
  nfsUnlock,                 /* UNLOCK:  xUnlock method */
  unixCheckReservedLock,     /* CKLOCK:  xCheckReservedLock method */
  0                          /* SHMMAP:  no xShmMap method */
)
#endif
55167708e972Sdrh 
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** This "finder" function attempts to determine the best locking strategy
** for the database file "filePath".  It then returns the sqlite3_io_methods
** object that implements that strategy.
**
** The decision is made as follows:
**
**   (1) filePath==NULL (a transient file)       -> no-op locking
**   (2) statfs() works and the mount is read-only -> no-op locking
**   (3) statfs() works and the filesystem type name appears in aMap[]
**                                                -> the mapped methods
**   (4) otherwise probe pNew->h with fcntl(F_GETLK):
**         probe succeeds, filesystem is "nfs"    -> nfs methods
**         probe succeeds, anything else          -> posix methods
**         probe fails                            -> dot-file methods
**
** This is for MacOSX only.
*/
static const sqlite3_io_methods *autolockIoFinderImpl(
  const char *filePath,    /* name of the database file */
  unixFile *pNew           /* open file object for the database file */
){
  static const struct Mapping {
    const char *zFilesystem;              /* Filesystem type name */
    const sqlite3_io_methods *pMethods;   /* Appropriate locking method */
  } aMap[] = {
    { "hfs",    &posixIoMethods },
    { "ufs",    &posixIoMethods },
    { "afpfs",  &afpIoMethods },
    { "smbfs",  &afpIoMethods },
    { "webdav", &nolockIoMethods },
    { 0, 0 }
  };
  int i;
  int haveFsInfo;          /* True if statfs() filled in fsInfo */
  struct statfs fsInfo;
  struct flock lockInfo;

  if( !filePath ){
    /* If filePath==NULL that means we are dealing with a transient file
    ** that does not need to be locked. */
    return &nolockIoMethods;
  }
  haveFsInfo = statfs(filePath, &fsInfo) != -1;
  if( haveFsInfo ){
    if( fsInfo.f_flags & MNT_RDONLY ){
      return &nolockIoMethods;
    }
    for(i=0; aMap[i].zFilesystem; i++){
      if( strcmp(fsInfo.f_fstypename, aMap[i].zFilesystem)==0 ){
        return aMap[i].pMethods;
      }
    }
  }

  /* Default case. Handles, amongst others, "nfs".
  ** Test byte-range lock using fcntl(). If the call succeeds,
  ** assume that the file-system supports POSIX style locks.
  */
  lockInfo.l_len = 1;
  lockInfo.l_start = 0;
  lockInfo.l_whence = SEEK_SET;
  lockInfo.l_type = F_RDLCK;
  if( osFcntl(pNew->h, F_GETLK, &lockInfo)!=-1 ) {
    /* fsInfo is only meaningful if statfs() succeeded above.  Without
    ** that check a failed statfs() would have us reading uninitialized
    ** stack memory here. */
    if( haveFsInfo && strcmp(fsInfo.f_fstypename, "nfs")==0 ){
      return &nfsIoMethods;
    } else {
      return &posixIoMethods;
    }
  }else{
    return &dotlockIoMethods;
  }
}
static const sqlite3_io_methods
  *(*const autolockIoFinder)(const char*,unixFile*) = autolockIoFinderImpl;

#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
55827708e972Sdrh 
#if OS_VXWORKS
/*
** The VxWorks "finder".  Select the locking implementation for a newly
** opened file:
**
**   *  A transient file (filePath==NULL) needs no locking at all.
**   *  Otherwise probe the open descriptor with fcntl(F_GETLK).  If the
**      probe succeeds, posix advisory locks are used.  If it fails,
**      fall back to named semaphore locking.
*/
static const sqlite3_io_methods *vxworksIoFinderImpl(
  const char *filePath,    /* name of the database file */
  unixFile *pNew           /* the open file object */
){
  struct flock probe;      /* Lock description used for the F_GETLK probe */
  int probeOk;             /* True if fcntl(F_GETLK) did not fail */

  if( filePath==0 ){
    /* Transient file: nothing to lock */
    return &nolockIoMethods;
  }

  /* Ask about a one-byte read lock at the very start of the file */
  probe.l_type = F_RDLCK;
  probe.l_whence = SEEK_SET;
  probe.l_start = 0;
  probe.l_len = 1;
  probeOk = osFcntl(pNew->h, F_GETLK, &probe)!=-1;

  return probeOk ? &posixIoMethods : &semIoMethods;
}
static const sqlite3_io_methods
  *(*const vxworksIoFinder)(const char*,unixFile*) = vxworksIoFinderImpl;

#endif /* OS_VXWORKS */
561878a1318bSchw 
56197708e972Sdrh /*
562060ec914cSpeter.d.reid ** An abstract type for a pointer to an IO method finder function:
56217708e972Sdrh */
56220c2694b7Sdrh typedef const sqlite3_io_methods *(*finder_type)(const char*,unixFile*);
56237708e972Sdrh 
5624aebf413dSaswift 
/****************************************************************************
**************************** sqlite3_vfs methods ****************************
**
** This division contains the implementation of methods on the
** sqlite3_vfs object.
*/
5631734c9864Sdrh 
5632734c9864Sdrh /*
5633aebf413dSaswift ** Initialize the contents of the unixFile structure pointed to by pId.
5634aebf413dSaswift */
fillInUnixFile(sqlite3_vfs * pVfs,int h,sqlite3_file * pId,const char * zFilename,int ctrlFlags)5635aebf413dSaswift static int fillInUnixFile(
5636aebf413dSaswift   sqlite3_vfs *pVfs,      /* Pointer to vfs object */
5637aebf413dSaswift   int h,                  /* Open file descriptor of file being opened */
5638aebf413dSaswift   sqlite3_file *pId,      /* Write to the unixFile structure here */
5639aebf413dSaswift   const char *zFilename,  /* Name of the file being opened */
5640c02a43afSdrh   int ctrlFlags           /* Zero or more UNIXFILE_* values */
5641aebf413dSaswift ){
56427708e972Sdrh   const sqlite3_io_methods *pLockingStyle;
5643aebf413dSaswift   unixFile *pNew = (unixFile *)pId;
5644aebf413dSaswift   int rc = SQLITE_OK;
5645218c5084Sdrh 
56468af6c228Sdrh   assert( pNew->pInode==NULL );
5647218c5084Sdrh 
5648b07028f7Sdrh   /* No locking occurs in temporary files */
5649c02a43afSdrh   assert( zFilename!=0 || (ctrlFlags & UNIXFILE_NOLOCK)!=0 );
5650b07028f7Sdrh 
5651308c2a5cSdrh   OSTRACE(("OPEN    %-3d %s\n", h, zFilename));
5652ad94b58aSdanielk1977   pNew->h = h;
5653de60fc2dSdrh   pNew->pVfs = pVfs;
5654d9e5c4f6Sdrh   pNew->zPath = zFilename;
5655c02a43afSdrh   pNew->ctrlFlags = (u8)ctrlFlags;
5656b5ca3cbcSmistachkin #if SQLITE_MAX_MMAP_SIZE>0
5657ede01a97Sdan   pNew->mmapSizeMax = sqlite3GlobalConfig.szMmap;
5658b5ca3cbcSmistachkin #endif
5659c02a43afSdrh   if( sqlite3_uri_boolean(((ctrlFlags & UNIXFILE_URI) ? zFilename : 0),
5660c02a43afSdrh                            "psow", SQLITE_POWERSAFE_OVERWRITE) ){
5661cb15f35fSdrh     pNew->ctrlFlags |= UNIXFILE_PSOW;
5662bec7c978Sdrh   }
5663503a686eSdrh   if( strcmp(pVfs->zName,"unix-excl")==0 ){
5664f12b3f60Sdrh     pNew->ctrlFlags |= UNIXFILE_EXCL;
5665a7e61d8bSdrh   }
5666218c5084Sdrh 
56676c7d5c5bSdrh #if OS_VXWORKS
5668107886abSdrh   pNew->pId = vxworksFindFileId(zFilename);
5669107886abSdrh   if( pNew->pId==0 ){
5670c02a43afSdrh     ctrlFlags |= UNIXFILE_NOLOCK;
5671fad3039cSmistachkin     rc = SQLITE_NOMEM_BKPT;
567297185489Schw   }
567397185489Schw #endif
567497185489Schw 
5675c02a43afSdrh   if( ctrlFlags & UNIXFILE_NOLOCK ){
56767708e972Sdrh     pLockingStyle = &nolockIoMethods;
5677da0e768bSdrh   }else{
56780c2694b7Sdrh     pLockingStyle = (**(finder_type*)pVfs->pAppData)(zFilename, pNew);
5679aebf413dSaswift #if SQLITE_ENABLE_LOCKING_STYLE
5680aebf413dSaswift     /* Cache zFilename in the locking context (AFP and dotlock override) for
5681aebf413dSaswift     ** proxyLock activation is possible (remote proxy is based on db name)
5682aebf413dSaswift     ** zFilename remains valid until file is closed, to support */
5683aebf413dSaswift     pNew->lockingContext = (void*)zFilename;
5684aebf413dSaswift #endif
5685da0e768bSdrh   }
5686e339d65aSdanielk1977 
56877ed97b9dSdrh   if( pLockingStyle == &posixIoMethods
56887ed97b9dSdrh #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
56897ed97b9dSdrh     || pLockingStyle == &nfsIoMethods
56907ed97b9dSdrh #endif
56917ed97b9dSdrh   ){
56926c7d5c5bSdrh     unixEnterMutex();
56938af6c228Sdrh     rc = findInodeInfo(pNew, &pNew->pInode);
5694e946c396Sdan     if( rc!=SQLITE_OK ){
569548864df9Smistachkin       /* If an error occurred in findInodeInfo(), close the file descriptor
56968af6c228Sdrh       ** immediately, before releasing the mutex. findInodeInfo() may fail
5697e946c396Sdan       ** in two scenarios:
5698e946c396Sdan       **
5699e946c396Sdan       **   (a) A call to fstat() failed.
5700e946c396Sdan       **   (b) A malloc failed.
5701e946c396Sdan       **
5702e946c396Sdan       ** Scenario (b) may only occur if the process is holding no other
5703e946c396Sdan       ** file descriptors open on the same file. If there were other file
5704e946c396Sdan       ** descriptors on this file, then no malloc would be required by
57058af6c228Sdrh       ** findInodeInfo(). If this is the case, it is quite safe to close
5706e946c396Sdan       ** handle h - as it is guaranteed that no posix locks will be released
5707e946c396Sdan       ** by doing so.
5708e946c396Sdan       **
5709e946c396Sdan       ** If scenario (a) caused the error then things are not so safe. The
5710e946c396Sdan       ** implicit assumption here is that if fstat() fails, things are in
5711e946c396Sdan       ** such bad shape that dropping a lock or two doesn't matter much.
5712e946c396Sdan       */
57130e9365ceSdrh       robust_close(pNew, h, __LINE__);
5714e946c396Sdan       h = -1;
5715e946c396Sdan     }
57166c7d5c5bSdrh     unixLeaveMutex();
5717339eb0b8Sdrh   }
5718339eb0b8Sdrh 
5719d2cb50b7Sdrh #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
5720f0551ee5Saswift   else if( pLockingStyle == &afpIoMethods ){
5721e339d65aSdanielk1977     /* AFP locking uses the file path so it needs to be included in
5722e339d65aSdanielk1977     ** the afpLockingContext.
5723e339d65aSdanielk1977     */
5724e339d65aSdanielk1977     afpLockingContext *pCtx;
5725f3cdcdccSdrh     pNew->lockingContext = pCtx = sqlite3_malloc64( sizeof(*pCtx) );
5726e339d65aSdanielk1977     if( pCtx==0 ){
5727fad3039cSmistachkin       rc = SQLITE_NOMEM_BKPT;
5728e339d65aSdanielk1977     }else{
5729339eb0b8Sdrh       /* NB: zFilename exists and remains valid until the file is closed
5730339eb0b8Sdrh       ** according to requirement F11141.  So we do not need to make a
5731339eb0b8Sdrh       ** copy of the filename. */
5732aebf413dSaswift       pCtx->dbPath = zFilename;
57337ed97b9dSdrh       pCtx->reserved = 0;
5734bfe6631eSdrh       srandomdev();
57356c7d5c5bSdrh       unixEnterMutex();
57368af6c228Sdrh       rc = findInodeInfo(pNew, &pNew->pInode);
57377ed97b9dSdrh       if( rc!=SQLITE_OK ){
57387ed97b9dSdrh         sqlite3_free(pNew->lockingContext);
57390e9365ceSdrh         robust_close(pNew, h, __LINE__);
57407ed97b9dSdrh         h = -1;
57417ed97b9dSdrh       }
57426c7d5c5bSdrh       unixLeaveMutex();
5743e339d65aSdanielk1977     }
57445bb3eb9bSdrh   }
574597185489Schw #endif
5746e339d65aSdanielk1977 
57477708e972Sdrh   else if( pLockingStyle == &dotlockIoMethods ){
5748e339d65aSdanielk1977     /* Dotfile locking uses the file path so it needs to be included in
5749e339d65aSdanielk1977     ** the dotlockLockingContext
5750e339d65aSdanielk1977     */
5751e339d65aSdanielk1977     char *zLockFile;
57525bb3eb9bSdrh     int nFilename;
5753b07028f7Sdrh     assert( zFilename!=0 );
5754ea678832Sdrh     nFilename = (int)strlen(zFilename) + 6;
5755f3cdcdccSdrh     zLockFile = (char *)sqlite3_malloc64(nFilename);
5756e339d65aSdanielk1977     if( zLockFile==0 ){
5757fad3039cSmistachkin       rc = SQLITE_NOMEM_BKPT;
5758e339d65aSdanielk1977     }else{
5759aebf413dSaswift       sqlite3_snprintf(nFilename, zLockFile, "%s" DOTLOCK_SUFFIX, zFilename);
5760339eb0b8Sdrh     }
5761e339d65aSdanielk1977     pNew->lockingContext = zLockFile;
57625bb3eb9bSdrh   }
5763bfe6631eSdrh 
57646c7d5c5bSdrh #if OS_VXWORKS
57657708e972Sdrh   else if( pLockingStyle == &semIoMethods ){
576697185489Schw     /* Named semaphore locking uses the file path so it needs to be
5767734c9864Sdrh     ** included in the semLockingContext
576897185489Schw     */
57696c7d5c5bSdrh     unixEnterMutex();
57708af6c228Sdrh     rc = findInodeInfo(pNew, &pNew->pInode);
57718af6c228Sdrh     if( (rc==SQLITE_OK) && (pNew->pInode->pSem==NULL) ){
57728af6c228Sdrh       char *zSemName = pNew->pInode->aSemName;
577397185489Schw       int n;
57742238dcccSdrh       sqlite3_snprintf(MAX_PATHNAME, zSemName, "/%s.sem",
5775107886abSdrh                        pNew->pId->zCanonicalName);
57762238dcccSdrh       for( n=1; zSemName[n]; n++ )
577797185489Schw         if( zSemName[n]=='/' ) zSemName[n] = '_';
57788af6c228Sdrh       pNew->pInode->pSem = sem_open(zSemName, O_CREAT, 0666, 1);
57798af6c228Sdrh       if( pNew->pInode->pSem == SEM_FAILED ){
5780fad3039cSmistachkin         rc = SQLITE_NOMEM_BKPT;
57818af6c228Sdrh         pNew->pInode->aSemName[0] = '\0';
578297185489Schw       }
578397185489Schw     }
57846c7d5c5bSdrh     unixLeaveMutex();
578597185489Schw   }
578697185489Schw #endif
578797185489Schw 
57884bf66fd6Sdrh   storeLastErrno(pNew, 0);
57896c7d5c5bSdrh #if OS_VXWORKS
579097185489Schw   if( rc!=SQLITE_OK ){
57910e9365ceSdrh     if( h>=0 ) robust_close(pNew, h, __LINE__);
5792309e6555Sdrh     h = -1;
5793036ac7faSdrh     osUnlink(zFilename);
5794c5797545Sdrh     pNew->ctrlFlags |= UNIXFILE_DELETE;
579597185489Schw   }
579697185489Schw #endif
5797e339d65aSdanielk1977   if( rc!=SQLITE_OK ){
57980e9365ceSdrh     if( h>=0 ) robust_close(pNew, h, __LINE__);
5799e339d65aSdanielk1977   }else{
58000c52f5a2Sdrh     pId->pMethods = pLockingStyle;
5801e339d65aSdanielk1977     OpenCounter(+1);
5802fbc7e884Sdrh     verifyDbFile(pNew);
5803e339d65aSdanielk1977   }
580465594045Sdrh   return rc;
5805bfe6631eSdrh }
5806b4b47411Sdanielk1977 
/*
** Candidate directories for temporary files, in the order in which
** they are tried.  The first two slots start out empty.
*/
static const char *azTempDirs[] = {
  0,             /* [0] */
  0,             /* [1] */
  "/var/tmp",    /* [2] */
  "/usr/tmp",    /* [3] */
  "/tmp",        /* [4] */
  "."            /* [5] */
};
5818d9137e3bSdan 
5819d9137e3bSdan /*
5820d9137e3bSdan ** Initialize first two members of azTempDirs[] array.
5821d9137e3bSdan */
unixTempFileInit(void)5822d9137e3bSdan static void unixTempFileInit(void){
5823d9137e3bSdan   azTempDirs[0] = getenv("SQLITE_TMPDIR");
5824d9137e3bSdan   azTempDirs[1] = getenv("TMPDIR");
5825d9137e3bSdan }
5826d9137e3bSdan 
5827d9137e3bSdan /*
5828d9137e3bSdan ** Return the name of a directory in which to put temporary files.
5829d9137e3bSdan ** If no suitable temporary file directory can be found, return NULL.
5830d9137e3bSdan */
unixTempFileDir(void)5831d9137e3bSdan static const char *unixTempFileDir(void){
58322aab11faSdrh   unsigned int i = 0;
58338b3cf82dSdrh   struct stat buf;
5834b7e50ad5Sdrh   const char *zDir = sqlite3_temp_directory;
58358b3cf82dSdrh 
58362aab11faSdrh   while(1){
58372aab11faSdrh     if( zDir!=0
58382aab11faSdrh      && osStat(zDir, &buf)==0
58392aab11faSdrh      && S_ISDIR(buf.st_mode)
58402aab11faSdrh      && osAccess(zDir, 03)==0
58412aab11faSdrh     ){
58428b3cf82dSdrh       return zDir;
58438b3cf82dSdrh     }
5844d9137e3bSdan     if( i>=sizeof(azTempDirs)/sizeof(azTempDirs[0]) ) break;
5845d9137e3bSdan     zDir = azTempDirs[i++];
58462aab11faSdrh   }
58477694e064Sdrh   return 0;
58487694e064Sdrh }
58498b3cf82dSdrh 
58508b3cf82dSdrh /*
58518b3cf82dSdrh ** Create a temporary file name in zBuf.  zBuf must be allocated
58528b3cf82dSdrh ** by the calling process and must be big enough to hold at least
58538b3cf82dSdrh ** pVfs->mxPathname bytes.
58548b3cf82dSdrh */
unixGetTempname(int nBuf,char * zBuf)58558b3cf82dSdrh static int unixGetTempname(int nBuf, char *zBuf){
58568b3cf82dSdrh   const char *zDir;
5857b7e50ad5Sdrh   int iLimit = 0;
585818a3a48dSdrh   int rc = SQLITE_OK;
585917b90b53Sdanielk1977 
586017b90b53Sdanielk1977   /* It's odd to simulate an io-error here, but really this is just
586117b90b53Sdanielk1977   ** using the io-error infrastructure to test that SQLite handles this
586217b90b53Sdanielk1977   ** function failing.
586317b90b53Sdanielk1977   */
58647694e064Sdrh   zBuf[0] = 0;
586517b90b53Sdanielk1977   SimulateIOError( return SQLITE_IOERR );
586617b90b53Sdanielk1977 
586718a3a48dSdrh   sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR));
58687234c6d6Sdrh   zDir = unixTempFileDir();
586918a3a48dSdrh   if( zDir==0 ){
587018a3a48dSdrh     rc = SQLITE_IOERR_GETTEMPPATH;
587118a3a48dSdrh   }else{
587217b90b53Sdanielk1977     do{
5873970942e4Sdrh       u64 r;
5874970942e4Sdrh       sqlite3_randomness(sizeof(r), &r);
5875970942e4Sdrh       assert( nBuf>2 );
5876970942e4Sdrh       zBuf[nBuf-2] = 0;
5877970942e4Sdrh       sqlite3_snprintf(nBuf, zBuf, "%s/"SQLITE_TEMP_FILE_PREFIX"%llx%c",
5878970942e4Sdrh                        zDir, r, 0);
587918a3a48dSdrh       if( zBuf[nBuf-2]!=0 || (iLimit++)>10 ){
588018a3a48dSdrh         rc = SQLITE_ERROR;
588118a3a48dSdrh         break;
588218a3a48dSdrh       }
588399ab3b12Sdrh     }while( osAccess(zBuf,0)==0 );
588418a3a48dSdrh   }
588518a3a48dSdrh   sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR));
588618a3a48dSdrh   return rc;
588717b90b53Sdanielk1977 }
588817b90b53Sdanielk1977 
#if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
/*
** Forward declaration of the routine that converts a unixFile into a
** proxy-locking unixFile.  It is implemented in the proxy-lock division
** but is used by unixOpen() if SQLITE_PREFER_PROXY_LOCKING is defined.
*/
static int proxyTransformUnixFile(unixFile *pFile, const char *zPath);
#endif
5897c66d5b64Sdrh 
589808da86a6Sdan /*
589908da86a6Sdan ** Search for an unused file descriptor that was opened on the database
5900067b92baSdrh ** file (not a journal or super-journal file) identified by pathname
590108da86a6Sdan ** zPath with SQLITE_OPEN_XXX flags matching those passed as the second
590208da86a6Sdan ** argument to this function.
590308da86a6Sdan **
590408da86a6Sdan ** Such a file descriptor may exist if a database connection was closed
590508da86a6Sdan ** but the associated file descriptor could not be closed because some
590608da86a6Sdan ** other file descriptor open on the same file is holding a file-lock.
590708da86a6Sdan ** Refer to comments in the unixClose() function and the lengthy comment
590808da86a6Sdan ** describing "Posix Advisory Locking" at the start of this file for
590908da86a6Sdan ** further details. Also, ticket #4018.
591008da86a6Sdan **
591108da86a6Sdan ** If a suitable file descriptor is found, then it is returned. If no
591208da86a6Sdan ** such file descriptor is located, -1 is returned.
591308da86a6Sdan */
findReusableFd(const char * zPath,int flags)5914e946c396Sdan static UnixUnusedFd *findReusableFd(const char *zPath, int flags){
5915e946c396Sdan   UnixUnusedFd *pUnused = 0;
5916e946c396Sdan 
5917e946c396Sdan   /* Do not search for an unused file descriptor on vxworks. Not because
5918e946c396Sdan   ** vxworks would not benefit from the change (it might, we're not sure),
5919e946c396Sdan   ** but because no way to test it is currently available. It is better
5920e946c396Sdan   ** not to risk breaking vxworks support for the sake of such an obscure
5921e946c396Sdan   ** feature.  */
5922e946c396Sdan #if !OS_VXWORKS
592308da86a6Sdan   struct stat sStat;                   /* Results of stat() call */
592408da86a6Sdan 
5925c68886bbSdrh   unixEnterMutex();
5926c68886bbSdrh 
592708da86a6Sdan   /* A stat() call may fail for various reasons. If this happens, it is
592808da86a6Sdan   ** almost certain that an open() call on the same path will also fail.
592908da86a6Sdan   ** For this reason, if an error occurs in the stat() call here, it is
593008da86a6Sdan   ** ignored and -1 is returned. The caller will try to open a new file
593108da86a6Sdan   ** descriptor on the same path, fail, and return an error to SQLite.
593208da86a6Sdan   **
593308da86a6Sdan   ** Even if a subsequent open() call does succeed, the consequences of
593460ec914cSpeter.d.reid   ** not searching for a reusable file descriptor are not dire.  */
5935095908e1Sdrh   if( inodeList!=0 && 0==osStat(zPath, &sStat) ){
5936d91c68f6Sdrh     unixInodeInfo *pInode;
593708da86a6Sdan 
59388af6c228Sdrh     pInode = inodeList;
59398af6c228Sdrh     while( pInode && (pInode->fileId.dev!=sStat.st_dev
594025ef7f55Sdrh                      || pInode->fileId.ino!=(u64)sStat.st_ino) ){
59418af6c228Sdrh        pInode = pInode->pNext;
59429061ad10Sdrh     }
59438af6c228Sdrh     if( pInode ){
5944e946c396Sdan       UnixUnusedFd **pp;
5945095908e1Sdrh       assert( sqlite3_mutex_notheld(pInode->pLockMutex) );
5946095908e1Sdrh       sqlite3_mutex_enter(pInode->pLockMutex);
594755220a6cSdrh       flags &= (SQLITE_OPEN_READONLY|SQLITE_OPEN_READWRITE);
59488af6c228Sdrh       for(pp=&pInode->pUnused; *pp && (*pp)->flags!=flags; pp=&((*pp)->pNext));
5949e946c396Sdan       pUnused = *pp;
5950e946c396Sdan       if( pUnused ){
5951e946c396Sdan         *pp = pUnused->pNext;
595208da86a6Sdan       }
5953095908e1Sdrh       sqlite3_mutex_leave(pInode->pLockMutex);
595408da86a6Sdan     }
595508da86a6Sdan   }
5956c68886bbSdrh   unixLeaveMutex();
5957e946c396Sdan #endif    /* if !OS_VXWORKS */
5958e946c396Sdan   return pUnused;
595908da86a6Sdan }
596017b90b53Sdanielk1977 
596117b90b53Sdanielk1977 /*
59621bf4ca7cSdan ** Find the mode, uid and gid of file zFile.
59631bf4ca7cSdan */
getFileMode(const char * zFile,mode_t * pMode,uid_t * pUid,gid_t * pGid)59641bf4ca7cSdan static int getFileMode(
59651bf4ca7cSdan   const char *zFile,              /* File name */
59661bf4ca7cSdan   mode_t *pMode,                  /* OUT: Permissions of zFile */
59671bf4ca7cSdan   uid_t *pUid,                    /* OUT: uid of zFile. */
59681bf4ca7cSdan   gid_t *pGid                     /* OUT: gid of zFile. */
59691bf4ca7cSdan ){
59701bf4ca7cSdan   struct stat sStat;              /* Output of stat() on database file */
59711bf4ca7cSdan   int rc = SQLITE_OK;
59721bf4ca7cSdan   if( 0==osStat(zFile, &sStat) ){
59731bf4ca7cSdan     *pMode = sStat.st_mode & 0777;
59741bf4ca7cSdan     *pUid = sStat.st_uid;
59751bf4ca7cSdan     *pGid = sStat.st_gid;
59761bf4ca7cSdan   }else{
59771bf4ca7cSdan     rc = SQLITE_IOERR_FSTAT;
59781bf4ca7cSdan   }
59791bf4ca7cSdan   return rc;
59801bf4ca7cSdan }
59811bf4ca7cSdan 
59821bf4ca7cSdan /*
5983ddb0ac4bSdan ** This function is called by unixOpen() to determine the unix permissions
5984f65bc911Sdrh ** to create new files with. If no error occurs, then SQLITE_OK is returned
5985ddb0ac4bSdan ** and a value suitable for passing as the third argument to open(2) is
5986ddb0ac4bSdan ** written to *pMode. If an IO error occurs, an SQLite error code is
5987ddb0ac4bSdan ** returned and the value of *pMode is not modified.
5988ddb0ac4bSdan **
598960ec914cSpeter.d.reid ** In most cases, this routine sets *pMode to 0, which will become
59908c815d14Sdrh ** an indication to robust_open() to create the file using
59918c815d14Sdrh ** SQLITE_DEFAULT_FILE_PERMISSIONS adjusted by the umask.
59928c815d14Sdrh ** But if the file being opened is a WAL or regular journal file, then
59938ab58665Sdrh ** this function queries the file-system for the permissions on the
59948ab58665Sdrh ** corresponding database file and sets *pMode to this value. Whenever
59958ab58665Sdrh ** possible, WAL and journal files are created using the same permissions
59968ab58665Sdrh ** as the associated database file.
599781cc5163Sdrh **
599881cc5163Sdrh ** If the SQLITE_ENABLE_8_3_NAMES option is enabled, then the
599981cc5163Sdrh ** original filename is unavailable.  But 8_3_NAMES is only used for
600081cc5163Sdrh ** FAT filesystems and permissions do not matter there, so just use
60011116b178Sdrh ** the default permissions.  In 8_3_NAMES mode, leave *pMode set to zero.
6002ddb0ac4bSdan */
findCreateFileMode(const char * zPath,int flags,mode_t * pMode,uid_t * pUid,gid_t * pGid)6003ddb0ac4bSdan static int findCreateFileMode(
6004ddb0ac4bSdan   const char *zPath,              /* Path of file (possibly) being created */
6005ddb0ac4bSdan   int flags,                      /* Flags passed as 4th argument to xOpen() */
6006ac7c3ac1Sdrh   mode_t *pMode,                  /* OUT: Permissions to open file with */
6007ac7c3ac1Sdrh   uid_t *pUid,                    /* OUT: uid to set on the file */
6008ac7c3ac1Sdrh   gid_t *pGid                     /* OUT: gid to set on the file */
6009ddb0ac4bSdan ){
6010ddb0ac4bSdan   int rc = SQLITE_OK;             /* Return Code */
60118c815d14Sdrh   *pMode = 0;
6012ac7c3ac1Sdrh   *pUid = 0;
6013ac7c3ac1Sdrh   *pGid = 0;
60148ab58665Sdrh   if( flags & (SQLITE_OPEN_WAL|SQLITE_OPEN_MAIN_JOURNAL) ){
6015ddb0ac4bSdan     char zDb[MAX_PATHNAME+1];     /* Database file path */
6016ddb0ac4bSdan     int nDb;                      /* Number of valid bytes in zDb */
6017ddb0ac4bSdan 
6018a0c989ddSdan     /* zPath is a path to a WAL or journal file. The following block derives
6019a0c989ddSdan     ** the path to the associated database file from zPath. This block handles
6020a0c989ddSdan     ** the following naming conventions:
6021a0c989ddSdan     **
6022a0c989ddSdan     **   "<path to db>-journal"
6023a0c989ddSdan     **   "<path to db>-wal"
602481cc5163Sdrh     **   "<path to db>-journalNN"
602581cc5163Sdrh     **   "<path to db>-walNN"
6026a0c989ddSdan     **
6027d337c5bdSdrh     ** where NN is a decimal number. The NN naming schemes are
6028a0c989ddSdan     ** used by the test_multiplex.c module.
6029577f0a1eSdrh     **
6030577f0a1eSdrh     ** In normal operation, the journal file name will always contain
6031629ec14aSdan     ** a '-' character.  However in 8+3 filename mode, or if a corrupt
6032067b92baSdrh     ** rollback journal specifies a super-journal with a goofy name, then
6033577f0a1eSdrh     ** the '-' might be missing or the '-' might be the first character in
6034577f0a1eSdrh     ** the filename.  In that case, just return SQLITE_OK with *pMode==0.
6035577f0a1eSdrh     */
6036577f0a1eSdrh     nDb = sqlite3Strlen30(zPath) - 1;
6037577f0a1eSdrh     while( nDb>0 && zPath[nDb]!='.' ){
6038577f0a1eSdrh       if( zPath[nDb]=='-' ){
6039ddb0ac4bSdan         memcpy(zDb, zPath, nDb);
6040ddb0ac4bSdan         zDb[nDb] = '\0';
60411bf4ca7cSdan         rc = getFileMode(zDb, pMode, pUid, pGid);
6042577f0a1eSdrh         break;
6043577f0a1eSdrh       }
6044577f0a1eSdrh       nDb--;
6045577f0a1eSdrh     }
6046ddb0ac4bSdan   }else if( flags & SQLITE_OPEN_DELETEONCLOSE ){
6047ddb0ac4bSdan     *pMode = 0600;
60481bf4ca7cSdan   }else if( flags & SQLITE_OPEN_URI ){
60491bf4ca7cSdan     /* If this is a main database file and the file was opened using a URI
60501bf4ca7cSdan     ** filename, check for the "modeof" parameter. If present, interpret
60511bf4ca7cSdan     ** its value as a filename and try to copy the mode, uid and gid from
60521bf4ca7cSdan     ** that file.  */
60531bf4ca7cSdan     const char *z = sqlite3_uri_parameter(zPath, "modeof");
60541bf4ca7cSdan     if( z ){
60551bf4ca7cSdan       rc = getFileMode(z, pMode, pUid, pGid);
60561bf4ca7cSdan     }
6057ddb0ac4bSdan   }
6058ddb0ac4bSdan   return rc;
6059ddb0ac4bSdan }
6060ddb0ac4bSdan 
6061ddb0ac4bSdan /*
6062ad94b58aSdanielk1977 ** Open the file zPath.
6063ad94b58aSdanielk1977 **
6064b4b47411Sdanielk1977 ** Previously, the SQLite OS layer used three functions in place of this
6065b4b47411Sdanielk1977 ** one:
6066b4b47411Sdanielk1977 **
6067b4b47411Sdanielk1977 **     sqlite3OsOpenReadWrite();
6068b4b47411Sdanielk1977 **     sqlite3OsOpenReadOnly();
6069b4b47411Sdanielk1977 **     sqlite3OsOpenExclusive();
6070b4b47411Sdanielk1977 **
6071b4b47411Sdanielk1977 ** These calls correspond to the following combinations of flags:
6072b4b47411Sdanielk1977 **
6073b4b47411Sdanielk1977 **     ReadWrite() ->     (READWRITE | CREATE)
6074b4b47411Sdanielk1977 **     ReadOnly()  ->     (READONLY)
6075b4b47411Sdanielk1977 **     OpenExclusive() -> (READWRITE | CREATE | EXCLUSIVE)
6076b4b47411Sdanielk1977 **
6077b4b47411Sdanielk1977 ** The old OpenExclusive() accepted a boolean argument - "delFlag". If
6078b4b47411Sdanielk1977 ** true, the file was configured to be automatically deleted when the
6079b4b47411Sdanielk1977 ** file handle closed. To achieve the same effect using this new
6080b4b47411Sdanielk1977 ** interface, add the DELETEONCLOSE flag to those specified above for
6081b4b47411Sdanielk1977 ** OpenExclusive().
6082b4b47411Sdanielk1977 */
unixOpen(sqlite3_vfs * pVfs,const char * zPath,sqlite3_file * pFile,int flags,int * pOutFlags)6083b4b47411Sdanielk1977 static int unixOpen(
60846b9d6ddcSdrh   sqlite3_vfs *pVfs,           /* The VFS for which this is the xOpen method */
60856b9d6ddcSdrh   const char *zPath,           /* Pathname of file to be opened */
60866b9d6ddcSdrh   sqlite3_file *pFile,         /* The file descriptor to be filled in */
60876b9d6ddcSdrh   int flags,                   /* Input flags to control the opening */
60886b9d6ddcSdrh   int *pOutFlags               /* Output flags returned to SQLite core */
6089b4b47411Sdanielk1977 ){
609008da86a6Sdan   unixFile *p = (unixFile *)pFile;
6091577d674fSdrh   int fd = -1;                   /* File descriptor returned by open() */
6092734c9864Sdrh   int openFlags = 0;             /* Flags to pass to open() */
6093c398c65bSdrh   int eType = flags&0x0FFF00;  /* Type of file to open */
6094da0e768bSdrh   int noLock;                    /* True to omit locking primitives */
609508da86a6Sdan   int rc = SQLITE_OK;            /* Function Return Code */
6096c02a43afSdrh   int ctrlFlags = 0;             /* UNIXFILE_* flags */
6097b4b47411Sdanielk1977 
6098b4b47411Sdanielk1977   int isExclusive  = (flags & SQLITE_OPEN_EXCLUSIVE);
6099b4b47411Sdanielk1977   int isDelete     = (flags & SQLITE_OPEN_DELETEONCLOSE);
6100b4b47411Sdanielk1977   int isCreate     = (flags & SQLITE_OPEN_CREATE);
6101b4b47411Sdanielk1977   int isReadonly   = (flags & SQLITE_OPEN_READONLY);
6102b4b47411Sdanielk1977   int isReadWrite  = (flags & SQLITE_OPEN_READWRITE);
61037ed97b9dSdrh #if SQLITE_ENABLE_LOCKING_STYLE
61047ed97b9dSdrh   int isAutoProxy  = (flags & SQLITE_OPEN_AUTOPROXY);
61057ed97b9dSdrh #endif
61063d4435b2Sdrh #if defined(__APPLE__) || SQLITE_ENABLE_LOCKING_STYLE
61073d4435b2Sdrh   struct statfs fsInfo;
61083d4435b2Sdrh #endif
6109b4b47411Sdanielk1977 
6110067b92baSdrh   /* If creating a super- or main-file journal, this function will open
6111fee2d25aSdanielk1977   ** a file-descriptor on the directory too. The first time unixSync()
6112fee2d25aSdanielk1977   ** is called the directory file descriptor will be fsync()ed and close()d.
6113fee2d25aSdanielk1977   */
6114a803a2cdSdrh   int isNewJrnl = (isCreate && (
6115ccb2113aSdrh         eType==SQLITE_OPEN_SUPER_JOURNAL
6116ddb0ac4bSdan      || eType==SQLITE_OPEN_MAIN_JOURNAL
6117ddb0ac4bSdan      || eType==SQLITE_OPEN_WAL
6118ddb0ac4bSdan   ));
6119fee2d25aSdanielk1977 
612017b90b53Sdanielk1977   /* If argument zPath is a NULL pointer, this function is required to open
612117b90b53Sdanielk1977   ** a temporary file. Use this buffer to store the file name in.
612217b90b53Sdanielk1977   */
6123c02a43afSdrh   char zTmpname[MAX_PATHNAME+2];
612417b90b53Sdanielk1977   const char *zName = zPath;
612517b90b53Sdanielk1977 
6126fee2d25aSdanielk1977   /* Check the following statements are true:
6127fee2d25aSdanielk1977   **
6128fee2d25aSdanielk1977   **   (a) Exactly one of the READWRITE and READONLY flags must be set, and
6129fee2d25aSdanielk1977   **   (b) if CREATE is set, then READWRITE must also be set, and
6130fee2d25aSdanielk1977   **   (c) if EXCLUSIVE is set, then CREATE must also be set.
613133f4e02aSdrh   **   (d) if DELETEONCLOSE is set, then CREATE must also be set.
6132fee2d25aSdanielk1977   */
6133b4b47411Sdanielk1977   assert((isReadonly==0 || isReadWrite==0) && (isReadWrite || isReadonly));
6134b4b47411Sdanielk1977   assert(isCreate==0 || isReadWrite);
6135b4b47411Sdanielk1977   assert(isExclusive==0 || isCreate);
613633f4e02aSdrh   assert(isDelete==0 || isCreate);
613733f4e02aSdrh 
6138067b92baSdrh   /* The main DB, main journal, WAL file and super-journal are never
6139ddb0ac4bSdan   ** automatically deleted. Nor are they ever temporary files.  */
614008da86a6Sdan   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_MAIN_DB );
614108da86a6Sdan   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_MAIN_JOURNAL );
6142ccb2113aSdrh   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_SUPER_JOURNAL );
6143ddb0ac4bSdan   assert( (!isDelete && zName) || eType!=SQLITE_OPEN_WAL );
6144b4b47411Sdanielk1977 
6145fee2d25aSdanielk1977   /* Assert that the upper layer has set one of the "file-type" flags. */
6146fee2d25aSdanielk1977   assert( eType==SQLITE_OPEN_MAIN_DB      || eType==SQLITE_OPEN_TEMP_DB
6147fee2d25aSdanielk1977        || eType==SQLITE_OPEN_MAIN_JOURNAL || eType==SQLITE_OPEN_TEMP_JOURNAL
6148ccb2113aSdrh        || eType==SQLITE_OPEN_SUBJOURNAL   || eType==SQLITE_OPEN_SUPER_JOURNAL
6149ddb0ac4bSdan        || eType==SQLITE_OPEN_TRANSIENT_DB || eType==SQLITE_OPEN_WAL
6150fee2d25aSdanielk1977   );
6151fee2d25aSdanielk1977 
6152b00d8621Sdrh   /* Detect a pid change and reset the PRNG.  There is a race condition
6153b00d8621Sdrh   ** here such that two or more threads all trying to open databases at
6154b00d8621Sdrh   ** the same instant might all reset the PRNG.  But multiple resets
6155b00d8621Sdrh   ** are harmless.
6156b00d8621Sdrh   */
61575ac93652Sdrh   if( randomnessPid!=osGetpid(0) ){
61585ac93652Sdrh     randomnessPid = osGetpid(0);
6159b00d8621Sdrh     sqlite3_randomness(0,0);
6160b00d8621Sdrh   }
616108da86a6Sdan   memset(p, 0, sizeof(unixFile));
6162e339d65aSdanielk1977 
6163f81b40a5Sdrh #ifdef SQLITE_ASSERT_NO_FILES
6164f81b40a5Sdrh   /* Applications that never read or write persistent disk files */
6165f81b40a5Sdrh   assert( zName==0 );
6166f81b40a5Sdrh #endif
6167f81b40a5Sdrh 
616808da86a6Sdan   if( eType==SQLITE_OPEN_MAIN_DB ){
6169e946c396Sdan     UnixUnusedFd *pUnused;
6170e946c396Sdan     pUnused = findReusableFd(zName, flags);
6171e946c396Sdan     if( pUnused ){
6172e946c396Sdan       fd = pUnused->fd;
6173e946c396Sdan     }else{
6174f3cdcdccSdrh       pUnused = sqlite3_malloc64(sizeof(*pUnused));
6175e946c396Sdan       if( !pUnused ){
6176fad3039cSmistachkin         return SQLITE_NOMEM_BKPT;
6177e946c396Sdan       }
6178e946c396Sdan     }
6179c68886bbSdrh     p->pPreallocatedUnused = pUnused;
6180c02a43afSdrh 
6181c02a43afSdrh     /* Database filenames are double-zero terminated if they are not
6182c02a43afSdrh     ** URIs with parameters.  Hence, they can always be passed into
6183c02a43afSdrh     ** sqlite3_uri_parameter(). */
6184c02a43afSdrh     assert( (flags & SQLITE_OPEN_URI) || zName[strlen(zName)+1]==0 );
6185c02a43afSdrh 
618608da86a6Sdan   }else if( !zName ){
618708da86a6Sdan     /* If zName is NULL, the upper layer is requesting a temp file. */
6188a803a2cdSdrh     assert(isDelete && !isNewJrnl);
6189b7e50ad5Sdrh     rc = unixGetTempname(pVfs->mxPathname, zTmpname);
619017b90b53Sdanielk1977     if( rc!=SQLITE_OK ){
619117b90b53Sdanielk1977       return rc;
619217b90b53Sdanielk1977     }
619317b90b53Sdanielk1977     zName = zTmpname;
6194c02a43afSdrh 
6195c02a43afSdrh     /* Generated temporary filenames are always double-zero terminated
6196c02a43afSdrh     ** for use by sqlite3_uri_parameter(). */
6197c02a43afSdrh     assert( zName[strlen(zName)+1]==0 );
619817b90b53Sdanielk1977   }
619917b90b53Sdanielk1977 
620008da86a6Sdan   /* Determine the value of the flags parameter passed to POSIX function
620108da86a6Sdan   ** open(). These must be calculated even if open() is not called, as
620208da86a6Sdan   ** they may be stored as part of the file handle and used by the
620308da86a6Sdan   ** 'conch file' locking functions later on.  */
6204734c9864Sdrh   if( isReadonly )  openFlags |= O_RDONLY;
6205734c9864Sdrh   if( isReadWrite ) openFlags |= O_RDWR;
6206734c9864Sdrh   if( isCreate )    openFlags |= O_CREAT;
6207734c9864Sdrh   if( isExclusive ) openFlags |= (O_EXCL|O_NOFOLLOW);
6208c398c65bSdrh   openFlags |= (O_LARGEFILE|O_BINARY|O_NOFOLLOW);
6209b4b47411Sdanielk1977 
621008da86a6Sdan   if( fd<0 ){
6211ddb0ac4bSdan     mode_t openMode;              /* Permissions to create file with */
6212ac7c3ac1Sdrh     uid_t uid;                    /* Userid for the file */
6213ac7c3ac1Sdrh     gid_t gid;                    /* Groupid for the file */
6214ac7c3ac1Sdrh     rc = findCreateFileMode(zName, flags, &openMode, &uid, &gid);
6215ddb0ac4bSdan     if( rc!=SQLITE_OK ){
6216c68886bbSdrh       assert( !p->pPreallocatedUnused );
62178ab58665Sdrh       assert( eType==SQLITE_OPEN_WAL || eType==SQLITE_OPEN_MAIN_JOURNAL );
6218ddb0ac4bSdan       return rc;
6219ddb0ac4bSdan     }
6220ad4f1e54Sdrh     fd = robust_open(zName, openFlags, openMode);
6221308c2a5cSdrh     OSTRACE(("OPENX   %-3d %s 0%o\n", fd, zName, openFlags));
62225a2d970aSdrh     assert( !isExclusive || (openFlags & O_CREAT)!=0 );
6223a688ca5eSdan     if( fd<0 ){
6224a688ca5eSdan       if( isNewJrnl && errno==EACCES && osAccess(zName, F_OK) ){
6225a688ca5eSdan         /* If unable to create a journal because the directory is not
6226a688ca5eSdan         ** writable, change the error code to indicate that. */
6227a688ca5eSdan         rc = SQLITE_READONLY_DIRECTORY;
6228a688ca5eSdan       }else if( errno!=EISDIR && isReadWrite ){
6229b4b47411Sdanielk1977         /* Failed to open the file for read/write access. Try read-only. */
6230b4b47411Sdanielk1977         flags &= ~(SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE);
6231e946c396Sdan         openFlags &= ~(O_RDWR|O_CREAT);
6232b4b47411Sdanielk1977         flags |= SQLITE_OPEN_READONLY;
6233e946c396Sdan         openFlags |= O_RDONLY;
62347719711bSdrh         isReadonly = 1;
6235ad4f1e54Sdrh         fd = robust_open(zName, openFlags, openMode);
6236b4b47411Sdanielk1977       }
62379898c4a0Sdan     }
6238a688ca5eSdan     if( fd<0 ){
6239a688ca5eSdan       int rc2 = unixLogError(SQLITE_CANTOPEN_BKPT, "open", zName);
6240a688ca5eSdan       if( rc==SQLITE_OK ) rc = rc2;
6241e946c396Sdan       goto open_finished;
6242b4b47411Sdanielk1977     }
6243ac7c3ac1Sdrh 
62441116b178Sdrh     /* The owner of the rollback journal or WAL file should always be the
62451116b178Sdrh     ** same as the owner of the database file.  Try to ensure that this is
62461116b178Sdrh     ** the case.  The chown() system call will be a no-op if the current
62471116b178Sdrh     ** process lacks root privileges, but we should at least try.  Without
62481116b178Sdrh     ** this step, if a root process opens a database file, it can leave
62491116b178Sdrh     ** behind a journal/WAL that is owned by root and hence make the
62501116b178Sdrh     ** database inaccessible to unprivileged processes.
62511116b178Sdrh     **
6252edf8a7bfSdrh     ** If openMode==0, then that means uid and gid are not set correctly
62531116b178Sdrh     ** (probably because SQLite is configured to use 8+3 filename mode) and
62541116b178Sdrh     ** in that case we do not want to attempt the chown().
6255ac7c3ac1Sdrh     */
6256edf8a7bfSdrh     if( openMode && (flags & (SQLITE_OPEN_WAL|SQLITE_OPEN_MAIN_JOURNAL))!=0 ){
62576226ca2aSdrh       robustFchown(fd, uid, gid);
6258ac7c3ac1Sdrh     }
625908da86a6Sdan   }
626008da86a6Sdan   assert( fd>=0 );
626108da86a6Sdan   if( pOutFlags ){
626208da86a6Sdan     *pOutFlags = flags;
626308da86a6Sdan   }
626408da86a6Sdan 
6265c68886bbSdrh   if( p->pPreallocatedUnused ){
6266c68886bbSdrh     p->pPreallocatedUnused->fd = fd;
626755220a6cSdrh     p->pPreallocatedUnused->flags =
626855220a6cSdrh                           flags & (SQLITE_OPEN_READONLY|SQLITE_OPEN_READWRITE);
6269e946c396Sdan   }
6270e946c396Sdan 
6271b4b47411Sdanielk1977   if( isDelete ){
62726c7d5c5bSdrh #if OS_VXWORKS
627397185489Schw     zPath = zName;
62740bdbc90dSdrh #elif defined(SQLITE_UNLINK_AFTER_CLOSE)
62750bdbc90dSdrh     zPath = sqlite3_mprintf("%s", zName);
62760bdbc90dSdrh     if( zPath==0 ){
62770bdbc90dSdrh       robust_close(p, fd, __LINE__);
6278fad3039cSmistachkin       return SQLITE_NOMEM_BKPT;
62790bdbc90dSdrh     }
628097185489Schw #else
6281036ac7faSdrh     osUnlink(zName);
628297185489Schw #endif
628341022645Sdrh   }
628441022645Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
628541022645Sdrh   else{
628608da86a6Sdan     p->openFlags = openFlags;
628708c6d446Sdrh   }
628808c6d446Sdrh #endif
628908c6d446Sdrh 
62907ed97b9dSdrh #if defined(__APPLE__) || SQLITE_ENABLE_LOCKING_STYLE
62917ed97b9dSdrh   if( fstatfs(fd, &fsInfo) == -1 ){
62924bf66fd6Sdrh     storeLastErrno(p, errno);
62930e9365ceSdrh     robust_close(p, fd, __LINE__);
62947ed97b9dSdrh     return SQLITE_IOERR_ACCESS;
62957ed97b9dSdrh   }
62967ed97b9dSdrh   if (0 == strncmp("msdos", fsInfo.f_fstypename, 5)) {
62977ed97b9dSdrh     ((unixFile*)pFile)->fsFlags |= SQLITE_FSFLAGS_IS_MSDOS;
62987ed97b9dSdrh   }
62994bf66fd6Sdrh   if (0 == strncmp("exfat", fsInfo.f_fstypename, 5)) {
63004bf66fd6Sdrh     ((unixFile*)pFile)->fsFlags |= SQLITE_FSFLAGS_IS_MSDOS;
63014bf66fd6Sdrh   }
63027ed97b9dSdrh #endif
63037ed97b9dSdrh 
6304c02a43afSdrh   /* Set up appropriate ctrlFlags */
6305c02a43afSdrh   if( isDelete )                ctrlFlags |= UNIXFILE_DELETE;
6306c02a43afSdrh   if( isReadonly )              ctrlFlags |= UNIXFILE_RDONLY;
630786151e89Sdrh   noLock = eType!=SQLITE_OPEN_MAIN_DB;
6308c02a43afSdrh   if( noLock )                  ctrlFlags |= UNIXFILE_NOLOCK;
6309a803a2cdSdrh   if( isNewJrnl )               ctrlFlags |= UNIXFILE_DIRSYNC;
6310c02a43afSdrh   if( flags & SQLITE_OPEN_URI ) ctrlFlags |= UNIXFILE_URI;
6311c02a43afSdrh 
63127ed97b9dSdrh #if SQLITE_ENABLE_LOCKING_STYLE
6313aebf413dSaswift #if SQLITE_PREFER_PROXY_LOCKING
63147ed97b9dSdrh   isAutoProxy = 1;
63157ed97b9dSdrh #endif
63167ed97b9dSdrh   if( isAutoProxy && (zPath!=NULL) && (!noLock) && pVfs->xOpen ){
6317aebf413dSaswift     char *envforce = getenv("SQLITE_FORCE_PROXY_LOCKING");
6318aebf413dSaswift     int useProxy = 0;
6319aebf413dSaswift 
632008da86a6Sdan     /* SQLITE_FORCE_PROXY_LOCKING==1 means force always use proxy, 0 means
632108da86a6Sdan     ** never use proxy, NULL means use proxy for non-local files only.  */
6322aebf413dSaswift     if( envforce!=NULL ){
6323aebf413dSaswift       useProxy = atoi(envforce)>0;
6324aebf413dSaswift     }else{
6325aebf413dSaswift       useProxy = !(fsInfo.f_flags&MNT_LOCAL);
6326aebf413dSaswift     }
6327aebf413dSaswift     if( useProxy ){
6328c02a43afSdrh       rc = fillInUnixFile(pVfs, fd, pFile, zPath, ctrlFlags);
6329aebf413dSaswift       if( rc==SQLITE_OK ){
6330715ff30eSdrh         rc = proxyTransformUnixFile((unixFile*)pFile, ":auto:");
63317ed97b9dSdrh         if( rc!=SQLITE_OK ){
63327ed97b9dSdrh           /* Use unixClose to clean up the resources added in fillInUnixFile
63337ed97b9dSdrh           ** and clear all the structure's references.  Specifically,
63347ed97b9dSdrh           ** pFile->pMethods will be NULL so sqlite3OsClose will be a no-op
63357ed97b9dSdrh           */
63367ed97b9dSdrh           unixClose(pFile);
63377ed97b9dSdrh           return rc;
63387ed97b9dSdrh         }
6339aebf413dSaswift       }
6340e946c396Sdan       goto open_finished;
6341aebf413dSaswift     }
6342aebf413dSaswift   }
6343aebf413dSaswift #endif
6344aebf413dSaswift 
63453ed0f1c9Sdan   assert( zPath==0 || zPath[0]=='/'
6346ccb2113aSdrh       || eType==SQLITE_OPEN_SUPER_JOURNAL || eType==SQLITE_OPEN_MAIN_JOURNAL
63473ed0f1c9Sdan   );
6348c02a43afSdrh   rc = fillInUnixFile(pVfs, fd, pFile, zPath, ctrlFlags);
6349c02a43afSdrh 
6350e946c396Sdan open_finished:
6351e946c396Sdan   if( rc!=SQLITE_OK ){
6352c68886bbSdrh     sqlite3_free(p->pPreallocatedUnused);
6353b4b47411Sdanielk1977   }
6354e946c396Sdan   return rc;
6355e946c396Sdan }
6356e946c396Sdan 
6357b4b47411Sdanielk1977 
6358b4b47411Sdanielk1977 /*
6359fee2d25aSdanielk1977 ** Delete the file at zPath. If the dirSync argument is true, fsync()
6360fee2d25aSdanielk1977 ** the directory after deleting the file.
6361b4b47411Sdanielk1977 */
unixDelete(sqlite3_vfs * NotUsed,const char * zPath,int dirSync)63626b9d6ddcSdrh static int unixDelete(
63636b9d6ddcSdrh   sqlite3_vfs *NotUsed,     /* VFS containing this as the xDelete method */
63646b9d6ddcSdrh   const char *zPath,        /* Name of file to be deleted */
63656b9d6ddcSdrh   int dirSync               /* If true, fsync() directory after deleting file */
63666b9d6ddcSdrh ){
6367fee2d25aSdanielk1977   int rc = SQLITE_OK;
6368397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed);
6369b4b47411Sdanielk1977   SimulateIOError(return SQLITE_IOERR_DELETE);
63709fc5b4a5Sdan   if( osUnlink(zPath)==(-1) ){
6371bd945545Sdrh     if( errno==ENOENT
6372bd945545Sdrh #if OS_VXWORKS
637319541f30Sdrh         || osAccess(zPath,0)!=0
6374bd945545Sdrh #endif
6375bd945545Sdrh     ){
63769fc5b4a5Sdan       rc = SQLITE_IOERR_DELETE_NOENT;
63779fc5b4a5Sdan     }else{
6378b4308167Sdrh       rc = unixLogError(SQLITE_IOERR_DELETE, "unlink", zPath);
63799fc5b4a5Sdan     }
6380b4308167Sdrh     return rc;
63815d4feffeSdrh   }
6382d39fa70cSdanielk1977 #ifndef SQLITE_DISABLE_DIRSYNC
6383e349519fSdrh   if( (dirSync & 1)!=0 ){
6384fee2d25aSdanielk1977     int fd;
638590315a24Sdrh     rc = osOpenDirectory(zPath, &fd);
6386fee2d25aSdanielk1977     if( rc==SQLITE_OK ){
63876d258995Sdrh       if( full_fsync(fd,0,0) ){
6388e18d4953Sdan         rc = unixLogError(SQLITE_IOERR_DIR_FSYNC, "fsync", zPath);
6389fee2d25aSdanielk1977       }
63900e9365ceSdrh       robust_close(0, fd, __LINE__);
6391acb6b28dSdrh     }else{
6392acb6b28dSdrh       assert( rc==SQLITE_CANTOPEN );
63931ee6f740Sdrh       rc = SQLITE_OK;
6394fee2d25aSdanielk1977     }
6395fee2d25aSdanielk1977   }
6396d138dd86Sdanielk1977 #endif
6397fee2d25aSdanielk1977   return rc;
6398b4b47411Sdanielk1977 }
6399b4b47411Sdanielk1977 
640090949c20Sdanielk1977 /*
640148864df9Smistachkin ** Test the existence of or access permissions of file zPath. The
640290949c20Sdanielk1977 ** test performed depends on the value of flags:
640390949c20Sdanielk1977 **
640490949c20Sdanielk1977 **     SQLITE_ACCESS_EXISTS: Return 1 if the file exists
640590949c20Sdanielk1977 **     SQLITE_ACCESS_READWRITE: Return 1 if the file is read and writable.
640690949c20Sdanielk1977 **     SQLITE_ACCESS_READONLY: Return 1 if the file is readable.
640790949c20Sdanielk1977 **
640890949c20Sdanielk1977 ** Otherwise return 0.
640990949c20Sdanielk1977 */
unixAccess(sqlite3_vfs * NotUsed,const char * zPath,int flags,int * pResOut)6410861f7456Sdanielk1977 static int unixAccess(
64116b9d6ddcSdrh   sqlite3_vfs *NotUsed,   /* The VFS containing this xAccess method */
64126b9d6ddcSdrh   const char *zPath,      /* Path of the file to examine */
64136b9d6ddcSdrh   int flags,              /* What do we want to learn about the zPath file? */
64146b9d6ddcSdrh   int *pResOut            /* Write result boolean here */
6415861f7456Sdanielk1977 ){
6416397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed);
6417861f7456Sdanielk1977   SimulateIOError( return SQLITE_IOERR_ACCESS; );
6418d260b5b2Sdrh   assert( pResOut!=0 );
6419b4b47411Sdanielk1977 
6420c398c65bSdrh   /* The spec says there are three possible values for flags.  But only
6421c398c65bSdrh   ** two of them are actually used */
6422c398c65bSdrh   assert( flags==SQLITE_ACCESS_EXISTS || flags==SQLITE_ACCESS_READWRITE );
6423d260b5b2Sdrh 
6424d260b5b2Sdrh   if( flags==SQLITE_ACCESS_EXISTS ){
642583acd423Sdan     struct stat buf;
642696e8eebfSdrh     *pResOut = 0==osStat(zPath, &buf) &&
642709bee574Sdrh                 (!S_ISREG(buf.st_mode) || buf.st_size>0);
64280933aad7Sdrh   }else{
6429c398c65bSdrh     *pResOut = osAccess(zPath, W_OK|R_OK)==0;
643083acd423Sdan   }
6431861f7456Sdanielk1977   return SQLITE_OK;
6432b4b47411Sdanielk1977 }
6433b4b47411Sdanielk1977 
/*
** State for a pathname that is being assembled one element at a time
** into a caller-supplied buffer.
*/
typedef struct DbPath {
  int rc;           /* Set to a non-zero error code once any step fails */
  int nSymlink;     /* Count of symbolic links resolved so far */
  char *zOut;       /* Buffer receiving the pathname */
  int nOut;         /* Size of the zOut[] buffer in bytes */
  int nUsed;        /* Number of bytes of zOut[] filled in so far */
} DbPath;

/* Forward reference */
static void appendAllPathElements(DbPath *pPath, const char *zPath);
6448e8346d0aSdrh 
6449e8346d0aSdrh /*
6450e8346d0aSdrh ** Append a single path element to the DbPath under construction
6451e8346d0aSdrh */
appendOnePathElement(DbPath * pPath,const char * zName,int nName)6452e8346d0aSdrh static void appendOnePathElement(
6453e8346d0aSdrh   DbPath *pPath,       /* Path under construction, to which to append zName */
6454e8346d0aSdrh   const char *zName,   /* Name to append to pPath.  Not zero-terminated */
6455e8346d0aSdrh   int nName            /* Number of significant bytes in zName */
6456e8346d0aSdrh ){
6457e8346d0aSdrh   assert( nName>0 );
6458e8346d0aSdrh   assert( zName!=0 );
6459e8346d0aSdrh   if( zName[0]=='.' ){
6460e8346d0aSdrh     if( nName==1 ) return;
6461e8346d0aSdrh     if( zName[1]=='.' && nName==2 ){
6462e8346d0aSdrh       if( pPath->nUsed<=1 ){
6463e8346d0aSdrh         pPath->rc = SQLITE_ERROR;
6464e8346d0aSdrh         return;
6465e8346d0aSdrh       }
6466e8346d0aSdrh       assert( pPath->zOut[0]=='/' );
6467e8346d0aSdrh       while( pPath->zOut[--pPath->nUsed]!='/' ){}
6468e8346d0aSdrh       return;
6469e8346d0aSdrh     }
6470e8346d0aSdrh   }
6471e8346d0aSdrh   if( pPath->nUsed + nName + 2 >= pPath->nOut ){
6472e8346d0aSdrh     pPath->rc = SQLITE_ERROR;
6473e8346d0aSdrh     return;
6474e8346d0aSdrh   }
6475e8346d0aSdrh   pPath->zOut[pPath->nUsed++] = '/';
6476e8346d0aSdrh   memcpy(&pPath->zOut[pPath->nUsed], zName, nName);
6477e8346d0aSdrh   pPath->nUsed += nName;
6478e8346d0aSdrh #if defined(HAVE_READLINK) && defined(HAVE_LSTAT)
6479e8346d0aSdrh   if( pPath->rc==SQLITE_OK ){
6480e8346d0aSdrh     const char *zIn;
6481e8346d0aSdrh     struct stat buf;
6482e8346d0aSdrh     pPath->zOut[pPath->nUsed] = 0;
6483e8346d0aSdrh     zIn = pPath->zOut;
6484e8346d0aSdrh     if( osLstat(zIn, &buf)!=0 ){
6485e8346d0aSdrh       if( errno!=ENOENT ){
6486e8346d0aSdrh         pPath->rc = unixLogError(SQLITE_CANTOPEN_BKPT, "lstat", zIn);
6487e8346d0aSdrh       }
6488e8346d0aSdrh     }else if( S_ISLNK(buf.st_mode) ){
6489e8346d0aSdrh       ssize_t got;
6490e8346d0aSdrh       char zLnk[SQLITE_MAX_PATHLEN+2];
6491e8346d0aSdrh       if( pPath->nSymlink++ > SQLITE_MAX_SYMLINK ){
6492e8346d0aSdrh         pPath->rc = SQLITE_CANTOPEN_BKPT;
6493e8346d0aSdrh         return;
6494e8346d0aSdrh       }
6495b302c065Sdrh       got = osReadlink(zIn, zLnk, sizeof(zLnk)-2);
6496b8b2d9c5Sdrh       if( got<=0 || got>=(ssize_t)sizeof(zLnk)-2 ){
6497e8346d0aSdrh         pPath->rc = unixLogError(SQLITE_CANTOPEN_BKPT, "readlink", zIn);
6498e8346d0aSdrh         return;
6499e8346d0aSdrh       }
6500e8346d0aSdrh       zLnk[got] = 0;
6501e8346d0aSdrh       if( zLnk[0]=='/' ){
6502e8346d0aSdrh         pPath->nUsed = 0;
6503e8346d0aSdrh       }else{
6504e8346d0aSdrh         pPath->nUsed -= nName + 1;
6505e8346d0aSdrh       }
6506e8346d0aSdrh       appendAllPathElements(pPath, zLnk);
6507e8346d0aSdrh     }
6508e8346d0aSdrh   }
6509e8346d0aSdrh #endif
65107f42dcd9Sdrh }
65117f42dcd9Sdrh 
65127f42dcd9Sdrh /*
6513e8346d0aSdrh ** Append all path elements in zPath to the DbPath under construction.
6514b4b47411Sdanielk1977 */
appendAllPathElements(DbPath * pPath,const char * zPath)6515e8346d0aSdrh static void appendAllPathElements(
6516e8346d0aSdrh   DbPath *pPath,       /* Path under construction, to which to append zName */
6517e8346d0aSdrh   const char *zPath    /* Path to append to pPath.  Is zero-terminated */
6518adfb9b05Sdanielk1977 ){
6519e8346d0aSdrh   int i = 0;
6520e8346d0aSdrh   int j = 0;
6521e8346d0aSdrh   do{
6522e8346d0aSdrh     while( zPath[i] && zPath[i]!='/' ){ i++; }
6523e8346d0aSdrh     if( i>j ){
6524e8346d0aSdrh       appendOnePathElement(pPath, &zPath[j], i-j);
6525b4b47411Sdanielk1977     }
6526e8346d0aSdrh     j = i+1;
6527e8346d0aSdrh   }while( zPath[i++] );
6528b4b47411Sdanielk1977 }
6529b4b47411Sdanielk1977 
6530e88ec187Sdan /*
6531e88ec187Sdan ** Turn a relative pathname into a full pathname. The relative path
6532e88ec187Sdan ** is stored as a nul-terminated string in the buffer pointed to by
6533e88ec187Sdan ** zPath.
6534e88ec187Sdan **
6535e88ec187Sdan ** zOut points to a buffer of at least sqlite3_vfs.mxPathname bytes
6536e88ec187Sdan ** (in this case, MAX_PATHNAME bytes). The full-path is written to
6537e88ec187Sdan ** this buffer before returning.
6538e88ec187Sdan */
unixFullPathname(sqlite3_vfs * pVfs,const char * zPath,int nOut,char * zOut)6539e88ec187Sdan static int unixFullPathname(
6540e88ec187Sdan   sqlite3_vfs *pVfs,            /* Pointer to vfs object */
6541e88ec187Sdan   const char *zPath,            /* Possibly relative input path */
6542e88ec187Sdan   int nOut,                     /* Size of output buffer in bytes */
6543e88ec187Sdan   char *zOut                    /* Output buffer */
6544e88ec187Sdan ){
6545e8346d0aSdrh   DbPath path;
6546b8b2d9c5Sdrh   UNUSED_PARAMETER(pVfs);
6547e8346d0aSdrh   path.rc = 0;
6548e8346d0aSdrh   path.nUsed = 0;
6549e8346d0aSdrh   path.nSymlink = 0;
6550e8346d0aSdrh   path.nOut = nOut;
6551e8346d0aSdrh   path.zOut = zOut;
6552e8346d0aSdrh   if( zPath[0]!='/' ){
6553e8346d0aSdrh     char zPwd[SQLITE_MAX_PATHLEN+2];
6554e8346d0aSdrh     if( osGetcwd(zPwd, sizeof(zPwd)-2)==0 ){
6555e8346d0aSdrh       return unixLogError(SQLITE_CANTOPEN_BKPT, "getcwd", zPath);
6556e88ec187Sdan     }
6557e8346d0aSdrh     appendAllPathElements(&path, zPwd);
6558e88ec187Sdan   }
6559e8346d0aSdrh   appendAllPathElements(&path, zPath);
6560e8346d0aSdrh   zOut[path.nUsed] = 0;
6561e8346d0aSdrh   if( path.rc || path.nUsed<2 ) return SQLITE_CANTOPEN_BKPT;
6562e8346d0aSdrh   if( path.nSymlink ) return SQLITE_OK_SYMLINK;
6563e8346d0aSdrh   return SQLITE_OK;
6564caf6b150Sdan }
6565caf6b150Sdan 
6566761df87eSdrh #ifndef SQLITE_OMIT_LOAD_EXTENSION
6567761df87eSdrh /*
6568761df87eSdrh ** Interfaces for opening a shared library, finding entry points
6569761df87eSdrh ** within the shared library, and closing the shared library.
6570761df87eSdrh */
6571761df87eSdrh #include <dlfcn.h>
/*
** Open the shared library zFilename with dlopen(), using the flags
** RTLD_NOW|RTLD_GLOBAL, and return whatever dlopen() returns.
*/
static void *unixDlOpen(sqlite3_vfs *NotUsed, const char *zFilename){
  void *pHandle;
  UNUSED_PARAMETER(NotUsed);
  pHandle = dlopen(zFilename, RTLD_NOW | RTLD_GLOBAL);
  return pHandle;
}
657695c8a54cSdanielk1977 
657795c8a54cSdanielk1977 /*
657895c8a54cSdanielk1977 ** SQLite calls this function immediately after a call to unixDlSym() or
657995c8a54cSdanielk1977 ** unixDlOpen() fails (returns a null pointer). If a more detailed error
658095c8a54cSdanielk1977 ** message is available, it is written to zBufOut. If no error message
658195c8a54cSdanielk1977 ** is available, zBufOut is left unmodified and SQLite uses a default
658295c8a54cSdanielk1977 ** error message.
658395c8a54cSdanielk1977 */
unixDlError(sqlite3_vfs * NotUsed,int nBuf,char * zBufOut)6584397d65f6Sdanielk1977 static void unixDlError(sqlite3_vfs *NotUsed, int nBuf, char *zBufOut){
65853239053eSdan   const char *zErr;
6586397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed);
65876c7d5c5bSdrh   unixEnterMutex();
6588b4b47411Sdanielk1977   zErr = dlerror();
6589b4b47411Sdanielk1977   if( zErr ){
6590153c62c4Sdrh     sqlite3_snprintf(nBuf, zBufOut, "%s", zErr);
6591b4b47411Sdanielk1977   }
65926c7d5c5bSdrh   unixLeaveMutex();
6593b4b47411Sdanielk1977 }
/*
** Look up the symbol zSym in the shared library handle p and return it
** as a pointer to a function.
*/
static void (*unixDlSym(sqlite3_vfs *NotUsed, void *p, const char*zSym))(void){
  /*
  ** GCC with -pedantic-errors complains that C90 does not permit casting
  ** a void* into a function pointer, yet dlsym() returns a void* that is
  ** really a function pointer.
  **
  ** The work-around is to never convert the result at all.  Instead, the
  ** address of dlsym() itself is cast to a pointer to a function that
  ** takes (void*, const char*) and returns a function pointer, and the
  ** call is made through that pointer.
  **
  ** This is unlikely to work on a system where function pointers really
  ** cannot be stored in a void*, but dlsym() would not work on such a
  ** system either, so nothing is lost.
  */
  typedef void (*DlSymResult)(void);
  typedef DlSymResult (*DlSymProc)(void*,const char*);
  DlSymProc xLookup;
  UNUSED_PARAMETER(NotUsed);
  xLookup = (DlSymProc)dlsym;
  return xLookup(p, zSym);
}
/*
** Close the shared library handle pHandle with dlclose().  The result of
** dlclose() is ignored.
*/
static void unixDlClose(
  sqlite3_vfs *NotUsed,
  void *pHandle
){
  UNUSED_PARAMETER(NotUsed);
  dlclose(pHandle);
}
6621b4b47411Sdanielk1977 #else /* if SQLITE_OMIT_LOAD_EXTENSION is defined: */
6622b4b47411Sdanielk1977   #define unixDlOpen  0
6623b4b47411Sdanielk1977   #define unixDlError 0
6624b4b47411Sdanielk1977   #define unixDlSym   0
6625b4b47411Sdanielk1977   #define unixDlClose 0
6626b4b47411Sdanielk1977 #endif
6627b4b47411Sdanielk1977 
6628b4b47411Sdanielk1977 /*
662990949c20Sdanielk1977 ** Write nBuf bytes of random data to the supplied buffer zBuf.
6630bbd42a6dSdrh */
unixRandomness(sqlite3_vfs * NotUsed,int nBuf,char * zBuf)6631397d65f6Sdanielk1977 static int unixRandomness(sqlite3_vfs *NotUsed, int nBuf, char *zBuf){
6632397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed);
663300e13613Sdanielk1977   assert((size_t)nBuf>=(sizeof(time_t)+sizeof(int)));
663490949c20Sdanielk1977 
6635bbd42a6dSdrh   /* We have to initialize zBuf to prevent valgrind from reporting
6636bbd42a6dSdrh   ** errors.  The reports issued by valgrind are incorrect - we would
6637bbd42a6dSdrh   ** prefer that the randomness be increased by making use of the
6638bbd42a6dSdrh   ** uninitialized space in zBuf - but valgrind errors tend to worry
6639bbd42a6dSdrh   ** some users.  Rather than argue, it seems easier just to initialize
6640bbd42a6dSdrh   ** the whole array and silence valgrind, even if that means less randomness
6641bbd42a6dSdrh   ** in the random seed.
6642bbd42a6dSdrh   **
6643bbd42a6dSdrh   ** When testing, initializing zBuf[] to zero is all we do.  That means
6644f1a221e6Sdrh   ** that we always use the same random number sequence.  This makes the
6645bbd42a6dSdrh   ** tests repeatable.
6646bbd42a6dSdrh   */
6647b4b47411Sdanielk1977   memset(zBuf, 0, nBuf);
66485ac93652Sdrh   randomnessPid = osGetpid(0);
66496a412b8bSdrh #if !defined(SQLITE_TEST) && !defined(SQLITE_OMIT_RANDOMNESS)
6650bbd42a6dSdrh   {
6651b00d8621Sdrh     int fd, got;
6652ad4f1e54Sdrh     fd = robust_open("/dev/urandom", O_RDONLY, 0);
6653842b8641Sdrh     if( fd<0 ){
66540739723dSdrh       time_t t;
66550739723dSdrh       time(&t);
665690949c20Sdanielk1977       memcpy(zBuf, &t, sizeof(t));
6657b00d8621Sdrh       memcpy(&zBuf[sizeof(t)], &randomnessPid, sizeof(randomnessPid));
6658b00d8621Sdrh       assert( sizeof(t)+sizeof(randomnessPid)<=(size_t)nBuf );
6659b00d8621Sdrh       nBuf = sizeof(t) + sizeof(randomnessPid);
6660842b8641Sdrh     }else{
6661c18b4046Sdrh       do{ got = osRead(fd, zBuf, nBuf); }while( got<0 && errno==EINTR );
66620e9365ceSdrh       robust_close(0, fd, __LINE__);
6663842b8641Sdrh     }
6664bbd42a6dSdrh   }
6665bbd42a6dSdrh #endif
666672cbd078Sdrh   return nBuf;
6667bbd42a6dSdrh }
6668bbd42a6dSdrh 
6669b4b47411Sdanielk1977 
6670bbd42a6dSdrh /*
6671bbd42a6dSdrh ** Sleep for a little while.  Return the amount of time slept.
6672b4b47411Sdanielk1977 ** The argument is the number of microseconds we want to sleep.
66734a50aac5Sdrh ** The return value is the number of microseconds of sleep actually
66744a50aac5Sdrh ** requested from the underlying operating system, a number which
66754a50aac5Sdrh ** might be greater than or equal to the argument, but not less
66764a50aac5Sdrh ** than the argument.
6677bbd42a6dSdrh */
unixSleep(sqlite3_vfs * NotUsed,int microseconds)6678397d65f6Sdanielk1977 static int unixSleep(sqlite3_vfs *NotUsed, int microseconds){
66796c7d5c5bSdrh #if OS_VXWORKS
668097185489Schw   struct timespec sp;
668197185489Schw 
668297185489Schw   sp.tv_sec = microseconds / 1000000;
668397185489Schw   sp.tv_nsec = (microseconds % 1000000) * 1000;
668497185489Schw   nanosleep(&sp, NULL);
6685d43fe20bSdrh   UNUSED_PARAMETER(NotUsed);
6686397d65f6Sdanielk1977   return microseconds;
6687397d65f6Sdanielk1977 #elif defined(HAVE_USLEEP) && HAVE_USLEEP
6688ddcfe921Sdrh   if( microseconds>=1000000 ) sleep(microseconds/1000000);
6689ddcfe921Sdrh   if( microseconds%1000000 ) usleep(microseconds%1000000);
6690d43fe20bSdrh   UNUSED_PARAMETER(NotUsed);
6691b4b47411Sdanielk1977   return microseconds;
6692bbd42a6dSdrh #else
6693b4b47411Sdanielk1977   int seconds = (microseconds+999999)/1000000;
6694b4b47411Sdanielk1977   sleep(seconds);
6695d43fe20bSdrh   UNUSED_PARAMETER(NotUsed);
66964a50aac5Sdrh   return seconds*1000000;
6697a3fad6f5Sdrh #endif
669888f474a9Sdrh }
669988f474a9Sdrh 
/*
** Test-only override for the system clock.  When this variable is set to
** a non-zero value it is taken to be a number of seconds since 1970, and
** unixCurrentTimeInt64() reports that moment instead of the real time.
** The variable only exists in builds that define SQLITE_TEST.
*/
#ifdef SQLITE_TEST
int sqlite3_current_time = 0;  /* Fake system time in seconds since 1970. */
#endif
6708bbd42a6dSdrh 
6709bbd42a6dSdrh /*
6710b7e8ea20Sdrh ** Find the current time (in Universal Coordinated Time).  Write into *piNow
6711b7e8ea20Sdrh ** the current time and date as a Julian Day number times 86_400_000.  In
6712b7e8ea20Sdrh ** other words, write into *piNow the number of milliseconds since the Julian
6713b7e8ea20Sdrh ** epoch of noon in Greenwich on November 24, 4714 B.C according to the
6714b7e8ea20Sdrh ** proleptic Gregorian calendar.
6715b7e8ea20Sdrh **
67163170225fSdrh ** On success, return SQLITE_OK.  Return SQLITE_ERROR if the time and date
67173170225fSdrh ** cannot be found.
6718b7e8ea20Sdrh */
unixCurrentTimeInt64(sqlite3_vfs * NotUsed,sqlite3_int64 * piNow)6719b7e8ea20Sdrh static int unixCurrentTimeInt64(sqlite3_vfs *NotUsed, sqlite3_int64 *piNow){
6720b7e8ea20Sdrh   static const sqlite3_int64 unixEpoch = 24405875*(sqlite3_int64)8640000;
67213170225fSdrh   int rc = SQLITE_OK;
6722b7e8ea20Sdrh #if defined(NO_GETTOD)
6723b7e8ea20Sdrh   time_t t;
6724b7e8ea20Sdrh   time(&t);
672515eac4e4Sdan   *piNow = ((sqlite3_int64)t)*1000 + unixEpoch;
6726b7e8ea20Sdrh #elif OS_VXWORKS
6727b7e8ea20Sdrh   struct timespec sNow;
6728b7e8ea20Sdrh   clock_gettime(CLOCK_REALTIME, &sNow);
6729b7e8ea20Sdrh   *piNow = unixEpoch + 1000*(sqlite3_int64)sNow.tv_sec + sNow.tv_nsec/1000000;
6730b7e8ea20Sdrh #else
6731b7e8ea20Sdrh   struct timeval sNow;
6732970942e4Sdrh   (void)gettimeofday(&sNow, 0);  /* Cannot fail given valid arguments */
6733b7e8ea20Sdrh   *piNow = unixEpoch + 1000*(sqlite3_int64)sNow.tv_sec + sNow.tv_usec/1000;
6734b7e8ea20Sdrh #endif
6735b7e8ea20Sdrh 
6736b7e8ea20Sdrh #ifdef SQLITE_TEST
6737b7e8ea20Sdrh   if( sqlite3_current_time ){
6738b7e8ea20Sdrh     *piNow = 1000*(sqlite3_int64)sqlite3_current_time + unixEpoch;
6739b7e8ea20Sdrh   }
6740b7e8ea20Sdrh #endif
6741b7e8ea20Sdrh   UNUSED_PARAMETER(NotUsed);
67423170225fSdrh   return rc;
6743b7e8ea20Sdrh }
6744b7e8ea20Sdrh 
6745c3dfa5ebSdrh #ifndef SQLITE_OMIT_DEPRECATED
6746b7e8ea20Sdrh /*
6747bbd42a6dSdrh ** Find the current time (in Universal Coordinated Time).  Write the
6748bbd42a6dSdrh ** current time and date as a Julian Day number into *prNow and
6749bbd42a6dSdrh ** return 0.  Return 1 if the time and date cannot be found.
6750bbd42a6dSdrh */
unixCurrentTime(sqlite3_vfs * NotUsed,double * prNow)6751397d65f6Sdanielk1977 static int unixCurrentTime(sqlite3_vfs *NotUsed, double *prNow){
6752b87a6663Sdrh   sqlite3_int64 i = 0;
67533170225fSdrh   int rc;
6754ff82894fSdrh   UNUSED_PARAMETER(NotUsed);
67553170225fSdrh   rc = unixCurrentTimeInt64(0, &i);
67560dcb0a7dSdrh   *prNow = i/86400000.0;
67573170225fSdrh   return rc;
6758bbd42a6dSdrh }
67595337dacbSdrh #else
67605337dacbSdrh # define unixCurrentTime 0
67615337dacbSdrh #endif
6762bbd42a6dSdrh 
67636b9d6ddcSdrh /*
67641b9f2141Sdrh ** The xGetLastError() method is designed to return a better
67651b9f2141Sdrh ** low-level error message when operating-system problems come up
67661b9f2141Sdrh ** during SQLite operation.  Only the integer return code is currently
67671b9f2141Sdrh ** used.
67686b9d6ddcSdrh */
unixGetLastError(sqlite3_vfs * NotUsed,int NotUsed2,char * NotUsed3)6769397d65f6Sdanielk1977 static int unixGetLastError(sqlite3_vfs *NotUsed, int NotUsed2, char *NotUsed3){
6770397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed);
6771397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed2);
6772397d65f6Sdanielk1977   UNUSED_PARAMETER(NotUsed3);
67731b9f2141Sdrh   return errno;
6774bcb97fe9Sdanielk1977 }
6775bcb97fe9Sdanielk1977 
6776f2424c52Sdrh 
6777f2424c52Sdrh /*
6778734c9864Sdrh ************************ End of sqlite3_vfs methods ***************************
6779734c9864Sdrh ******************************************************************************/
6780734c9864Sdrh 
6781715ff30eSdrh /******************************************************************************
6782715ff30eSdrh ************************** Begin Proxy Locking ********************************
6783715ff30eSdrh **
6784715ff30eSdrh ** Proxy locking is a "uber-locking-method" in this sense:  It uses the
6785715ff30eSdrh ** other locking methods on secondary lock files.  Proxy locking is a
6786715ff30eSdrh ** meta-layer over top of the primitive locking implemented above.  For
** this reason, the division that implements proxy locking is deferred
6788715ff30eSdrh ** until late in the file (here) after all of the other I/O methods have
6789715ff30eSdrh ** been defined - so that the primitive locking methods are available
6790715ff30eSdrh ** as services to help with the implementation of proxy locking.
6791715ff30eSdrh **
6792715ff30eSdrh ****
6793715ff30eSdrh **
6794715ff30eSdrh ** The default locking schemes in SQLite use byte-range locks on the
6795715ff30eSdrh ** database file to coordinate safe, concurrent access by multiple readers
6796715ff30eSdrh ** and writers [http://sqlite.org/lockingv3.html].  The five file locking
6797715ff30eSdrh ** states (UNLOCKED, PENDING, SHARED, RESERVED, EXCLUSIVE) are implemented
6798715ff30eSdrh ** as POSIX read & write locks over fixed set of locations (via fsctl),
6799715ff30eSdrh ** on AFP and SMB only exclusive byte-range locks are available via fsctl
6800715ff30eSdrh ** with _IOWR('z', 23, struct ByteRangeLockPB2) to track the same 5 states.
6801715ff30eSdrh ** To simulate a F_RDLCK on the shared range, on AFP a randomly selected
6802715ff30eSdrh ** address in the shared range is taken for a SHARED lock, the entire
6803715ff30eSdrh ** shared range is taken for an EXCLUSIVE lock):
6804715ff30eSdrh **
6805715ff30eSdrh **      PENDING_BYTE        0x40000000
6806715ff30eSdrh **      RESERVED_BYTE       0x40000001
6807715ff30eSdrh **      SHARED_RANGE        0x40000002 -> 0x40000200
6808715ff30eSdrh **
6809715ff30eSdrh ** This works well on the local file system, but shows a nearly 100x
6810715ff30eSdrh ** slowdown in read performance on AFP because the AFP client disables
6811715ff30eSdrh ** the read cache when byte-range locks are present.  Enabling the read
6812715ff30eSdrh ** cache exposes a cache coherency problem that is present on all OS X
6813715ff30eSdrh ** supported network file systems.  NFS and AFP both observe the
6814715ff30eSdrh ** close-to-open semantics for ensuring cache coherency
6815715ff30eSdrh ** [http://nfs.sourceforge.net/#faq_a8], which does not effectively
6816715ff30eSdrh ** address the requirements for concurrent database access by multiple
6817715ff30eSdrh ** readers and writers
6818715ff30eSdrh ** [http://www.nabble.com/SQLite-on-NFS-cache-coherency-td15655701.html].
6819715ff30eSdrh **
6820715ff30eSdrh ** To address the performance and cache coherency issues, proxy file locking
6821715ff30eSdrh ** changes the way database access is controlled by limiting access to a
6822715ff30eSdrh ** single host at a time and moving file locks off of the database file
6823715ff30eSdrh ** and onto a proxy file on the local file system.
6824715ff30eSdrh **
6825715ff30eSdrh **
6826715ff30eSdrh ** Using proxy locks
6827715ff30eSdrh ** -----------------
6828715ff30eSdrh **
6829715ff30eSdrh ** C APIs
6830715ff30eSdrh **
68314bf66fd6Sdrh **  sqlite3_file_control(db, dbname, SQLITE_FCNTL_SET_LOCKPROXYFILE,
6832715ff30eSdrh **                       <proxy_path> | ":auto:");
68334bf66fd6Sdrh **  sqlite3_file_control(db, dbname, SQLITE_FCNTL_GET_LOCKPROXYFILE,
68344bf66fd6Sdrh **                       &<proxy_path>);
6835715ff30eSdrh **
6836715ff30eSdrh **
6837715ff30eSdrh ** SQL pragmas
6838715ff30eSdrh **
6839715ff30eSdrh **  PRAGMA [database.]lock_proxy_file=<proxy_path> | :auto:
6840715ff30eSdrh **  PRAGMA [database.]lock_proxy_file
6841715ff30eSdrh **
6842715ff30eSdrh ** Specifying ":auto:" means that if there is a conch file with a matching
6843715ff30eSdrh ** host ID in it, the proxy path in the conch file will be used, otherwise
6844715ff30eSdrh ** a proxy path based on the user's temp dir
6845715ff30eSdrh ** (via confstr(_CS_DARWIN_USER_TEMP_DIR,...)) will be used and the
6846715ff30eSdrh ** actual proxy file name is generated from the name and path of the
6847715ff30eSdrh ** database file.  For example:
6848715ff30eSdrh **
6849715ff30eSdrh **       For database path "/Users/me/foo.db"
6850715ff30eSdrh **       The lock path will be "<tmpdir>/sqliteplocks/_Users_me_foo.db:auto:")
6851715ff30eSdrh **
6852715ff30eSdrh ** Once a lock proxy is configured for a database connection, it can not
6853715ff30eSdrh ** be removed, however it may be switched to a different proxy path via
6854715ff30eSdrh ** the above APIs (assuming the conch file is not being held by another
6855715ff30eSdrh ** connection or process).
6856715ff30eSdrh **
6857715ff30eSdrh **
6858715ff30eSdrh ** How proxy locking works
6859715ff30eSdrh ** -----------------------
6860715ff30eSdrh **
6861715ff30eSdrh ** Proxy file locking relies primarily on two new supporting files:
6862715ff30eSdrh **
6863715ff30eSdrh **   *  conch file to limit access to the database file to a single host
6864715ff30eSdrh **      at a time
6865715ff30eSdrh **
6866715ff30eSdrh **   *  proxy file to act as a proxy for the advisory locks normally
6867715ff30eSdrh **      taken on the database
6868715ff30eSdrh **
6869715ff30eSdrh ** The conch file - to use a proxy file, sqlite must first "hold the conch"
6870715ff30eSdrh ** by taking an sqlite-style shared lock on the conch file, reading the
6871715ff30eSdrh ** contents and comparing the host's unique host ID (see below) and lock
6872715ff30eSdrh ** proxy path against the values stored in the conch.  The conch file is
6873715ff30eSdrh ** stored in the same directory as the database file and the file name
6874715ff30eSdrh ** is patterned after the database file name as ".<databasename>-conch".
687560ec914cSpeter.d.reid ** If the conch file does not exist, or its contents do not match the
6876715ff30eSdrh ** host ID and/or proxy path, then the lock is escalated to an exclusive
6877715ff30eSdrh ** lock and the conch file contents is updated with the host ID and proxy
6878715ff30eSdrh ** path and the lock is downgraded to a shared lock again.  If the conch
6879715ff30eSdrh ** is held by another process (with a shared lock), the exclusive lock
6880715ff30eSdrh ** will fail and SQLITE_BUSY is returned.
6881715ff30eSdrh **
6882715ff30eSdrh ** The proxy file - a single-byte file used for all advisory file locks
6883715ff30eSdrh ** normally taken on the database file.   This allows for safe sharing
6884715ff30eSdrh ** of the database file for multiple readers and writers on the same
6885715ff30eSdrh ** host (the conch ensures that they all use the same local lock file).
6886715ff30eSdrh **
6887715ff30eSdrh ** Requesting the lock proxy does not immediately take the conch, it is
6888715ff30eSdrh ** only taken when the first request to lock database file is made.
6889715ff30eSdrh ** This matches the semantics of the traditional locking behavior, where
6890715ff30eSdrh ** opening a connection to a database file does not take a lock on it.
6891715ff30eSdrh ** The shared lock and an open file descriptor are maintained until
6892715ff30eSdrh ** the connection to the database is closed.
6893715ff30eSdrh **
6894715ff30eSdrh ** The proxy file and the lock file are never deleted so they only need
6895715ff30eSdrh ** to be created the first time they are used.
6896715ff30eSdrh **
6897715ff30eSdrh ** Configuration options
6898715ff30eSdrh ** ---------------------
6899715ff30eSdrh **
6900715ff30eSdrh **  SQLITE_PREFER_PROXY_LOCKING
6901715ff30eSdrh **
6902715ff30eSdrh **       Database files accessed on non-local file systems are
6903715ff30eSdrh **       automatically configured for proxy locking, lock files are
6904715ff30eSdrh **       named automatically using the same logic as
6905715ff30eSdrh **       PRAGMA lock_proxy_file=":auto:"
6906715ff30eSdrh **
6907715ff30eSdrh **  SQLITE_PROXY_DEBUG
6908715ff30eSdrh **
6909715ff30eSdrh **       Enables the logging of error messages during host id file
6910715ff30eSdrh **       retrieval and creation
6911715ff30eSdrh **
6912715ff30eSdrh **  LOCKPROXYDIR
6913715ff30eSdrh **
6914715ff30eSdrh **       Overrides the default directory used for lock proxy files that
6915715ff30eSdrh **       are named automatically via the ":auto:" setting
6916715ff30eSdrh **
6917715ff30eSdrh **  SQLITE_DEFAULT_PROXYDIR_PERMISSIONS
6918715ff30eSdrh **
6919715ff30eSdrh **       Permissions to use when creating a directory for storing the
6920715ff30eSdrh **       lock proxy files, only used when LOCKPROXYDIR is not set.
6921715ff30eSdrh **
6922715ff30eSdrh **
** When compiled with SQLITE_PREFER_PROXY_LOCKING, setting the environment
** variable SQLITE_FORCE_PROXY_LOCKING to 1 will force proxy locking to be
** used for every database file opened, and 0 will force automatic proxy
** locking to be disabled for all database files.  Explicitly requesting a
** proxy with PRAGMA lock_proxy_file, or with the sqlite3_file_control() API
** and SQLITE_FCNTL_SET_LOCKPROXYFILE, is not affected by
** SQLITE_FORCE_PROXY_LOCKING.
6929715ff30eSdrh */
6930715ff30eSdrh 
6931715ff30eSdrh /*
6932715ff30eSdrh ** Proxy locking is only available on MacOSX
6933715ff30eSdrh */
6934d2cb50b7Sdrh #if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
6935715ff30eSdrh 
6936715ff30eSdrh /*
6937715ff30eSdrh ** The proxyLockingContext has the path and file structures for the remote
6938715ff30eSdrh ** and local proxy files in it
6939715ff30eSdrh */
6940715ff30eSdrh typedef struct proxyLockingContext proxyLockingContext;
6941715ff30eSdrh struct proxyLockingContext {
6942715ff30eSdrh   unixFile *conchFile;         /* Open conch file */
6943715ff30eSdrh   char *conchFilePath;         /* Name of the conch file */
6944715ff30eSdrh   unixFile *lockProxy;         /* Open proxy lock file */
6945715ff30eSdrh   char *lockProxyPath;         /* Name of the proxy lock file */
6946715ff30eSdrh   char *dbPath;                /* Name of the open file */
69477ed97b9dSdrh   int conchHeld;               /* 1 if the conch is held, -1 if lockless */
69484bf66fd6Sdrh   int nFails;                  /* Number of conch taking failures */
6949715ff30eSdrh   void *oldLockingContext;     /* Original lockingcontext to restore on close */
6950715ff30eSdrh   sqlite3_io_methods const *pOldMethod;     /* Original I/O methods for close */
6951715ff30eSdrh };
6952715ff30eSdrh 
69537ed97b9dSdrh /*
69547ed97b9dSdrh ** The proxy lock file path for the database at dbPath is written into lPath,
69557ed97b9dSdrh ** which must point to valid, writable memory large enough for a maxLen length
69567ed97b9dSdrh ** file path.
6957715ff30eSdrh */
proxyGetLockPath(const char * dbPath,char * lPath,size_t maxLen)6958715ff30eSdrh static int proxyGetLockPath(const char *dbPath, char *lPath, size_t maxLen){
6959715ff30eSdrh   int len;
6960715ff30eSdrh   int dbLen;
6961715ff30eSdrh   int i;
6962715ff30eSdrh 
6963715ff30eSdrh #ifdef LOCKPROXYDIR
6964715ff30eSdrh   len = strlcpy(lPath, LOCKPROXYDIR, maxLen);
6965715ff30eSdrh #else
6966715ff30eSdrh # ifdef _CS_DARWIN_USER_TEMP_DIR
6967715ff30eSdrh   {
69687ed97b9dSdrh     if( !confstr(_CS_DARWIN_USER_TEMP_DIR, lPath, maxLen) ){
6969308c2a5cSdrh       OSTRACE(("GETLOCKPATH  failed %s errno=%d pid=%d\n",
69705ac93652Sdrh                lPath, errno, osGetpid(0)));
69717ed97b9dSdrh       return SQLITE_IOERR_LOCK;
69727ed97b9dSdrh     }
6973715ff30eSdrh     len = strlcat(lPath, "sqliteplocks", maxLen);
6974715ff30eSdrh   }
6975715ff30eSdrh # else
6976715ff30eSdrh   len = strlcpy(lPath, "/tmp/", maxLen);
6977715ff30eSdrh # endif
6978715ff30eSdrh #endif
6979715ff30eSdrh 
6980715ff30eSdrh   if( lPath[len-1]!='/' ){
6981715ff30eSdrh     len = strlcat(lPath, "/", maxLen);
6982715ff30eSdrh   }
6983715ff30eSdrh 
6984715ff30eSdrh   /* transform the db path to a unique cache name */
6985ea678832Sdrh   dbLen = (int)strlen(dbPath);
69860ab216a6Sdrh   for( i=0; i<dbLen && (i+len+7)<(int)maxLen; i++){
6987715ff30eSdrh     char c = dbPath[i];
6988715ff30eSdrh     lPath[i+len] = (c=='/')?'_':c;
6989715ff30eSdrh   }
6990715ff30eSdrh   lPath[i+len]='\0';
6991715ff30eSdrh   strlcat(lPath, ":auto:", maxLen);
69925ac93652Sdrh   OSTRACE(("GETLOCKPATH  proxy lock path=%s pid=%d\n", lPath, osGetpid(0)));
6993715ff30eSdrh   return SQLITE_OK;
6994715ff30eSdrh }
6995715ff30eSdrh 
6996715ff30eSdrh /*
69977ed97b9dSdrh  ** Creates the lock file and any missing directories in lockPath
69987ed97b9dSdrh  */
proxyCreateLockPath(const char * lockPath)69997ed97b9dSdrh static int proxyCreateLockPath(const char *lockPath){
70007ed97b9dSdrh   int i, len;
70017ed97b9dSdrh   char buf[MAXPATHLEN];
70027ed97b9dSdrh   int start = 0;
70037ed97b9dSdrh 
70047ed97b9dSdrh   assert(lockPath!=NULL);
70057ed97b9dSdrh   /* try to create all the intermediate directories */
70067ed97b9dSdrh   len = (int)strlen(lockPath);
70077ed97b9dSdrh   buf[0] = lockPath[0];
70087ed97b9dSdrh   for( i=1; i<len; i++ ){
70097ed97b9dSdrh     if( lockPath[i] == '/' && (i - start > 0) ){
70107ed97b9dSdrh       /* only mkdir if leaf dir != "." or "/" or ".." */
70117ed97b9dSdrh       if( i-start>2 || (i-start==1 && buf[start] != '.' && buf[start] != '/')
70127ed97b9dSdrh          || (i-start==2 && buf[start] != '.' && buf[start+1] != '.') ){
70137ed97b9dSdrh         buf[i]='\0';
70149ef6bc42Sdrh         if( osMkdir(buf, SQLITE_DEFAULT_PROXYDIR_PERMISSIONS) ){
70157ed97b9dSdrh           int err=errno;
70167ed97b9dSdrh           if( err!=EEXIST ) {
7017308c2a5cSdrh             OSTRACE(("CREATELOCKPATH  FAILED creating %s, "
70187ed97b9dSdrh                      "'%s' proxy lock path=%s pid=%d\n",
70195ac93652Sdrh                      buf, strerror(err), lockPath, osGetpid(0)));
70207ed97b9dSdrh             return err;
70217ed97b9dSdrh           }
70227ed97b9dSdrh         }
70237ed97b9dSdrh       }
70247ed97b9dSdrh       start=i+1;
70257ed97b9dSdrh     }
70267ed97b9dSdrh     buf[i] = lockPath[i];
70277ed97b9dSdrh   }
70285ac93652Sdrh   OSTRACE(("CREATELOCKPATH  proxy lock path=%s pid=%d\n",lockPath,osGetpid(0)));
70297ed97b9dSdrh   return 0;
70307ed97b9dSdrh }
70317ed97b9dSdrh 
70327ed97b9dSdrh /*
7033715ff30eSdrh ** Create a new VFS file descriptor (stored in memory obtained from
7034715ff30eSdrh ** sqlite3_malloc) and open the file named "path" in the file descriptor.
7035715ff30eSdrh **
7036715ff30eSdrh ** The caller is responsible not only for closing the file descriptor
7037715ff30eSdrh ** but also for freeing the memory associated with the file descriptor.
7038715ff30eSdrh */
proxyCreateUnixFile(const char * path,unixFile ** ppFile,int islockfile)70397ed97b9dSdrh static int proxyCreateUnixFile(
70407ed97b9dSdrh     const char *path,        /* path for the new unixFile */
70417ed97b9dSdrh     unixFile **ppFile,       /* unixFile created and returned by ref */
70427ed97b9dSdrh     int islockfile           /* if non zero missing dirs will be created */
70437ed97b9dSdrh ) {
70447ed97b9dSdrh   int fd = -1;
7045715ff30eSdrh   unixFile *pNew;
7046715ff30eSdrh   int rc = SQLITE_OK;
7047c398c65bSdrh   int openFlags = O_RDWR | O_CREAT | O_NOFOLLOW;
7048715ff30eSdrh   sqlite3_vfs dummyVfs;
70497ed97b9dSdrh   int terrno = 0;
70507ed97b9dSdrh   UnixUnusedFd *pUnused = NULL;
7051715ff30eSdrh 
70527ed97b9dSdrh   /* 1. first try to open/create the file
70537ed97b9dSdrh   ** 2. if that fails, and this is a lock file (not-conch), try creating
70547ed97b9dSdrh   ** the parent directories and then try again.
70557ed97b9dSdrh   ** 3. if that fails, try to open the file read-only
70567ed97b9dSdrh   ** otherwise return BUSY (if lock file) or CANTOPEN for the conch file
70577ed97b9dSdrh   */
70587ed97b9dSdrh   pUnused = findReusableFd(path, openFlags);
70597ed97b9dSdrh   if( pUnused ){
70607ed97b9dSdrh     fd = pUnused->fd;
70617ed97b9dSdrh   }else{
7062f3cdcdccSdrh     pUnused = sqlite3_malloc64(sizeof(*pUnused));
70637ed97b9dSdrh     if( !pUnused ){
7064fad3039cSmistachkin       return SQLITE_NOMEM_BKPT;
7065715ff30eSdrh     }
70667ed97b9dSdrh   }
70677ed97b9dSdrh   if( fd<0 ){
70688c815d14Sdrh     fd = robust_open(path, openFlags, 0);
70697ed97b9dSdrh     terrno = errno;
70707ed97b9dSdrh     if( fd<0 && errno==ENOENT && islockfile ){
70717ed97b9dSdrh       if( proxyCreateLockPath(path) == SQLITE_OK ){
70728c815d14Sdrh         fd = robust_open(path, openFlags, 0);
70737ed97b9dSdrh       }
70747ed97b9dSdrh     }
70757ed97b9dSdrh   }
70767ed97b9dSdrh   if( fd<0 ){
7077c398c65bSdrh     openFlags = O_RDONLY | O_NOFOLLOW;
70788c815d14Sdrh     fd = robust_open(path, openFlags, 0);
70797ed97b9dSdrh     terrno = errno;
70807ed97b9dSdrh   }
70817ed97b9dSdrh   if( fd<0 ){
70827ed97b9dSdrh     if( islockfile ){
70837ed97b9dSdrh       return SQLITE_BUSY;
70847ed97b9dSdrh     }
70857ed97b9dSdrh     switch (terrno) {
70867ed97b9dSdrh       case EACCES:
70877ed97b9dSdrh         return SQLITE_PERM;
70887ed97b9dSdrh       case EIO:
70897ed97b9dSdrh         return SQLITE_IOERR_LOCK; /* even though it is the conch */
70907ed97b9dSdrh       default:
70919978c97eSdrh         return SQLITE_CANTOPEN_BKPT;
70927ed97b9dSdrh     }
70937ed97b9dSdrh   }
70947ed97b9dSdrh 
7095f3cdcdccSdrh   pNew = (unixFile *)sqlite3_malloc64(sizeof(*pNew));
70967ed97b9dSdrh   if( pNew==NULL ){
7097fad3039cSmistachkin     rc = SQLITE_NOMEM_BKPT;
70987ed97b9dSdrh     goto end_create_proxy;
70997ed97b9dSdrh   }
7100715ff30eSdrh   memset(pNew, 0, sizeof(unixFile));
71017ed97b9dSdrh   pNew->openFlags = openFlags;
7102211fb084Sdan   memset(&dummyVfs, 0, sizeof(dummyVfs));
71031875f7a3Sdrh   dummyVfs.pAppData = (void*)&autolockIoFinder;
7104211fb084Sdan   dummyVfs.zName = "dummy";
71057ed97b9dSdrh   pUnused->fd = fd;
71067ed97b9dSdrh   pUnused->flags = openFlags;
7107c68886bbSdrh   pNew->pPreallocatedUnused = pUnused;
710815edd587Sdan 
7109c02a43afSdrh   rc = fillInUnixFile(&dummyVfs, fd, (sqlite3_file*)pNew, path, 0);
71107ed97b9dSdrh   if( rc==SQLITE_OK ){
711115edd587Sdan     *ppFile = pNew;
71127ed97b9dSdrh     return SQLITE_OK;
71137ed97b9dSdrh   }
71147ed97b9dSdrh end_create_proxy:
71150e9365ceSdrh   robust_close(pNew, fd, __LINE__);
71167ed97b9dSdrh   sqlite3_free(pNew);
71177ed97b9dSdrh   sqlite3_free(pUnused);
7118715ff30eSdrh   return rc;
7119715ff30eSdrh }
7120715ff30eSdrh 
71217ed97b9dSdrh #ifdef SQLITE_TEST
71227ed97b9dSdrh /* simulate multiple hosts by creating unique hostid file paths */
71237ed97b9dSdrh int sqlite3_hostid_num = 0;
71247ed97b9dSdrh #endif
71257ed97b9dSdrh 
71267ed97b9dSdrh #define PROXY_HOSTIDLEN    16  /* conch file host id length */
71277ed97b9dSdrh 
7128e4079e1fSdrh #if HAVE_GETHOSTUUID
71290ab216a6Sdrh /* Not always defined in the headers as it ought to be */
71300ab216a6Sdrh extern int gethostuuid(uuid_t id, const struct timespec *wait);
71316bca6511Sdrh #endif
71320ab216a6Sdrh 
71337ed97b9dSdrh /* get the host ID via gethostuuid(), pHostID must point to PROXY_HOSTIDLEN
71347ed97b9dSdrh ** bytes of writable memory.
71357ed97b9dSdrh */
proxyGetHostID(unsigned char * pHostID,int * pError)71367ed97b9dSdrh static int proxyGetHostID(unsigned char *pHostID, int *pError){
71377ed97b9dSdrh   assert(PROXY_HOSTIDLEN == sizeof(uuid_t));
71387ed97b9dSdrh   memset(pHostID, 0, PROXY_HOSTIDLEN);
7139e4079e1fSdrh #if HAVE_GETHOSTUUID
714029ecd8a0Sdrh   {
71414bf66fd6Sdrh     struct timespec timeout = {1, 0}; /* 1 sec timeout */
71427ed97b9dSdrh     if( gethostuuid(pHostID, &timeout) ){
71437ed97b9dSdrh       int err = errno;
71447ed97b9dSdrh       if( pError ){
71457ed97b9dSdrh         *pError = err;
71467ed97b9dSdrh       }
71477ed97b9dSdrh       return SQLITE_IOERR;
71487ed97b9dSdrh     }
714929ecd8a0Sdrh   }
71503d4435b2Sdrh #else
71513d4435b2Sdrh   UNUSED_PARAMETER(pError);
7152e8b0c9b4Sdrh #endif
71537ed97b9dSdrh #ifdef SQLITE_TEST
71547ed97b9dSdrh   /* simulate multiple hosts by creating unique hostid file paths */
71557ed97b9dSdrh   if( sqlite3_hostid_num != 0){
71567ed97b9dSdrh     pHostID[0] = (char)(pHostID[0] + (char)(sqlite3_hostid_num & 0xFF));
71577ed97b9dSdrh   }
71587ed97b9dSdrh #endif
71597ed97b9dSdrh 
71607ed97b9dSdrh   return SQLITE_OK;
71617ed97b9dSdrh }
71627ed97b9dSdrh 
71637ed97b9dSdrh /* The conch file contains the header, host id and lock file path
71647ed97b9dSdrh  */
71657ed97b9dSdrh #define PROXY_CONCHVERSION 2   /* 1-byte header, 16-byte host id, path */
71667ed97b9dSdrh #define PROXY_HEADERLEN    1   /* conch file header length */
71677ed97b9dSdrh #define PROXY_PATHINDEX    (PROXY_HEADERLEN+PROXY_HOSTIDLEN)
71687ed97b9dSdrh #define PROXY_MAXCONCHLEN  (PROXY_HEADERLEN+PROXY_HOSTIDLEN+MAXPATHLEN)
71697ed97b9dSdrh 
71707ed97b9dSdrh /*
71717ed97b9dSdrh ** Takes an open conch file, copies the contents to a new path and then moves
71727ed97b9dSdrh ** it back.  The newly created file's file descriptor is assigned to the
71737ed97b9dSdrh ** conch file structure and finally the original conch file descriptor is
71747ed97b9dSdrh ** closed.  Returns zero if successful.
71757ed97b9dSdrh */
proxyBreakConchLock(unixFile * pFile,uuid_t myHostID)71767ed97b9dSdrh static int proxyBreakConchLock(unixFile *pFile, uuid_t myHostID){
71777ed97b9dSdrh   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
71787ed97b9dSdrh   unixFile *conchFile = pCtx->conchFile;
71797ed97b9dSdrh   char tPath[MAXPATHLEN];
71807ed97b9dSdrh   char buf[PROXY_MAXCONCHLEN];
71817ed97b9dSdrh   char *cPath = pCtx->conchFilePath;
71827ed97b9dSdrh   size_t readLen = 0;
71837ed97b9dSdrh   size_t pathLen = 0;
71847ed97b9dSdrh   char errmsg[64] = "";
71857ed97b9dSdrh   int fd = -1;
71867ed97b9dSdrh   int rc = -1;
71870ab216a6Sdrh   UNUSED_PARAMETER(myHostID);
71887ed97b9dSdrh 
71897ed97b9dSdrh   /* create a new path by replace the trailing '-conch' with '-break' */
71907ed97b9dSdrh   pathLen = strlcpy(tPath, cPath, MAXPATHLEN);
71917ed97b9dSdrh   if( pathLen>MAXPATHLEN || pathLen<6 ||
71927ed97b9dSdrh      (strlcpy(&tPath[pathLen-5], "break", 6) != 5) ){
71930cb3a1ecSdan     sqlite3_snprintf(sizeof(errmsg),errmsg,"path error (len %d)",(int)pathLen);
71947ed97b9dSdrh     goto end_breaklock;
71957ed97b9dSdrh   }
71967ed97b9dSdrh   /* read the conch content */
7197e562be52Sdrh   readLen = osPread(conchFile->h, buf, PROXY_MAXCONCHLEN, 0);
71987ed97b9dSdrh   if( readLen<PROXY_PATHINDEX ){
71990cb3a1ecSdan     sqlite3_snprintf(sizeof(errmsg),errmsg,"read error (len %d)",(int)readLen);
72007ed97b9dSdrh     goto end_breaklock;
72017ed97b9dSdrh   }
72027ed97b9dSdrh   /* write it out to the temporary break file */
7203c398c65bSdrh   fd = robust_open(tPath, (O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW), 0);
72047ed97b9dSdrh   if( fd<0 ){
72050cb3a1ecSdan     sqlite3_snprintf(sizeof(errmsg), errmsg, "create failed (%d)", errno);
72067ed97b9dSdrh     goto end_breaklock;
72077ed97b9dSdrh   }
7208e562be52Sdrh   if( osPwrite(fd, buf, readLen, 0) != (ssize_t)readLen ){
72090cb3a1ecSdan     sqlite3_snprintf(sizeof(errmsg), errmsg, "write failed (%d)", errno);
72107ed97b9dSdrh     goto end_breaklock;
72117ed97b9dSdrh   }
72127ed97b9dSdrh   if( rename(tPath, cPath) ){
72130cb3a1ecSdan     sqlite3_snprintf(sizeof(errmsg), errmsg, "rename failed (%d)", errno);
72147ed97b9dSdrh     goto end_breaklock;
72157ed97b9dSdrh   }
72167ed97b9dSdrh   rc = 0;
72177ed97b9dSdrh   fprintf(stderr, "broke stale lock on %s\n", cPath);
72180e9365ceSdrh   robust_close(pFile, conchFile->h, __LINE__);
72197ed97b9dSdrh   conchFile->h = fd;
72207ed97b9dSdrh   conchFile->openFlags = O_RDWR | O_CREAT;
72217ed97b9dSdrh 
72227ed97b9dSdrh end_breaklock:
72237ed97b9dSdrh   if( rc ){
72247ed97b9dSdrh     if( fd>=0 ){
7225036ac7faSdrh       osUnlink(tPath);
72260e9365ceSdrh       robust_close(pFile, fd, __LINE__);
72277ed97b9dSdrh     }
72287ed97b9dSdrh     fprintf(stderr, "failed to break stale lock on %s, %s\n", cPath, errmsg);
72297ed97b9dSdrh   }
72307ed97b9dSdrh   return rc;
72317ed97b9dSdrh }
72327ed97b9dSdrh 
72337ed97b9dSdrh /* Take the requested lock on the conch file and break a stale lock if the
72347ed97b9dSdrh ** host id matches.
72357ed97b9dSdrh */
proxyConchLock(unixFile * pFile,uuid_t myHostID,int lockType)72367ed97b9dSdrh static int proxyConchLock(unixFile *pFile, uuid_t myHostID, int lockType){
72377ed97b9dSdrh   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
72387ed97b9dSdrh   unixFile *conchFile = pCtx->conchFile;
72397ed97b9dSdrh   int rc = SQLITE_OK;
72407ed97b9dSdrh   int nTries = 0;
72417ed97b9dSdrh   struct timespec conchModTime;
72427ed97b9dSdrh 
72433d4435b2Sdrh   memset(&conchModTime, 0, sizeof(conchModTime));
72447ed97b9dSdrh   do {
72457ed97b9dSdrh     rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, lockType);
72467ed97b9dSdrh     nTries ++;
72477ed97b9dSdrh     if( rc==SQLITE_BUSY ){
72487ed97b9dSdrh       /* If the lock failed (busy):
72497ed97b9dSdrh        * 1st try: get the mod time of the conch, wait 0.5s and try again.
72507ed97b9dSdrh        * 2nd try: fail if the mod time changed or host id is different, wait
72517ed97b9dSdrh        *           10 sec and try again
72527ed97b9dSdrh        * 3rd try: break the lock unless the mod time has changed.
72537ed97b9dSdrh        */
72547ed97b9dSdrh       struct stat buf;
725599ab3b12Sdrh       if( osFstat(conchFile->h, &buf) ){
72564bf66fd6Sdrh         storeLastErrno(pFile, errno);
72577ed97b9dSdrh         return SQLITE_IOERR_LOCK;
72587ed97b9dSdrh       }
72597ed97b9dSdrh 
72607ed97b9dSdrh       if( nTries==1 ){
72617ed97b9dSdrh         conchModTime = buf.st_mtimespec;
7262ddcfe921Sdrh         unixSleep(0,500000); /* wait 0.5 sec and try the lock again*/
72637ed97b9dSdrh         continue;
72647ed97b9dSdrh       }
72657ed97b9dSdrh 
72667ed97b9dSdrh       assert( nTries>1 );
72677ed97b9dSdrh       if( conchModTime.tv_sec != buf.st_mtimespec.tv_sec ||
72687ed97b9dSdrh          conchModTime.tv_nsec != buf.st_mtimespec.tv_nsec ){
72697ed97b9dSdrh         return SQLITE_BUSY;
72707ed97b9dSdrh       }
72717ed97b9dSdrh 
72727ed97b9dSdrh       if( nTries==2 ){
72737ed97b9dSdrh         char tBuf[PROXY_MAXCONCHLEN];
7274e562be52Sdrh         int len = osPread(conchFile->h, tBuf, PROXY_MAXCONCHLEN, 0);
72757ed97b9dSdrh         if( len<0 ){
72764bf66fd6Sdrh           storeLastErrno(pFile, errno);
72777ed97b9dSdrh           return SQLITE_IOERR_LOCK;
72787ed97b9dSdrh         }
72797ed97b9dSdrh         if( len>PROXY_PATHINDEX && tBuf[0]==(char)PROXY_CONCHVERSION){
72807ed97b9dSdrh           /* don't break the lock if the host id doesn't match */
72817ed97b9dSdrh           if( 0!=memcmp(&tBuf[PROXY_HEADERLEN], myHostID, PROXY_HOSTIDLEN) ){
72827ed97b9dSdrh             return SQLITE_BUSY;
72837ed97b9dSdrh           }
72847ed97b9dSdrh         }else{
72857ed97b9dSdrh           /* don't break the lock on short read or a version mismatch */
72867ed97b9dSdrh           return SQLITE_BUSY;
72877ed97b9dSdrh         }
7288ddcfe921Sdrh         unixSleep(0,10000000); /* wait 10 sec and try the lock again */
72897ed97b9dSdrh         continue;
72907ed97b9dSdrh       }
72917ed97b9dSdrh 
72927ed97b9dSdrh       assert( nTries==3 );
72937ed97b9dSdrh       if( 0==proxyBreakConchLock(pFile, myHostID) ){
72947ed97b9dSdrh         rc = SQLITE_OK;
72957ed97b9dSdrh         if( lockType==EXCLUSIVE_LOCK ){
72967ed97b9dSdrh           rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, SHARED_LOCK);
72977ed97b9dSdrh         }
72987ed97b9dSdrh         if( !rc ){
72997ed97b9dSdrh           rc = conchFile->pMethod->xLock((sqlite3_file*)conchFile, lockType);
73007ed97b9dSdrh         }
73017ed97b9dSdrh       }
73027ed97b9dSdrh     }
73037ed97b9dSdrh   } while( rc==SQLITE_BUSY && nTries<3 );
73047ed97b9dSdrh 
73057ed97b9dSdrh   return rc;
73067ed97b9dSdrh }
73077ed97b9dSdrh 
/* Takes the conch by taking a shared lock on the conch file and reading its
** contents.  If lockPath is non-NULL, the host ID and lock file path must
** match.  A NULL lockPath means that the lockPath in the conch file will be
** used if the host IDs match, or a new lock path will be generated
** automatically and written to the conch file.
*/
proxyTakeConch(unixFile * pFile)7314715ff30eSdrh static int proxyTakeConch(unixFile *pFile){
7315715ff30eSdrh   proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
7316715ff30eSdrh 
73177ed97b9dSdrh   if( pCtx->conchHeld!=0 ){
7318715ff30eSdrh     return SQLITE_OK;
7319715ff30eSdrh   }else{
7320715ff30eSdrh     unixFile *conchFile = pCtx->conchFile;
73217ed97b9dSdrh     uuid_t myHostID;
73227ed97b9dSdrh     int pError = 0;
73237ed97b9dSdrh     char readBuf[PROXY_MAXCONCHLEN];
7324715ff30eSdrh     char lockPath[MAXPATHLEN];
73257ed97b9dSdrh     char *tempLockPath = NULL;
7326715ff30eSdrh     int rc = SQLITE_OK;
73277ed97b9dSdrh     int createConch = 0;
73287ed97b9dSdrh     int hostIdMatch = 0;
73297ed97b9dSdrh     int readLen = 0;
73307ed97b9dSdrh     int tryOldLockPath = 0;
73317ed97b9dSdrh     int forceNewLockPath = 0;
7332715ff30eSdrh 
7333308c2a5cSdrh     OSTRACE(("TAKECONCH  %d for %s pid=%d\n", conchFile->h,
733491eb93c7Sdrh              (pCtx->lockProxyPath ? pCtx->lockProxyPath : ":auto:"),
73355ac93652Sdrh              osGetpid(0)));
7336715ff30eSdrh 
73377ed97b9dSdrh     rc = proxyGetHostID(myHostID, &pError);
7338715ff30eSdrh     if( (rc&0xff)==SQLITE_IOERR ){
73394bf66fd6Sdrh       storeLastErrno(pFile, pError);
73407ed97b9dSdrh       goto end_takeconch;
7341715ff30eSdrh     }
73427ed97b9dSdrh     rc = proxyConchLock(pFile, myHostID, SHARED_LOCK);
7343715ff30eSdrh     if( rc!=SQLITE_OK ){
7344715ff30eSdrh       goto end_takeconch;
7345715ff30eSdrh     }
73467ed97b9dSdrh     /* read the existing conch file */
73477ed97b9dSdrh     readLen = seekAndRead((unixFile*)conchFile, 0, readBuf, PROXY_MAXCONCHLEN);
73487ed97b9dSdrh     if( readLen<0 ){
73497ed97b9dSdrh       /* I/O error: lastErrno set by seekAndRead */
73504bf66fd6Sdrh       storeLastErrno(pFile, conchFile->lastErrno);
73517ed97b9dSdrh       rc = SQLITE_IOERR_READ;
7352715ff30eSdrh       goto end_takeconch;
73537ed97b9dSdrh     }else if( readLen<=(PROXY_HEADERLEN+PROXY_HOSTIDLEN) ||
73547ed97b9dSdrh              readBuf[0]!=(char)PROXY_CONCHVERSION ){
73557ed97b9dSdrh       /* a short read or version format mismatch means we need to create a new
73567ed97b9dSdrh       ** conch file.
73577ed97b9dSdrh       */
73587ed97b9dSdrh       createConch = 1;
7359715ff30eSdrh     }
73607ed97b9dSdrh     /* if the host id matches and the lock path already exists in the conch
73617ed97b9dSdrh     ** we'll try to use the path there, if we can't open that path, we'll
73627ed97b9dSdrh     ** retry with a new auto-generated path
73637ed97b9dSdrh     */
73647ed97b9dSdrh     do { /* in case we need to try again for an :auto: named lock file */
73657ed97b9dSdrh 
73667ed97b9dSdrh       if( !createConch && !forceNewLockPath ){
73677ed97b9dSdrh         hostIdMatch = !memcmp(&readBuf[PROXY_HEADERLEN], myHostID,
73687ed97b9dSdrh                                   PROXY_HOSTIDLEN);
7369715ff30eSdrh         /* if the conch has data compare the contents */
7370715ff30eSdrh         if( !pCtx->lockProxyPath ){
7371715ff30eSdrh           /* for auto-named local lock file, just check the host ID and we'll
73727ed97b9dSdrh            ** use the local lock file path that's already in there
7373715ff30eSdrh            */
73747ed97b9dSdrh           if( hostIdMatch ){
73757ed97b9dSdrh             size_t pathLen = (readLen - PROXY_PATHINDEX);
73767ed97b9dSdrh 
73777ed97b9dSdrh             if( pathLen>=MAXPATHLEN ){
73787ed97b9dSdrh               pathLen=MAXPATHLEN-1;
73797ed97b9dSdrh             }
73807ed97b9dSdrh             memcpy(lockPath, &readBuf[PROXY_PATHINDEX], pathLen);
73817ed97b9dSdrh             lockPath[pathLen] = 0;
73827ed97b9dSdrh             tempLockPath = lockPath;
73837ed97b9dSdrh             tryOldLockPath = 1;
73847ed97b9dSdrh             /* create a copy of the lock path if the conch is taken */
73857ed97b9dSdrh             goto end_takeconch;
73867ed97b9dSdrh           }
73877ed97b9dSdrh         }else if( hostIdMatch
73887ed97b9dSdrh                && !strncmp(pCtx->lockProxyPath, &readBuf[PROXY_PATHINDEX],
73897ed97b9dSdrh                            readLen-PROXY_PATHINDEX)
73907ed97b9dSdrh         ){
73917ed97b9dSdrh           /* conch host and lock path match */
73927ed97b9dSdrh           goto end_takeconch;
73937ed97b9dSdrh         }
7394715ff30eSdrh       }
7395715ff30eSdrh 
73967ed97b9dSdrh       /* if the conch isn't writable and doesn't match, we can't take it */
73977ed97b9dSdrh       if( (conchFile->openFlags&O_RDWR) == 0 ){
73987ed97b9dSdrh         rc = SQLITE_BUSY;
73997ed97b9dSdrh         goto end_takeconch;
74007ed97b9dSdrh       }
74017ed97b9dSdrh 
74027ed97b9dSdrh       /* either the conch didn't match or we need to create a new one */
7403715ff30eSdrh       if( !pCtx->lockProxyPath ){
7404715ff30eSdrh         proxyGetLockPath(pCtx->dbPath, lockPath, MAXPATHLEN);
74057ed97b9dSdrh         tempLockPath = lockPath;
74067ed97b9dSdrh         /* create a copy of the lock path _only_ if the conch is taken */
7407715ff30eSdrh       }
7408715ff30eSdrh 
7409715ff30eSdrh       /* update conch with host and path (this will fail if other process
74107ed97b9dSdrh       ** has a shared lock already), if the host id matches, use the big
74117ed97b9dSdrh       ** stick.
74127ed97b9dSdrh       */
74137ed97b9dSdrh       futimes(conchFile->h, NULL);
74147ed97b9dSdrh       if( hostIdMatch && !createConch ){
74158af6c228Sdrh         if( conchFile->pInode && conchFile->pInode->nShared>1 ){
74167ed97b9dSdrh           /* We are trying for an exclusive lock but another thread in this
74177ed97b9dSdrh            ** same process is still holding a shared lock. */
74187ed97b9dSdrh           rc = SQLITE_BUSY;
74197ed97b9dSdrh         } else {
74207ed97b9dSdrh           rc = proxyConchLock(pFile, myHostID, EXCLUSIVE_LOCK);
74217ed97b9dSdrh         }
74227ed97b9dSdrh       }else{
74234bf66fd6Sdrh         rc = proxyConchLock(pFile, myHostID, EXCLUSIVE_LOCK);
74247ed97b9dSdrh       }
7425715ff30eSdrh       if( rc==SQLITE_OK ){
74267ed97b9dSdrh         char writeBuffer[PROXY_MAXCONCHLEN];
74277ed97b9dSdrh         int writeSize = 0;
74287ed97b9dSdrh 
74297ed97b9dSdrh         writeBuffer[0] = (char)PROXY_CONCHVERSION;
74307ed97b9dSdrh         memcpy(&writeBuffer[PROXY_HEADERLEN], myHostID, PROXY_HOSTIDLEN);
74317ed97b9dSdrh         if( pCtx->lockProxyPath!=NULL ){
74324bf66fd6Sdrh           strlcpy(&writeBuffer[PROXY_PATHINDEX], pCtx->lockProxyPath,
74334bf66fd6Sdrh                   MAXPATHLEN);
74347ed97b9dSdrh         }else{
74357ed97b9dSdrh           strlcpy(&writeBuffer[PROXY_PATHINDEX], tempLockPath, MAXPATHLEN);
74367ed97b9dSdrh         }
74377ed97b9dSdrh         writeSize = PROXY_PATHINDEX + strlen(&writeBuffer[PROXY_PATHINDEX]);
7438ff81231eSdrh         robust_ftruncate(conchFile->h, writeSize);
74397ed97b9dSdrh         rc = unixWrite((sqlite3_file *)conchFile, writeBuffer, writeSize, 0);
74406d258995Sdrh         full_fsync(conchFile->h,0,0);
74417ed97b9dSdrh         /* If we created a new conch file (not just updated the contents of a
74427ed97b9dSdrh          ** valid conch file), try to match the permissions of the database
74437ed97b9dSdrh          */
74447ed97b9dSdrh         if( rc==SQLITE_OK && createConch ){
7445715ff30eSdrh           struct stat buf;
744699ab3b12Sdrh           int err = osFstat(pFile->h, &buf);
7447715ff30eSdrh           if( err==0 ){
74487ed97b9dSdrh             mode_t cmode = buf.st_mode&(S_IRUSR|S_IWUSR | S_IRGRP|S_IWGRP |
74497ed97b9dSdrh                                         S_IROTH|S_IWOTH);
74507ed97b9dSdrh             /* try to match the database file R/W permissions, ignore failure */
7451715ff30eSdrh #ifndef SQLITE_PROXY_DEBUG
7452e562be52Sdrh             osFchmod(conchFile->h, cmode);
7453715ff30eSdrh #else
7454ff81231eSdrh             do{
7455e562be52Sdrh               rc = osFchmod(conchFile->h, cmode);
7456ff81231eSdrh             }while( rc==(-1) && errno==EINTR );
7457ff81231eSdrh             if( rc!=0 ){
7458715ff30eSdrh               int code = errno;
7459715ff30eSdrh               fprintf(stderr, "fchmod %o FAILED with %d %s\n",
74607ed97b9dSdrh                       cmode, code, strerror(code));
7461715ff30eSdrh             } else {
74627ed97b9dSdrh               fprintf(stderr, "fchmod %o SUCCEDED\n",cmode);
7463715ff30eSdrh             }
7464715ff30eSdrh           }else{
7465715ff30eSdrh             int code = errno;
7466715ff30eSdrh             fprintf(stderr, "STAT FAILED[%d] with %d %s\n",
7467715ff30eSdrh                     err, code, strerror(code));
7468715ff30eSdrh #endif
7469715ff30eSdrh           }
7470715ff30eSdrh         }
7471715ff30eSdrh       }
7472715ff30eSdrh       conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, SHARED_LOCK);
7473715ff30eSdrh 
7474715ff30eSdrh     end_takeconch:
7475308c2a5cSdrh       OSTRACE(("TRANSPROXY: CLOSE  %d\n", pFile->h));
7476715ff30eSdrh       if( rc==SQLITE_OK && pFile->openFlags ){
74773d4435b2Sdrh         int fd;
7478715ff30eSdrh         if( pFile->h>=0 ){
7479e84009f6Sdrh           robust_close(pFile, pFile->h, __LINE__);
7480715ff30eSdrh         }
7481715ff30eSdrh         pFile->h = -1;
74828c815d14Sdrh         fd = robust_open(pCtx->dbPath, pFile->openFlags, 0);
7483308c2a5cSdrh         OSTRACE(("TRANSPROXY: OPEN  %d\n", fd));
7484715ff30eSdrh         if( fd>=0 ){
7485715ff30eSdrh           pFile->h = fd;
7486715ff30eSdrh         }else{
74879978c97eSdrh           rc=SQLITE_CANTOPEN_BKPT; /* SQLITE_BUSY? proxyTakeConch called
74881875f7a3Sdrh            during locking */
7489715ff30eSdrh         }
7490715ff30eSdrh       }
7491715ff30eSdrh       if( rc==SQLITE_OK && !pCtx->lockProxy ){
74927ed97b9dSdrh         char *path = tempLockPath ? tempLockPath : pCtx->lockProxyPath;
74937ed97b9dSdrh         rc = proxyCreateUnixFile(path, &pCtx->lockProxy, 1);
74947ed97b9dSdrh         if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && tryOldLockPath ){
74957ed97b9dSdrh           /* we couldn't create the proxy lock file with the old lock file path
74967ed97b9dSdrh            ** so try again via auto-naming
74977ed97b9dSdrh            */
74987ed97b9dSdrh           forceNewLockPath = 1;
74997ed97b9dSdrh           tryOldLockPath = 0;
75002b0ef472Sdan           continue; /* go back to the do {} while start point, try again */
75017ed97b9dSdrh         }
75027ed97b9dSdrh       }
75037ed97b9dSdrh       if( rc==SQLITE_OK ){
75047ed97b9dSdrh         /* Need to make a copy of path if we extracted the value
75057ed97b9dSdrh          ** from the conch file or the path was allocated on the stack
75067ed97b9dSdrh          */
75077ed97b9dSdrh         if( tempLockPath ){
75087ed97b9dSdrh           pCtx->lockProxyPath = sqlite3DbStrDup(0, tempLockPath);
75097ed97b9dSdrh           if( !pCtx->lockProxyPath ){
7510fad3039cSmistachkin             rc = SQLITE_NOMEM_BKPT;
75117ed97b9dSdrh           }
75127ed97b9dSdrh         }
7513715ff30eSdrh       }
7514715ff30eSdrh       if( rc==SQLITE_OK ){
7515715ff30eSdrh         pCtx->conchHeld = 1;
7516715ff30eSdrh 
7517715ff30eSdrh         if( pCtx->lockProxy->pMethod == &afpIoMethods ){
75187ed97b9dSdrh           afpLockingContext *afpCtx;
75197ed97b9dSdrh           afpCtx = (afpLockingContext *)pCtx->lockProxy->lockingContext;
75207ed97b9dSdrh           afpCtx->dbPath = pCtx->lockProxyPath;
7521715ff30eSdrh         }
7522715ff30eSdrh       } else {
7523715ff30eSdrh         conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, NO_LOCK);
7524715ff30eSdrh       }
7525308c2a5cSdrh       OSTRACE(("TAKECONCH  %d %s\n", conchFile->h,
7526308c2a5cSdrh                rc==SQLITE_OK?"ok":"failed"));
7527715ff30eSdrh       return rc;
7528308c2a5cSdrh     } while (1); /* in case we need to retry the :auto: lock file -
7529308c2a5cSdrh                  ** we should never get here except via the 'continue' call. */
7530715ff30eSdrh   }
7531715ff30eSdrh }
7532715ff30eSdrh 
7533715ff30eSdrh /*
7534715ff30eSdrh ** If pFile holds a lock on a conch file, then release that lock.
7535715ff30eSdrh */
proxyReleaseConch(unixFile * pFile)7536715ff30eSdrh static int proxyReleaseConch(unixFile *pFile){
75371c5bb4d6Sdrh   int rc = SQLITE_OK;         /* Subroutine return code */
7538715ff30eSdrh   proxyLockingContext *pCtx;  /* The locking context for the proxy lock */
7539715ff30eSdrh   unixFile *conchFile;        /* Name of the conch file */
7540715ff30eSdrh 
7541715ff30eSdrh   pCtx = (proxyLockingContext *)pFile->lockingContext;
7542715ff30eSdrh   conchFile = pCtx->conchFile;
7543308c2a5cSdrh   OSTRACE(("RELEASECONCH  %d for %s pid=%d\n", conchFile->h,
7544715ff30eSdrh            (pCtx->lockProxyPath ? pCtx->lockProxyPath : ":auto:"),
75455ac93652Sdrh            osGetpid(0)));
75467ed97b9dSdrh   if( pCtx->conchHeld>0 ){
7547715ff30eSdrh     rc = conchFile->pMethod->xUnlock((sqlite3_file*)conchFile, NO_LOCK);
75487ed97b9dSdrh   }
75497ed97b9dSdrh   pCtx->conchHeld = 0;
7550308c2a5cSdrh   OSTRACE(("RELEASECONCH  %d %s\n", conchFile->h,
7551308c2a5cSdrh            (rc==SQLITE_OK ? "ok" : "failed")));
7552715ff30eSdrh   return rc;
7553715ff30eSdrh }
7554715ff30eSdrh 
7555715ff30eSdrh /*
7556715ff30eSdrh ** Given the name of a database file, compute the name of its conch file.
7557f3cdcdccSdrh ** Store the conch filename in memory obtained from sqlite3_malloc64().
7558715ff30eSdrh ** Make *pConchPath point to the new name.  Return SQLITE_OK on success
7559715ff30eSdrh ** or SQLITE_NOMEM if unable to obtain memory.
7560715ff30eSdrh **
7561715ff30eSdrh ** The caller is responsible for ensuring that the allocated memory
7562715ff30eSdrh ** space is eventually freed.
7563715ff30eSdrh **
7564715ff30eSdrh ** *pConchPath is set to NULL if a memory allocation error occurs.
7565715ff30eSdrh */
proxyCreateConchPathname(char * dbPath,char ** pConchPath)7566715ff30eSdrh static int proxyCreateConchPathname(char *dbPath, char **pConchPath){
7567715ff30eSdrh   int i;                        /* Loop counter */
7568ea678832Sdrh   int len = (int)strlen(dbPath); /* Length of database filename - dbPath */
7569715ff30eSdrh   char *conchPath;              /* buffer in which to construct conch name */
7570715ff30eSdrh 
7571715ff30eSdrh   /* Allocate space for the conch filename and initialize the name to
7572715ff30eSdrh   ** the name of the original database file. */
7573f3cdcdccSdrh   *pConchPath = conchPath = (char *)sqlite3_malloc64(len + 8);
7574715ff30eSdrh   if( conchPath==0 ){
7575fad3039cSmistachkin     return SQLITE_NOMEM_BKPT;
7576715ff30eSdrh   }
7577715ff30eSdrh   memcpy(conchPath, dbPath, len+1);
7578715ff30eSdrh 
7579715ff30eSdrh   /* now insert a "." before the last / character */
7580715ff30eSdrh   for( i=(len-1); i>=0; i-- ){
7581715ff30eSdrh     if( conchPath[i]=='/' ){
7582715ff30eSdrh       i++;
7583715ff30eSdrh       break;
7584715ff30eSdrh     }
7585715ff30eSdrh   }
7586715ff30eSdrh   conchPath[i]='.';
7587715ff30eSdrh   while ( i<len ){
7588715ff30eSdrh     conchPath[i+1]=dbPath[i];
7589715ff30eSdrh     i++;
7590715ff30eSdrh   }
7591715ff30eSdrh 
7592715ff30eSdrh   /* append the "-conch" suffix to the file */
7593715ff30eSdrh   memcpy(&conchPath[i+1], "-conch", 7);
7594ea678832Sdrh   assert( (int)strlen(conchPath) == len+7 );
7595715ff30eSdrh 
7596715ff30eSdrh   return SQLITE_OK;
7597715ff30eSdrh }
7598715ff30eSdrh 
7599715ff30eSdrh 
7600715ff30eSdrh /* Takes a fully configured proxy locking-style unix file and switches
7601715ff30eSdrh ** the local lock file path
7602715ff30eSdrh */
switchLockProxyPath(unixFile * pFile,const char * path)7603715ff30eSdrh static int switchLockProxyPath(unixFile *pFile, const char *path) {
7604715ff30eSdrh   proxyLockingContext *pCtx = (proxyLockingContext*)pFile->lockingContext;
7605715ff30eSdrh   char *oldPath = pCtx->lockProxyPath;
7606715ff30eSdrh   int rc = SQLITE_OK;
7607715ff30eSdrh 
7608308c2a5cSdrh   if( pFile->eFileLock!=NO_LOCK ){
7609715ff30eSdrh     return SQLITE_BUSY;
7610715ff30eSdrh   }
7611715ff30eSdrh 
7612715ff30eSdrh   /* nothing to do if the path is NULL, :auto: or matches the existing path */
7613715ff30eSdrh   if( !path || path[0]=='\0' || !strcmp(path, ":auto:") ||
7614715ff30eSdrh     (oldPath && !strncmp(oldPath, path, MAXPATHLEN)) ){
7615715ff30eSdrh     return SQLITE_OK;
7616715ff30eSdrh   }else{
7617715ff30eSdrh     unixFile *lockProxy = pCtx->lockProxy;
7618715ff30eSdrh     pCtx->lockProxy=NULL;
7619715ff30eSdrh     pCtx->conchHeld = 0;
7620715ff30eSdrh     if( lockProxy!=NULL ){
7621715ff30eSdrh       rc=lockProxy->pMethod->xClose((sqlite3_file *)lockProxy);
7622715ff30eSdrh       if( rc ) return rc;
7623715ff30eSdrh       sqlite3_free(lockProxy);
7624715ff30eSdrh     }
7625715ff30eSdrh     sqlite3_free(oldPath);
7626715ff30eSdrh     pCtx->lockProxyPath = sqlite3DbStrDup(0, path);
7627715ff30eSdrh   }
7628715ff30eSdrh 
7629715ff30eSdrh   return rc;
7630715ff30eSdrh }
7631715ff30eSdrh 
7632715ff30eSdrh /*
7633715ff30eSdrh ** pFile is a file that has been opened by a prior xOpen call.  dbPath
7634715ff30eSdrh ** is a string buffer at least MAXPATHLEN+1 characters in size.
7635715ff30eSdrh **
7636715ff30eSdrh ** This routine find the filename associated with pFile and writes it
7637715ff30eSdrh ** int dbPath.
7638715ff30eSdrh */
proxyGetDbPathForUnixFile(unixFile * pFile,char * dbPath)7639715ff30eSdrh static int proxyGetDbPathForUnixFile(unixFile *pFile, char *dbPath){
7640d2cb50b7Sdrh #if defined(__APPLE__)
7641715ff30eSdrh   if( pFile->pMethod == &afpIoMethods ){
7642715ff30eSdrh     /* afp style keeps a reference to the db path in the filePath field
7643715ff30eSdrh     ** of the struct */
7644ea678832Sdrh     assert( (int)strlen((char*)pFile->lockingContext)<=MAXPATHLEN );
76454bf66fd6Sdrh     strlcpy(dbPath, ((afpLockingContext *)pFile->lockingContext)->dbPath,
76464bf66fd6Sdrh             MAXPATHLEN);
7647715ff30eSdrh   } else
7648715ff30eSdrh #endif
7649715ff30eSdrh   if( pFile->pMethod == &dotlockIoMethods ){
7650715ff30eSdrh     /* dot lock style uses the locking context to store the dot lock
7651715ff30eSdrh     ** file path */
7652715ff30eSdrh     int len = strlen((char *)pFile->lockingContext) - strlen(DOTLOCK_SUFFIX);
7653715ff30eSdrh     memcpy(dbPath, (char *)pFile->lockingContext, len + 1);
7654715ff30eSdrh   }else{
7655715ff30eSdrh     /* all other styles use the locking context to store the db file path */
7656715ff30eSdrh     assert( strlen((char*)pFile->lockingContext)<=MAXPATHLEN );
76577ed97b9dSdrh     strlcpy(dbPath, (char *)pFile->lockingContext, MAXPATHLEN);
7658715ff30eSdrh   }
7659715ff30eSdrh   return SQLITE_OK;
7660715ff30eSdrh }
7661715ff30eSdrh 
7662715ff30eSdrh /*
7663715ff30eSdrh ** Takes an already filled in unix file and alters it so all file locking
7664715ff30eSdrh ** will be performed on the local proxy lock file.  The following fields
7665715ff30eSdrh ** are preserved in the locking context so that they can be restored and
7666715ff30eSdrh ** the unix structure properly cleaned up at close time:
7667715ff30eSdrh **  ->lockingContext
7668715ff30eSdrh **  ->pMethod
7669715ff30eSdrh */
proxyTransformUnixFile(unixFile * pFile,const char * path)7670715ff30eSdrh static int proxyTransformUnixFile(unixFile *pFile, const char *path) {
7671715ff30eSdrh   proxyLockingContext *pCtx;
7672715ff30eSdrh   char dbPath[MAXPATHLEN+1];       /* Name of the database file */
7673715ff30eSdrh   char *lockPath=NULL;
7674715ff30eSdrh   int rc = SQLITE_OK;
7675715ff30eSdrh 
7676308c2a5cSdrh   if( pFile->eFileLock!=NO_LOCK ){
7677715ff30eSdrh     return SQLITE_BUSY;
7678715ff30eSdrh   }
7679715ff30eSdrh   proxyGetDbPathForUnixFile(pFile, dbPath);
7680715ff30eSdrh   if( !path || path[0]=='\0' || !strcmp(path, ":auto:") ){
7681715ff30eSdrh     lockPath=NULL;
7682715ff30eSdrh   }else{
7683715ff30eSdrh     lockPath=(char *)path;
7684715ff30eSdrh   }
7685715ff30eSdrh 
7686308c2a5cSdrh   OSTRACE(("TRANSPROXY  %d for %s pid=%d\n", pFile->h,
76875ac93652Sdrh            (lockPath ? lockPath : ":auto:"), osGetpid(0)));
7688715ff30eSdrh 
7689f3cdcdccSdrh   pCtx = sqlite3_malloc64( sizeof(*pCtx) );
7690715ff30eSdrh   if( pCtx==0 ){
7691fad3039cSmistachkin     return SQLITE_NOMEM_BKPT;
7692715ff30eSdrh   }
7693715ff30eSdrh   memset(pCtx, 0, sizeof(*pCtx));
7694715ff30eSdrh 
7695715ff30eSdrh   rc = proxyCreateConchPathname(dbPath, &pCtx->conchFilePath);
7696715ff30eSdrh   if( rc==SQLITE_OK ){
76977ed97b9dSdrh     rc = proxyCreateUnixFile(pCtx->conchFilePath, &pCtx->conchFile, 0);
76987ed97b9dSdrh     if( rc==SQLITE_CANTOPEN && ((pFile->openFlags&O_RDWR) == 0) ){
76997ed97b9dSdrh       /* if (a) the open flags are not O_RDWR, (b) the conch isn't there, and
77007ed97b9dSdrh       ** (c) the file system is read-only, then enable no-locking access.
77017ed97b9dSdrh       ** Ugh, since O_RDONLY==0x0000 we test for !O_RDWR since unixOpen asserts
77027ed97b9dSdrh       ** that openFlags will have only one of O_RDONLY or O_RDWR.
77037ed97b9dSdrh       */
77047ed97b9dSdrh       struct statfs fsInfo;
77057ed97b9dSdrh       struct stat conchInfo;
77067ed97b9dSdrh       int goLockless = 0;
77077ed97b9dSdrh 
770899ab3b12Sdrh       if( osStat(pCtx->conchFilePath, &conchInfo) == -1 ) {
77097ed97b9dSdrh         int err = errno;
77107ed97b9dSdrh         if( (err==ENOENT) && (statfs(dbPath, &fsInfo) != -1) ){
77117ed97b9dSdrh           goLockless = (fsInfo.f_flags&MNT_RDONLY) == MNT_RDONLY;
77127ed97b9dSdrh         }
77137ed97b9dSdrh       }
77147ed97b9dSdrh       if( goLockless ){
77157ed97b9dSdrh         pCtx->conchHeld = -1; /* read only FS/ lockless */
77167ed97b9dSdrh         rc = SQLITE_OK;
77177ed97b9dSdrh       }
77187ed97b9dSdrh     }
7719715ff30eSdrh   }
7720715ff30eSdrh   if( rc==SQLITE_OK && lockPath ){
7721715ff30eSdrh     pCtx->lockProxyPath = sqlite3DbStrDup(0, lockPath);
7722715ff30eSdrh   }
7723715ff30eSdrh 
7724715ff30eSdrh   if( rc==SQLITE_OK ){
77257ed97b9dSdrh     pCtx->dbPath = sqlite3DbStrDup(0, dbPath);
77267ed97b9dSdrh     if( pCtx->dbPath==NULL ){
7727fad3039cSmistachkin       rc = SQLITE_NOMEM_BKPT;
77287ed97b9dSdrh     }
77297ed97b9dSdrh   }
77307ed97b9dSdrh   if( rc==SQLITE_OK ){
7731715ff30eSdrh     /* all memory is allocated, proxys are created and assigned,
7732715ff30eSdrh     ** switch the locking context and pMethod then return.
7733715ff30eSdrh     */
7734715ff30eSdrh     pCtx->oldLockingContext = pFile->lockingContext;
7735715ff30eSdrh     pFile->lockingContext = pCtx;
7736715ff30eSdrh     pCtx->pOldMethod = pFile->pMethod;
7737715ff30eSdrh     pFile->pMethod = &proxyIoMethods;
7738715ff30eSdrh   }else{
7739715ff30eSdrh     if( pCtx->conchFile ){
77407ed97b9dSdrh       pCtx->conchFile->pMethod->xClose((sqlite3_file *)pCtx->conchFile);
7741715ff30eSdrh       sqlite3_free(pCtx->conchFile);
7742715ff30eSdrh     }
7743d56b121dSdrh     sqlite3DbFree(0, pCtx->lockProxyPath);
7744715ff30eSdrh     sqlite3_free(pCtx->conchFilePath);
7745715ff30eSdrh     sqlite3_free(pCtx);
7746715ff30eSdrh   }
7747308c2a5cSdrh   OSTRACE(("TRANSPROXY  %d %s\n", pFile->h,
7748308c2a5cSdrh            (rc==SQLITE_OK ? "ok" : "failed")));
7749715ff30eSdrh   return rc;
7750715ff30eSdrh }
7751715ff30eSdrh 
7752715ff30eSdrh 
7753715ff30eSdrh /*
7754715ff30eSdrh ** This routine handles sqlite3_file_control() calls that are specific
7755715ff30eSdrh ** to proxy locking.
7756715ff30eSdrh */
proxyFileControl(sqlite3_file * id,int op,void * pArg)7757715ff30eSdrh static int proxyFileControl(sqlite3_file *id, int op, void *pArg){
7758715ff30eSdrh   switch( op ){
77594bf66fd6Sdrh     case SQLITE_FCNTL_GET_LOCKPROXYFILE: {
7760715ff30eSdrh       unixFile *pFile = (unixFile*)id;
7761715ff30eSdrh       if( pFile->pMethod == &proxyIoMethods ){
7762715ff30eSdrh         proxyLockingContext *pCtx = (proxyLockingContext*)pFile->lockingContext;
7763715ff30eSdrh         proxyTakeConch(pFile);
7764715ff30eSdrh         if( pCtx->lockProxyPath ){
7765715ff30eSdrh           *(const char **)pArg = pCtx->lockProxyPath;
7766715ff30eSdrh         }else{
7767715ff30eSdrh           *(const char **)pArg = ":auto: (not held)";
7768715ff30eSdrh         }
7769715ff30eSdrh       } else {
7770715ff30eSdrh         *(const char **)pArg = NULL;
7771715ff30eSdrh       }
7772715ff30eSdrh       return SQLITE_OK;
7773715ff30eSdrh     }
77744bf66fd6Sdrh     case SQLITE_FCNTL_SET_LOCKPROXYFILE: {
7775715ff30eSdrh       unixFile *pFile = (unixFile*)id;
7776715ff30eSdrh       int rc = SQLITE_OK;
7777715ff30eSdrh       int isProxyStyle = (pFile->pMethod == &proxyIoMethods);
7778715ff30eSdrh       if( pArg==NULL || (const char *)pArg==0 ){
7779715ff30eSdrh         if( isProxyStyle ){
77804bf66fd6Sdrh           /* turn off proxy locking - not supported.  If support is added for
77814bf66fd6Sdrh           ** switching proxy locking mode off then it will need to fail if
77824bf66fd6Sdrh           ** the journal mode is WAL mode.
77834bf66fd6Sdrh           */
7784715ff30eSdrh           rc = SQLITE_ERROR /*SQLITE_PROTOCOL? SQLITE_MISUSE?*/;
7785715ff30eSdrh         }else{
7786715ff30eSdrh           /* turn off proxy locking - already off - NOOP */
7787715ff30eSdrh           rc = SQLITE_OK;
7788715ff30eSdrh         }
7789715ff30eSdrh       }else{
7790715ff30eSdrh         const char *proxyPath = (const char *)pArg;
7791715ff30eSdrh         if( isProxyStyle ){
7792715ff30eSdrh           proxyLockingContext *pCtx =
7793715ff30eSdrh             (proxyLockingContext*)pFile->lockingContext;
7794715ff30eSdrh           if( !strcmp(pArg, ":auto:")
7795715ff30eSdrh            || (pCtx->lockProxyPath &&
7796715ff30eSdrh                !strncmp(pCtx->lockProxyPath, proxyPath, MAXPATHLEN))
7797715ff30eSdrh           ){
7798715ff30eSdrh             rc = SQLITE_OK;
7799715ff30eSdrh           }else{
7800715ff30eSdrh             rc = switchLockProxyPath(pFile, proxyPath);
7801715ff30eSdrh           }
7802715ff30eSdrh         }else{
7803715ff30eSdrh           /* turn on proxy file locking */
7804715ff30eSdrh           rc = proxyTransformUnixFile(pFile, proxyPath);
7805715ff30eSdrh         }
7806715ff30eSdrh       }
7807715ff30eSdrh       return rc;
7808715ff30eSdrh     }
7809715ff30eSdrh     default: {
7810715ff30eSdrh       assert( 0 );  /* The call assures that only valid opcodes are sent */
7811715ff30eSdrh     }
7812715ff30eSdrh   }
78138616cff6Sdrh   /*NOTREACHED*/ assert(0);
7814715ff30eSdrh   return SQLITE_ERROR;
7815715ff30eSdrh }
7816715ff30eSdrh 
7817715ff30eSdrh /*
7818715ff30eSdrh ** Within this division (the proxying locking implementation) the procedures
7819715ff30eSdrh ** above this point are all utilities.  The lock-related methods of the
7820715ff30eSdrh ** proxy-locking sqlite3_io_method object follow.
7821715ff30eSdrh */
7822715ff30eSdrh 
7823715ff30eSdrh 
7824715ff30eSdrh /*
7825715ff30eSdrh ** This routine checks if there is a RESERVED lock held on the specified
7826715ff30eSdrh ** file by this or any other process. If such a lock is held, set *pResOut
7827715ff30eSdrh ** to a non-zero value otherwise *pResOut is set to zero.  The return value
7828715ff30eSdrh ** is set to SQLITE_OK unless an I/O error occurs during lock checking.
7829715ff30eSdrh */
proxyCheckReservedLock(sqlite3_file * id,int * pResOut)7830715ff30eSdrh static int proxyCheckReservedLock(sqlite3_file *id, int *pResOut) {
7831715ff30eSdrh   unixFile *pFile = (unixFile*)id;
7832715ff30eSdrh   int rc = proxyTakeConch(pFile);
7833715ff30eSdrh   if( rc==SQLITE_OK ){
7834715ff30eSdrh     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
78357ed97b9dSdrh     if( pCtx->conchHeld>0 ){
7836715ff30eSdrh       unixFile *proxy = pCtx->lockProxy;
7837715ff30eSdrh       return proxy->pMethod->xCheckReservedLock((sqlite3_file*)proxy, pResOut);
78387ed97b9dSdrh     }else{ /* conchHeld < 0 is lockless */
78397ed97b9dSdrh       pResOut=0;
78407ed97b9dSdrh     }
7841715ff30eSdrh   }
7842715ff30eSdrh   return rc;
7843715ff30eSdrh }
7844715ff30eSdrh 
7845715ff30eSdrh /*
7846308c2a5cSdrh ** Lock the file with the lock specified by parameter eFileLock - one
7847715ff30eSdrh ** of the following:
7848715ff30eSdrh **
7849715ff30eSdrh **     (1) SHARED_LOCK
7850715ff30eSdrh **     (2) RESERVED_LOCK
7851715ff30eSdrh **     (3) PENDING_LOCK
7852715ff30eSdrh **     (4) EXCLUSIVE_LOCK
7853715ff30eSdrh **
7854715ff30eSdrh ** Sometimes when requesting one lock state, additional lock states
7855715ff30eSdrh ** are inserted in between.  The locking might fail on one of the later
7856715ff30eSdrh ** transitions leaving the lock state different from what it started but
7857715ff30eSdrh ** still short of its goal.  The following chart shows the allowed
7858715ff30eSdrh ** transitions and the inserted intermediate states:
7859715ff30eSdrh **
7860715ff30eSdrh **    UNLOCKED -> SHARED
7861715ff30eSdrh **    SHARED -> RESERVED
7862715ff30eSdrh **    SHARED -> (PENDING) -> EXCLUSIVE
7863715ff30eSdrh **    RESERVED -> (PENDING) -> EXCLUSIVE
7864715ff30eSdrh **    PENDING -> EXCLUSIVE
7865715ff30eSdrh **
7866715ff30eSdrh ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
7867715ff30eSdrh ** routine to lower a locking level.
7868715ff30eSdrh */
proxyLock(sqlite3_file * id,int eFileLock)7869308c2a5cSdrh static int proxyLock(sqlite3_file *id, int eFileLock) {
7870715ff30eSdrh   unixFile *pFile = (unixFile*)id;
7871715ff30eSdrh   int rc = proxyTakeConch(pFile);
7872715ff30eSdrh   if( rc==SQLITE_OK ){
7873715ff30eSdrh     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
78747ed97b9dSdrh     if( pCtx->conchHeld>0 ){
7875715ff30eSdrh       unixFile *proxy = pCtx->lockProxy;
7876308c2a5cSdrh       rc = proxy->pMethod->xLock((sqlite3_file*)proxy, eFileLock);
7877308c2a5cSdrh       pFile->eFileLock = proxy->eFileLock;
78787ed97b9dSdrh     }else{
78797ed97b9dSdrh       /* conchHeld < 0 is lockless */
78807ed97b9dSdrh     }
7881715ff30eSdrh   }
7882715ff30eSdrh   return rc;
7883715ff30eSdrh }
7884715ff30eSdrh 
7885715ff30eSdrh 
7886715ff30eSdrh /*
7887308c2a5cSdrh ** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
7888715ff30eSdrh ** must be either NO_LOCK or SHARED_LOCK.
7889715ff30eSdrh **
7890715ff30eSdrh ** If the locking level of the file descriptor is already at or below
7891715ff30eSdrh ** the requested locking level, this routine is a no-op.
7892715ff30eSdrh */
proxyUnlock(sqlite3_file * id,int eFileLock)7893308c2a5cSdrh static int proxyUnlock(sqlite3_file *id, int eFileLock) {
7894715ff30eSdrh   unixFile *pFile = (unixFile*)id;
7895715ff30eSdrh   int rc = proxyTakeConch(pFile);
7896715ff30eSdrh   if( rc==SQLITE_OK ){
7897715ff30eSdrh     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
78987ed97b9dSdrh     if( pCtx->conchHeld>0 ){
7899715ff30eSdrh       unixFile *proxy = pCtx->lockProxy;
7900308c2a5cSdrh       rc = proxy->pMethod->xUnlock((sqlite3_file*)proxy, eFileLock);
7901308c2a5cSdrh       pFile->eFileLock = proxy->eFileLock;
79027ed97b9dSdrh     }else{
79037ed97b9dSdrh       /* conchHeld < 0 is lockless */
79047ed97b9dSdrh     }
7905715ff30eSdrh   }
7906715ff30eSdrh   return rc;
7907715ff30eSdrh }
7908715ff30eSdrh 
7909715ff30eSdrh /*
7910715ff30eSdrh ** Close a file that uses proxy locks.
7911715ff30eSdrh */
proxyClose(sqlite3_file * id)7912715ff30eSdrh static int proxyClose(sqlite3_file *id) {
7913a8de1e1cSdrh   if( ALWAYS(id) ){
7914715ff30eSdrh     unixFile *pFile = (unixFile*)id;
7915715ff30eSdrh     proxyLockingContext *pCtx = (proxyLockingContext *)pFile->lockingContext;
7916715ff30eSdrh     unixFile *lockProxy = pCtx->lockProxy;
7917715ff30eSdrh     unixFile *conchFile = pCtx->conchFile;
7918715ff30eSdrh     int rc = SQLITE_OK;
7919715ff30eSdrh 
7920715ff30eSdrh     if( lockProxy ){
7921715ff30eSdrh       rc = lockProxy->pMethod->xUnlock((sqlite3_file*)lockProxy, NO_LOCK);
7922715ff30eSdrh       if( rc ) return rc;
7923715ff30eSdrh       rc = lockProxy->pMethod->xClose((sqlite3_file*)lockProxy);
7924715ff30eSdrh       if( rc ) return rc;
7925715ff30eSdrh       sqlite3_free(lockProxy);
7926715ff30eSdrh       pCtx->lockProxy = 0;
7927715ff30eSdrh     }
7928715ff30eSdrh     if( conchFile ){
7929715ff30eSdrh       if( pCtx->conchHeld ){
7930715ff30eSdrh         rc = proxyReleaseConch(pFile);
7931715ff30eSdrh         if( rc ) return rc;
7932715ff30eSdrh       }
7933715ff30eSdrh       rc = conchFile->pMethod->xClose((sqlite3_file*)conchFile);
7934715ff30eSdrh       if( rc ) return rc;
7935715ff30eSdrh       sqlite3_free(conchFile);
7936715ff30eSdrh     }
7937d56b121dSdrh     sqlite3DbFree(0, pCtx->lockProxyPath);
7938715ff30eSdrh     sqlite3_free(pCtx->conchFilePath);
7939d56b121dSdrh     sqlite3DbFree(0, pCtx->dbPath);
7940715ff30eSdrh     /* restore the original locking context and pMethod then close it */
7941715ff30eSdrh     pFile->lockingContext = pCtx->oldLockingContext;
7942715ff30eSdrh     pFile->pMethod = pCtx->pOldMethod;
7943715ff30eSdrh     sqlite3_free(pCtx);
7944715ff30eSdrh     return pFile->pMethod->xClose(id);
7945715ff30eSdrh   }
7946715ff30eSdrh   return SQLITE_OK;
7947715ff30eSdrh }
7948715ff30eSdrh 
7949715ff30eSdrh 
7950715ff30eSdrh 
7951d2cb50b7Sdrh #endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
7952715ff30eSdrh /*
7953715ff30eSdrh ** The proxy locking style is intended for use with AFP filesystems.
7954715ff30eSdrh ** And since AFP is only supported on MacOSX, the proxy locking is also
7955715ff30eSdrh ** restricted to MacOSX.
7956715ff30eSdrh **
7957715ff30eSdrh **
7958715ff30eSdrh ******************* End of the proxy lock implementation **********************
7959715ff30eSdrh ******************************************************************************/
7960715ff30eSdrh 
7961734c9864Sdrh /*
7962e339d65aSdanielk1977 ** Initialize the operating system interface.
7963734c9864Sdrh **
7964734c9864Sdrh ** This routine registers all VFS implementations for unix-like operating
7965734c9864Sdrh ** systems.  This routine, and the sqlite3_os_end() routine that follows,
7966734c9864Sdrh ** should be the only routines in this file that are visible from other
7967734c9864Sdrh ** files.
79686b9d6ddcSdrh **
79696b9d6ddcSdrh ** This routine is called once during SQLite initialization and by a
79706b9d6ddcSdrh ** single thread.  The memory allocation and mutex subsystems have not
79716b9d6ddcSdrh ** necessarily been initialized when this routine is called, and so they
79726b9d6ddcSdrh ** should not be used.
7973153c62c4Sdrh */
sqlite3_os_init(void)7974c0fa4c5fSdanielk1977 int sqlite3_os_init(void){
79756b9d6ddcSdrh   /*
79766b9d6ddcSdrh   ** The following macro defines an initializer for an sqlite3_vfs object.
79771875f7a3Sdrh   ** The name of the VFS is NAME.  The pAppData is a pointer to a pointer
79781875f7a3Sdrh   ** to the "finder" function.  (pAppData is a pointer to a pointer because
79791875f7a3Sdrh   ** silly C90 rules prohibit a void* from being cast to a function pointer
79801875f7a3Sdrh   ** and so we have to go through the intermediate pointer to avoid problems
79811875f7a3Sdrh   ** when compiling with -pedantic-errors on GCC.)
79821875f7a3Sdrh   **
79831875f7a3Sdrh   ** The FINDER parameter to this macro is the name of the pointer to the
79846b9d6ddcSdrh   ** finder-function.  The finder-function returns a pointer to the
79856b9d6ddcSdrh   ** sqlite_io_methods object that implements the desired locking
79866b9d6ddcSdrh   ** behaviors.  See the division above that contains the IOMETHODS
79876b9d6ddcSdrh   ** macro for addition information on finder-functions.
7988e339d65aSdanielk1977   **
79896b9d6ddcSdrh   ** Most finders simply return a pointer to a fixed sqlite3_io_methods
79906b9d6ddcSdrh   ** object.  But the "autolockIoFinder" available on MacOSX does a little
79916b9d6ddcSdrh   ** more than that; it looks at the filesystem type that hosts the
79926b9d6ddcSdrh   ** database file and tries to choose an locking method appropriate for
79936b9d6ddcSdrh   ** that filesystem time.
7994e339d65aSdanielk1977   */
79957708e972Sdrh   #define UNIXVFS(VFSNAME, FINDER) {                        \
799699ab3b12Sdrh     3,                    /* iVersion */                    \
7997e339d65aSdanielk1977     sizeof(unixFile),     /* szOsFile */                    \
7998e339d65aSdanielk1977     MAX_PATHNAME,         /* mxPathname */                  \
7999e339d65aSdanielk1977     0,                    /* pNext */                       \
80007708e972Sdrh     VFSNAME,              /* zName */                       \
80011875f7a3Sdrh     (void*)&FINDER,       /* pAppData */                    \
8002e339d65aSdanielk1977     unixOpen,             /* xOpen */                       \
8003e339d65aSdanielk1977     unixDelete,           /* xDelete */                     \
8004e339d65aSdanielk1977     unixAccess,           /* xAccess */                     \
8005e339d65aSdanielk1977     unixFullPathname,     /* xFullPathname */               \
8006e339d65aSdanielk1977     unixDlOpen,           /* xDlOpen */                     \
8007e339d65aSdanielk1977     unixDlError,          /* xDlError */                    \
8008e339d65aSdanielk1977     unixDlSym,            /* xDlSym */                      \
8009e339d65aSdanielk1977     unixDlClose,          /* xDlClose */                    \
8010e339d65aSdanielk1977     unixRandomness,       /* xRandomness */                 \
8011e339d65aSdanielk1977     unixSleep,            /* xSleep */                      \
8012e339d65aSdanielk1977     unixCurrentTime,      /* xCurrentTime */                \
8013f2424c52Sdrh     unixGetLastError,     /* xGetLastError */               \
8014b7e8ea20Sdrh     unixCurrentTimeInt64, /* xCurrentTimeInt64 */           \
801599ab3b12Sdrh     unixSetSystemCall,    /* xSetSystemCall */              \
80161df30967Sdrh     unixGetSystemCall,    /* xGetSystemCall */              \
80171df30967Sdrh     unixNextSystemCall,   /* xNextSystemCall */             \
8018e339d65aSdanielk1977   }
8019b4b47411Sdanielk1977 
80206b9d6ddcSdrh   /*
80216b9d6ddcSdrh   ** All default VFSes for unix are contained in the following array.
80226b9d6ddcSdrh   **
80236b9d6ddcSdrh   ** Note that the sqlite3_vfs.pNext field of the VFS object is modified
80246b9d6ddcSdrh   ** by the SQLite core when the VFS is registered.  So the following
80256b9d6ddcSdrh   ** array cannot be const.
80266b9d6ddcSdrh   */
8027e339d65aSdanielk1977   static sqlite3_vfs aVfs[] = {
8028e89b2918Sdrh #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
80297708e972Sdrh     UNIXVFS("unix",          autolockIoFinder ),
8030e89b2918Sdrh #elif OS_VXWORKS
8031e89b2918Sdrh     UNIXVFS("unix",          vxworksIoFinder ),
80327708e972Sdrh #else
80337708e972Sdrh     UNIXVFS("unix",          posixIoFinder ),
80347708e972Sdrh #endif
80357708e972Sdrh     UNIXVFS("unix-none",     nolockIoFinder ),
80367708e972Sdrh     UNIXVFS("unix-dotfile",  dotlockIoFinder ),
8037a7e61d8bSdrh     UNIXVFS("unix-excl",     posixIoFinder ),
8038734c9864Sdrh #if OS_VXWORKS
80397708e972Sdrh     UNIXVFS("unix-namedsem", semIoFinder ),
8040734c9864Sdrh #endif
8041e89b2918Sdrh #if SQLITE_ENABLE_LOCKING_STYLE || OS_VXWORKS
80427708e972Sdrh     UNIXVFS("unix-posix",    posixIoFinder ),
8043734c9864Sdrh #endif
8044e89b2918Sdrh #if SQLITE_ENABLE_LOCKING_STYLE
8045e89b2918Sdrh     UNIXVFS("unix-flock",    flockIoFinder ),
804678a1318bSchw #endif
8047d2cb50b7Sdrh #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
80487708e972Sdrh     UNIXVFS("unix-afp",      afpIoFinder ),
80497ed97b9dSdrh     UNIXVFS("unix-nfs",      nfsIoFinder ),
80507708e972Sdrh     UNIXVFS("unix-proxy",    proxyIoFinder ),
8051734c9864Sdrh #endif
8052b4b47411Sdanielk1977   };
80536b9d6ddcSdrh   unsigned int i;          /* Loop counter */
80546b9d6ddcSdrh 
80552aa5a00eSdrh   /* Double-check that the aSyscall[] array has been constructed
80562aa5a00eSdrh   ** correctly.  See ticket [bb3a86e890c8e96ab] */
8057efe16971Sdan   assert( ArraySize(aSyscall)==29 );
80582aa5a00eSdrh 
80596b9d6ddcSdrh   /* Register all VFSes defined in the aVfs[] array */
8060e339d65aSdanielk1977   for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){
8061087f143fSdrh #ifdef SQLITE_DEFAULT_UNIX_VFS
8062087f143fSdrh     sqlite3_vfs_register(&aVfs[i],
8063087f143fSdrh            0==strcmp(aVfs[i].zName,SQLITE_DEFAULT_UNIX_VFS));
8064087f143fSdrh #else
8065734c9864Sdrh     sqlite3_vfs_register(&aVfs[i], i==0);
8066087f143fSdrh #endif
8067e339d65aSdanielk1977   }
8068*20a9ed1dSdrh #ifdef SQLITE_OS_KV_OPTIONAL
8069*20a9ed1dSdrh   sqlite3KvvfsInit();
8070*20a9ed1dSdrh #endif
807156115893Sdrh   unixBigLock = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_VFS1);
8072187e2e48Sdrh 
8073f54e7940Sdrh #ifndef SQLITE_OMIT_WAL
8074187e2e48Sdrh   /* Validate lock assumptions */
8075187e2e48Sdrh   assert( SQLITE_SHM_NLOCK==8 );  /* Number of available locks */
8076187e2e48Sdrh   assert( UNIX_SHM_BASE==120  );  /* Start of locking area */
8077187e2e48Sdrh   /* Locks:
807804f4b683Sdrh   **    WRITE       UNIX_SHM_BASE      120
807904f4b683Sdrh   **    CKPT        UNIX_SHM_BASE+1    121
808004f4b683Sdrh   **    RECOVER     UNIX_SHM_BASE+2    122
808104f4b683Sdrh   **    READ-0      UNIX_SHM_BASE+3    123
808204f4b683Sdrh   **    READ-1      UNIX_SHM_BASE+4    124
808304f4b683Sdrh   **    READ-2      UNIX_SHM_BASE+5    125
808404f4b683Sdrh   **    READ-3      UNIX_SHM_BASE+6    126
808504f4b683Sdrh   **    READ-4      UNIX_SHM_BASE+7    127
808604f4b683Sdrh   **    DMS         UNIX_SHM_BASE+8    128
808704f4b683Sdrh   */
8088187e2e48Sdrh   assert( UNIX_SHM_DMS==128   );  /* Byte offset of the deadman-switch */
8089f54e7940Sdrh #endif
8090f54e7940Sdrh 
8091d9137e3bSdan   /* Initialize temp file dir array. */
8092d9137e3bSdan   unixTempFileInit();
8093d9137e3bSdan 
8094c0fa4c5fSdanielk1977   return SQLITE_OK;
8095153c62c4Sdrh }
8096e339d65aSdanielk1977 
8097e339d65aSdanielk1977 /*
80986b9d6ddcSdrh ** Shutdown the operating system interface.
80996b9d6ddcSdrh **
81006b9d6ddcSdrh ** Some operating systems might need to do some cleanup in this routine,
81016b9d6ddcSdrh ** to release dynamically allocated objects.  But not on unix.
81026b9d6ddcSdrh ** This routine is a no-op for unix.
8103e339d65aSdanielk1977 */
sqlite3_os_end(void)8104c0fa4c5fSdanielk1977 int sqlite3_os_end(void){
810556115893Sdrh   unixBigLock = 0;
8106c0fa4c5fSdanielk1977   return SQLITE_OK;
8107c0fa4c5fSdanielk1977 }
810840257ffdSdrh 
810929bafeabSdanielk1977 #endif /* SQLITE_OS_UNIX */
8110