xref: /sqlite-3.40.0/src/os_unix.c (revision 4dcbdbff)
1 /*
2 ** 2004 May 22
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains code that is specific to Unix systems.
14 */
15 #include "sqliteInt.h"
16 #include "os.h"
17 #if OS_UNIX              /* This file is used on unix only */
18 
19 
20 #include <time.h>
21 #include <errno.h>
22 #include <unistd.h>
23 
24 /*
25 ** Do not include any of the File I/O interface procedures if the
26 ** SQLITE_OMIT_DISKIO macro is defined (indicating that the database
27 ** will be in-memory only)
28 */
29 #ifndef SQLITE_OMIT_DISKIO
30 
31 
/*
** Provide fallback definitions for open() flags that are missing from
** some systems.  A flag defined as 0 has no effect when it is OR-ed
** into the flags passed to open().  When SQLITE_DISABLE_LFS is defined,
** O_LARGEFILE is forced to 0 even if the system supplies a value.
*/
#ifdef SQLITE_DISABLE_LFS
# undef O_LARGEFILE
#endif
#if !defined(O_LARGEFILE)
# define O_LARGEFILE 0
#endif
#if !defined(O_NOFOLLOW)
# define O_NOFOLLOW 0
#endif
#if !defined(O_BINARY)
# define O_BINARY 0
#endif
48 
/*
** The DJGPP compiler environment resembles Unix in most respects but
** has no fcntl() system call.  Replace fcntl() with a stub that always
** evaluates to 0 (success).  As a consequence, no locking occurs under
** DJGPP.  But it's DOS - what did you expect?
*/
#if defined(__DJGPP__)
# define fcntl(A,B,C) 0
#endif
58 
59 /*
60 ** Include code that is common to all os_*.c files
61 */
62 #include "os_common.h"
63 
/*
** threadid expands to the ID of the calling thread when
** SQLITE_UNIX_THREADS is defined and to the constant 0 otherwise.
** It is used for testing and debugging only.
*/
#ifndef SQLITE_UNIX_THREADS
# define threadid 0
#else
# define threadid pthread_self()
#endif
73 
/*
** Set or check the OsFile.tid field.  This field is set when an OsFile
** is first opened.  All subsequent uses of the OsFile verify that the
** same thread is operating on the OsFile.  Some operating systems do
** not allow locks to be overridden by other threads and that restriction
** means that sqlite3* database handles cannot be moved from one thread
** to another.  This logic makes sure a user does not try to do that
** by mistake.
**
**   SET_THREADID(X)    Store the calling thread's ID in X->tid.
**   CHECK_THREADID(X)  True (non-zero) if the calling thread is NOT the
**                      thread recorded in X->tid.
**
** Without SQLITE_UNIX_THREADS, SET_THREADID() expands to nothing and
** CHECK_THREADID() is always 0.
**
** The argument is parenthesized inside the expansion so that any
** pointer-valued expression (for example "p+1") can be passed safely.
*/
#ifdef SQLITE_UNIX_THREADS
# define SET_THREADID(X)   ((X)->tid = pthread_self())
# define CHECK_THREADID(X) (!pthread_equal((X)->tid, pthread_self()))
#else
# define SET_THREADID(X)
# define CHECK_THREADID(X) 0
#endif
90 
91 /*
92 ** Here is the dirt on POSIX advisory locks:  ANSI STD 1003.1 (1996)
93 ** section 6.5.2.2 lines 483 through 490 specify that when a process
94 ** sets or clears a lock, that operation overrides any prior locks set
95 ** by the same process.  It does not explicitly say so, but this implies
96 ** that it overrides locks set by the same process using a different
97 ** file descriptor.  Consider this test case:
98 **
99 **       int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
100 **       int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
101 **
102 ** Suppose ./file1 and ./file2 are really the same file (because
103 ** one is a hard or symbolic link to the other) then if you set
104 ** an exclusive lock on fd1, then try to get an exclusive lock
105 ** on fd2, it works.  I would have expected the second lock to
106 ** fail since there was already a lock on the file due to fd1.
107 ** But not so.  Since both locks came from the same process, the
108 ** second overrides the first, even though they were on different
109 ** file descriptors opened on different file names.
110 **
111 ** Bummer.  If you ask me, this is broken.  Badly broken.  It means
112 ** that we cannot use POSIX locks to synchronize file access among
113 ** competing threads of the same process.  POSIX locks will work fine
114 ** to synchronize access for threads in separate processes, but not
115 ** threads within the same process.
116 **
117 ** To work around the problem, SQLite has to manage file locks internally
118 ** on its own.  Whenever a new database is opened, we have to find the
119 ** specific inode of the database file (the inode is determined by the
120 ** st_dev and st_ino fields of the stat structure that fstat() fills in)
121 ** and check for locks already existing on that inode.  When locks are
122 ** created or removed, we have to look at our own internal record of the
123 ** locks to see if another thread has previously set a lock on that same
124 ** inode.
125 **
126 ** The OsFile structure for POSIX is no longer just an integer file
127 ** descriptor.  It is now a structure that holds the integer file
128 ** descriptor and a pointer to a structure that describes the internal
129 ** locks on the corresponding inode.  There is one locking structure
130 ** per inode, so if the same inode is opened twice, both OsFile structures
131 ** point to the same locking structure.  The locking structure keeps
132 ** a reference count (so we will know when to delete it) and a "cnt"
133 ** field that tells us its internal lock status.  cnt==0 means the
134 ** file is unlocked.  cnt==-1 means the file has an exclusive lock.
135 ** cnt>0 means there are cnt shared locks on the file.
136 **
137 ** Any attempt to lock or unlock a file first checks the locking
138 ** structure.  The fcntl() system call is only invoked to set a
139 ** POSIX lock if the internal lock structure transitions between
140 ** a locked and an unlocked state.
141 **
142 ** 2004-Jan-11:
143 ** More recent discoveries about POSIX advisory locks.  (The more
144 ** I discover, the more I realize that POSIX advisory locks are
145 ** an abomination.)
146 **
147 ** If you close a file descriptor that points to a file that has locks,
148 ** all locks on that file that are owned by the current process are
149 ** released.  To work around this problem, each OsFile structure contains
150 ** a pointer to an openCnt structure.  There is one openCnt structure
151 ** per open inode, which means that multiple OsFiles can point to a single
152 ** openCnt.  When an attempt is made to close an OsFile, if there are
153 ** other OsFiles open on the same inode that are holding locks, the call
154 ** to close() the file descriptor is deferred until all of the locks clear.
155 ** The openCnt structure keeps a list of file descriptors that need to
156 ** be closed and that list is walked (and cleared) when the last lock
157 ** clears.
158 **
159 ** First, under Linux threads, because each thread has a separate
160 ** process ID, lock operations in one thread do not override locks
161 ** to the same file in other threads.  Linux threads behave like
162 ** separate processes in this respect.  But, if you close a file
163 ** descriptor in linux threads, all locks are cleared, even locks
164 ** on other threads and even though the other threads have different
165 ** process IDs.  Linux threads is inconsistent in this respect.
166 ** (I'm beginning to think that linux threads is an abomination too.)
167 ** The consequence of this all is that the hash table for the lockInfo
168 ** structure has to include the process id as part of its key because
169 ** locks in different threads are treated as distinct.  But the
170 ** openCnt structure should not include the process id in its
171 ** key because close() clears lock on all threads, not just the current
172 ** thread.  Were it not for this goofiness in linux threads, we could
173 ** combine the lockInfo and openCnt structures into a single structure.
174 **
175 ** 2004-Jun-28:
176 ** On some versions of linux, threads can override each others locks.
177 ** On others not.  Sometimes you can change the behavior on the same
178 ** system by setting the LD_ASSUME_KERNEL environment variable.  The
179 ** POSIX standard is silent as to which behavior is correct, as far
180 ** as I can tell, so other versions of unix might show the same
181 ** inconsistency.  There is little doubt in my mind that posix
182 ** advisory locks and linux threads are profoundly broken.
183 **
184 ** To work around the inconsistencies, we have to test at runtime
185 ** whether or not threads can override each others locks.  This test
186 ** is run once, the first time any lock is attempted.  A static
187 ** variable is set to record the results of this test for future
188 ** use.
189 */
190 
/*
** Lookup key for the lockInfo hash table.  A lockInfo structure is
** located from the device and inode numbers of the file.
**
** When threads cannot override each other's locks, tid is set to the
** ID of the owning thread so that each thread gets its own lockInfo.
** When threads can override each other's locks, tid is always zero.
** The tid field is omitted entirely when SQLITE_UNIX_THREADS is not
** defined.
*/
struct lockKey {
  dev_t dev;       /* Device number of the file */
  ino_t ino;       /* Inode number of the file */
#ifdef SQLITE_UNIX_THREADS
  pthread_t tid;   /* Thread ID, or zero if threads override each other */
#endif
};
207 
208 /*
209 ** An instance of the following structure is allocated for each open
210 ** inode on each thread with a different process ID.  (Threads have
211 ** different process IDs on linux, but not on most other unixes.)
212 **
213 ** A single inode can have multiple file descriptors, so each OsFile
214 ** structure contains a pointer to an instance of this object and this
215 ** object keeps a count of the number of OsFiles pointing to it.
216 */
217 struct lockInfo {
218   struct lockKey key;  /* The lookup key */
219   int cnt;             /* Number of SHARED locks held */
220   int locktype;        /* One of SHARED_LOCK, RESERVED_LOCK etc. */
221   int nRef;            /* Number of pointers to this structure */
222 };
223 
/*
** Lookup key for the openCnt hash table.  It identifies an inode in
** the same way as lockKey, except that no thread ID is included.
*/
struct openKey {
  dev_t dev;   /* Device number of the file */
  ino_t ino;   /* Inode number of the file */
};
233 
234 /*
235 ** An instance of the following structure is allocated for each open
236 ** inode.  This structure keeps track of the number of locks on that
237 ** inode.  If a close is attempted against an inode that is holding
238 ** locks, the close is deferred until all locks clear by adding the
239 ** file descriptor to be closed to the pending list.
240 */
241 struct openCnt {
242   struct openKey key;   /* The lookup key */
243   int nRef;             /* Number of pointers to this structure */
244   int nLock;            /* Number of outstanding locks */
245   int nPending;         /* Number of pending close() operations */
246   int *aPending;        /* Malloced space holding fd's awaiting a close() */
247 };
248 
249 /*
250 ** These hash table maps inodes and process IDs into lockInfo and openCnt
251 ** structures.  Access to these hash tables must be protected by a mutex.
252 */
253 static Hash lockHash = { SQLITE_HASH_BINARY, 0, 0, 0, 0, 0 };
254 static Hash openHash = { SQLITE_HASH_BINARY, 0, 0, 0, 0, 0 };
255 
256 
257 #ifdef SQLITE_UNIX_THREADS
/*
** Cached answer to the question "can threads override each other's
** locks?":
**
**   -1:  Not determined yet.
**    0:  No.  Threads cannot override each other's locks.
**    1:  Yes.  Threads can override each other's locks.
*/
static int threadsOverrideEachOthersLocks = -1;
267 
/*
** Argument block handed to each test thread started by
** testThreadLockingBehavior().  The thread applies "lock" to "fd" and
** stores the outcome in "result".
*/
struct threadTestData {
  int fd;                /* Descriptor of the file to lock */
  struct flock lock;     /* Lock request to apply */
  int result;            /* Value returned by the lock request */
};
277 
278 #ifdef SQLITE_LOCK_TRACE
279 /*
280 ** Print out information about all locking operations.
281 **
282 ** This routine is used for troubleshooting locks on multithreaded
283 ** platforms.  Enable by compiling with the -DSQLITE_LOCK_TRACE
284 ** command-line option on the compiler.  This code is normally
285 ** turnned off.
286 */
287 static int lockTrace(int fd, int op, struct flock *p){
288   char *zOpName, *zType;
289   int s;
290   int savedErrno;
291   if( op==F_GETLK ){
292     zOpName = "GETLK";
293   }else if( op==F_SETLK ){
294     zOpName = "SETLK";
295   }else{
296     s = fcntl(fd, op, p);
297     sqlite3DebugPrintf("fcntl unknown %d %d %d\n", fd, op, s);
298     return s;
299   }
300   if( p->l_type==F_RDLCK ){
301     zType = "RDLCK";
302   }else if( p->l_type==F_WRLCK ){
303     zType = "WRLCK";
304   }else if( p->l_type==F_UNLCK ){
305     zType = "UNLCK";
306   }else{
307     assert( 0 );
308   }
309   assert( p->l_whence==SEEK_SET );
310   s = fcntl(fd, op, p);
311   savedErrno = errno;
312   sqlite3DebugPrintf("fcntl %d %d %s %s %d %d %d %d\n",
313      threadid, fd, zOpName, zType, (int)p->l_start, (int)p->l_len,
314      (int)p->l_pid, s);
315   if( s && op==F_SETLK && (p->l_type==F_RDLCK || p->l_type==F_WRLCK) ){
316     struct flock l2;
317     l2 = *p;
318     fcntl(fd, F_GETLK, &l2);
319     if( l2.l_type==F_RDLCK ){
320       zType = "RDLCK";
321     }else if( l2.l_type==F_WRLCK ){
322       zType = "WRLCK";
323     }else if( l2.l_type==F_UNLCK ){
324       zType = "UNLCK";
325     }else{
326       assert( 0 );
327     }
328     sqlite3DebugPrintf("fcntl-failure-reason: %s %d %d %d\n",
329        zType, (int)l2.l_start, (int)l2.l_len, (int)l2.l_pid);
330   }
331   errno = savedErrno;
332   return s;
333 }
334 #define fcntl lockTrace
335 #endif /* SQLITE_LOCK_TRACE */
336 
337 /*
338 ** The testThreadLockingBehavior() routine launches two separate
339 ** threads on this routine.  This routine attempts to lock a file
340 ** descriptor then returns.  The success or failure of that attempt
341 ** allows the testThreadLockingBehavior() procedure to determine
342 ** whether or not threads can override each others locks.
343 */
344 static void *threadLockingTest(void *pArg){
345   struct threadTestData *pData = (struct threadTestData*)pArg;
346   pData->result = fcntl(pData->fd, F_SETLK, &pData->lock);
347   return pArg;
348 }
349 
350 /*
351 ** This procedure attempts to determine whether or not threads
352 ** can override each others locks then sets the
353 ** threadsOverrideEachOthersLocks variable appropriately.
354 */
355 static void testThreadLockingBehavior(fd_orig){
356   int fd;
357   struct threadTestData d[2];
358   pthread_t t[2];
359 
360   fd = dup(fd_orig);
361   if( fd<0 ) return;
362   memset(d, 0, sizeof(d));
363   d[0].fd = fd;
364   d[0].lock.l_type = F_RDLCK;
365   d[0].lock.l_len = 1;
366   d[0].lock.l_start = 0;
367   d[0].lock.l_whence = SEEK_SET;
368   d[1] = d[0];
369   d[1].lock.l_type = F_WRLCK;
370   pthread_create(&t[0], 0, threadLockingTest, &d[0]);
371   pthread_create(&t[1], 0, threadLockingTest, &d[1]);
372   pthread_join(t[0], 0);
373   pthread_join(t[1], 0);
374   close(fd);
375   threadsOverrideEachOthersLocks =  d[0].result==0 && d[1].result==0;
376 }
377 #endif /* SQLITE_UNIX_THREADS */
378 
379 /*
380 ** Release a lockInfo structure previously allocated by findLockInfo().
381 */
382 static void releaseLockInfo(struct lockInfo *pLock){
383   pLock->nRef--;
384   if( pLock->nRef==0 ){
385     sqlite3HashInsert(&lockHash, &pLock->key, sizeof(pLock->key), 0);
386     sqliteFree(pLock);
387   }
388 }
389 
390 /*
391 ** Release a openCnt structure previously allocated by findLockInfo().
392 */
393 static void releaseOpenCnt(struct openCnt *pOpen){
394   pOpen->nRef--;
395   if( pOpen->nRef==0 ){
396     sqlite3HashInsert(&openHash, &pOpen->key, sizeof(pOpen->key), 0);
397     sqliteFree(pOpen->aPending);
398     sqliteFree(pOpen);
399   }
400 }
401 
402 /*
403 ** Given a file descriptor, locate lockInfo and openCnt structures that
404 ** describes that file descriptor.  Create a new ones if necessary.  The
405 ** return values might be unset if an error occurs.
406 **
407 ** Return the number of errors.
408 */
409 static int findLockInfo(
410   int fd,                      /* The file descriptor used in the key */
411   struct lockInfo **ppLock,    /* Return the lockInfo structure here */
412   struct openCnt **ppOpen      /* Return the openCnt structure here */
413 ){
414   int rc;
415   struct lockKey key1;
416   struct openKey key2;
417   struct stat statbuf;
418   struct lockInfo *pLock;
419   struct openCnt *pOpen;
420   rc = fstat(fd, &statbuf);
421   if( rc!=0 ) return 1;
422   memset(&key1, 0, sizeof(key1));
423   key1.dev = statbuf.st_dev;
424   key1.ino = statbuf.st_ino;
425 #ifdef SQLITE_UNIX_THREADS
426   if( threadsOverrideEachOthersLocks<0 ){
427     testThreadLockingBehavior(fd);
428   }
429   key1.tid = threadsOverrideEachOthersLocks ? 0 : pthread_self();
430 #endif
431   memset(&key2, 0, sizeof(key2));
432   key2.dev = statbuf.st_dev;
433   key2.ino = statbuf.st_ino;
434   pLock = (struct lockInfo*)sqlite3HashFind(&lockHash, &key1, sizeof(key1));
435   if( pLock==0 ){
436     struct lockInfo *pOld;
437     pLock = sqliteMallocRaw( sizeof(*pLock) );
438     if( pLock==0 ) return 1;
439     pLock->key = key1;
440     pLock->nRef = 1;
441     pLock->cnt = 0;
442     pLock->locktype = 0;
443     pOld = sqlite3HashInsert(&lockHash, &pLock->key, sizeof(key1), pLock);
444     if( pOld!=0 ){
445       assert( pOld==pLock );
446       sqliteFree(pLock);
447       return 1;
448     }
449   }else{
450     pLock->nRef++;
451   }
452   *ppLock = pLock;
453   pOpen = (struct openCnt*)sqlite3HashFind(&openHash, &key2, sizeof(key2));
454   if( pOpen==0 ){
455     struct openCnt *pOld;
456     pOpen = sqliteMallocRaw( sizeof(*pOpen) );
457     if( pOpen==0 ){
458       releaseLockInfo(pLock);
459       return 1;
460     }
461     pOpen->key = key2;
462     pOpen->nRef = 1;
463     pOpen->nLock = 0;
464     pOpen->nPending = 0;
465     pOpen->aPending = 0;
466     pOld = sqlite3HashInsert(&openHash, &pOpen->key, sizeof(key2), pOpen);
467     if( pOld!=0 ){
468       assert( pOld==pOpen );
469       sqliteFree(pOpen);
470       releaseLockInfo(pLock);
471       return 1;
472     }
473   }else{
474     pOpen->nRef++;
475   }
476   *ppOpen = pOpen;
477   return 0;
478 }
479 
480 /*
481 ** Delete the named file
482 */
483 int sqlite3OsDelete(const char *zFilename){
484   unlink(zFilename);
485   return SQLITE_OK;
486 }
487 
/*
** Return TRUE (1) if access() reports that the named file exists and
** FALSE (0) otherwise.
*/
int sqlite3OsFileExists(const char *zFilename){
  int rc = access(zFilename, 0);
  return rc==0;
}
494 
495 /*
496 ** Attempt to open a file for both reading and writing.  If that
497 ** fails, try opening it read-only.  If the file does not exist,
498 ** try to create it.
499 **
500 ** On success, a handle for the open file is written to *id
501 ** and *pReadonly is set to 0 if the file was opened for reading and
502 ** writing or 1 if the file was opened read-only.  The function returns
503 ** SQLITE_OK.
504 **
505 ** On failure, the function returns SQLITE_CANTOPEN and leaves
506 ** *id and *pReadonly unchanged.
507 */
508 int sqlite3OsOpenReadWrite(
509   const char *zFilename,
510   OsFile *id,
511   int *pReadonly
512 ){
513   int rc;
514   assert( !id->isOpen );
515   id->dirfd = -1;
516   SET_THREADID(id);
517   id->h = open(zFilename, O_RDWR|O_CREAT|O_LARGEFILE|O_BINARY,
518                           SQLITE_DEFAULT_FILE_PERMISSIONS);
519   if( id->h<0 ){
520 #ifdef EISDIR
521     if( errno==EISDIR ){
522       return SQLITE_CANTOPEN;
523     }
524 #endif
525     id->h = open(zFilename, O_RDONLY|O_LARGEFILE|O_BINARY);
526     if( id->h<0 ){
527       return SQLITE_CANTOPEN;
528     }
529     *pReadonly = 1;
530   }else{
531     *pReadonly = 0;
532   }
533   sqlite3OsEnterMutex();
534   rc = findLockInfo(id->h, &id->pLock, &id->pOpen);
535   sqlite3OsLeaveMutex();
536   if( rc ){
537     close(id->h);
538     return SQLITE_NOMEM;
539   }
540   id->locktype = 0;
541   id->isOpen = 1;
542   TRACE3("OPEN    %-3d %s\n", id->h, zFilename);
543   OpenCounter(+1);
544   return SQLITE_OK;
545 }
546 
547 
548 /*
549 ** Attempt to open a new file for exclusive access by this process.
550 ** The file will be opened for both reading and writing.  To avoid
551 ** a potential security problem, we do not allow the file to have
552 ** previously existed.  Nor do we allow the file to be a symbolic
553 ** link.
554 **
555 ** If delFlag is true, then make arrangements to automatically delete
556 ** the file when it is closed.
557 **
558 ** On success, write the file handle into *id and return SQLITE_OK.
559 **
560 ** On failure, return SQLITE_CANTOPEN.
561 */
562 int sqlite3OsOpenExclusive(const char *zFilename, OsFile *id, int delFlag){
563   int rc;
564   assert( !id->isOpen );
565   if( access(zFilename, 0)==0 ){
566     return SQLITE_CANTOPEN;
567   }
568   SET_THREADID(id);
569   id->dirfd = -1;
570   id->h = open(zFilename,
571                 O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW|O_LARGEFILE|O_BINARY, 0600);
572   if( id->h<0 ){
573     return SQLITE_CANTOPEN;
574   }
575   sqlite3OsEnterMutex();
576   rc = findLockInfo(id->h, &id->pLock, &id->pOpen);
577   sqlite3OsLeaveMutex();
578   if( rc ){
579     close(id->h);
580     unlink(zFilename);
581     return SQLITE_NOMEM;
582   }
583   id->locktype = 0;
584   id->isOpen = 1;
585   if( delFlag ){
586     unlink(zFilename);
587   }
588   TRACE3("OPEN-EX %-3d %s\n", id->h, zFilename);
589   OpenCounter(+1);
590   return SQLITE_OK;
591 }
592 
593 /*
594 ** Attempt to open a new file for read-only access.
595 **
596 ** On success, write the file handle into *id and return SQLITE_OK.
597 **
598 ** On failure, return SQLITE_CANTOPEN.
599 */
600 int sqlite3OsOpenReadOnly(const char *zFilename, OsFile *id){
601   int rc;
602   assert( !id->isOpen );
603   SET_THREADID(id);
604   id->dirfd = -1;
605   id->h = open(zFilename, O_RDONLY|O_LARGEFILE|O_BINARY);
606   if( id->h<0 ){
607     return SQLITE_CANTOPEN;
608   }
609   sqlite3OsEnterMutex();
610   rc = findLockInfo(id->h, &id->pLock, &id->pOpen);
611   sqlite3OsLeaveMutex();
612   if( rc ){
613     close(id->h);
614     return SQLITE_NOMEM;
615   }
616   id->locktype = 0;
617   id->isOpen = 1;
618   TRACE3("OPEN-RO %-3d %s\n", id->h, zFilename);
619   OpenCounter(+1);
620   return SQLITE_OK;
621 }
622 
623 /*
624 ** Attempt to open a file descriptor for the directory that contains a
625 ** file.  This file descriptor can be used to fsync() the directory
626 ** in order to make sure the creation of a new file is actually written
627 ** to disk.
628 **
629 ** This routine is only meaningful for Unix.  It is a no-op under
630 ** windows since windows does not support hard links.
631 **
632 ** On success, a handle for a previously open file is at *id is
633 ** updated with the new directory file descriptor and SQLITE_OK is
634 ** returned.
635 **
636 ** On failure, the function returns SQLITE_CANTOPEN and leaves
637 ** *id unchanged.
638 */
639 int sqlite3OsOpenDirectory(
640   const char *zDirname,
641   OsFile *id
642 ){
643   if( !id->isOpen ){
644     /* Do not open the directory if the corresponding file is not already
645     ** open. */
646     return SQLITE_CANTOPEN;
647   }
648   SET_THREADID(id);
649   assert( id->dirfd<0 );
650   id->dirfd = open(zDirname, O_RDONLY|O_BINARY, 0);
651   if( id->dirfd<0 ){
652     return SQLITE_CANTOPEN;
653   }
654   TRACE3("OPENDIR %-3d %s\n", id->dirfd, zDirname);
655   return SQLITE_OK;
656 }
657 
/*
** When this global variable points to a string that names a directory,
** that directory is used to store temporary files.  It is 0 (unset) by
** default.
*/
char *sqlite3_temp_directory = 0;
664 
665 /*
666 ** Create a temporary file name in zBuf.  zBuf must be big enough to
667 ** hold at least SQLITE_TEMPNAME_SIZE characters.
668 */
669 int sqlite3OsTempFileName(char *zBuf){
670   static const char *azDirs[] = {
671      0,
672      "/var/tmp",
673      "/usr/tmp",
674      "/tmp",
675      ".",
676   };
677   static const unsigned char zChars[] =
678     "abcdefghijklmnopqrstuvwxyz"
679     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
680     "0123456789";
681   int i, j;
682   struct stat buf;
683   const char *zDir = ".";
684   azDirs[0] = sqlite3_temp_directory;
685   for(i=0; i<sizeof(azDirs)/sizeof(azDirs[0]); i++){
686     if( azDirs[i]==0 ) continue;
687     if( stat(azDirs[i], &buf) ) continue;
688     if( !S_ISDIR(buf.st_mode) ) continue;
689     if( access(azDirs[i], 07) ) continue;
690     zDir = azDirs[i];
691     break;
692   }
693   do{
694     sprintf(zBuf, "%s/"TEMP_FILE_PREFIX, zDir);
695     j = strlen(zBuf);
696     sqlite3Randomness(15, &zBuf[j]);
697     for(i=0; i<15; i++, j++){
698       zBuf[j] = (char)zChars[ ((unsigned char)zBuf[j])%(sizeof(zChars)-1) ];
699     }
700     zBuf[j] = 0;
701   }while( access(zBuf,0)==0 );
702   return SQLITE_OK;
703 }
704 
705 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
/*
** Return 1 if zBuf names an existing directory on which
** access(zBuf, 07) succeeds.  Return 0 otherwise, including when zBuf
** is NULL or an empty string.
*/
int sqlite3OsIsDirWritable(char *zBuf){
  struct stat sStat;
  if( zBuf==0 || zBuf[0]==0 ) return 0;
  if( stat(zBuf, &sStat)!=0 ) return 0;
  return S_ISDIR(sStat.st_mode) && access(zBuf, 07)==0;
}
719 #endif /* SQLITE_OMIT_PAGER_PRAGMAS */
720 
721 /*
722 ** Read data from a file into a buffer.  Return SQLITE_OK if all
723 ** bytes were read successfully and SQLITE_IOERR if anything goes
724 ** wrong.
725 */
726 int sqlite3OsRead(OsFile *id, void *pBuf, int amt){
727   int got;
728   assert( id->isOpen );
729   SimulateIOError(SQLITE_IOERR);
730   TIMER_START;
731   got = read(id->h, pBuf, amt);
732   TIMER_END;
733   TRACE5("READ    %-3d %5d %7d %d\n", id->h, got, last_page, TIMER_ELAPSED);
734   SEEK(0);
735   /* if( got<0 ) got = 0; */
736   if( got==amt ){
737     return SQLITE_OK;
738   }else{
739     return SQLITE_IOERR;
740   }
741 }
742 
743 /*
744 ** Write data from a buffer into a file.  Return SQLITE_OK on success
745 ** or some other error code on failure.
746 */
747 int sqlite3OsWrite(OsFile *id, const void *pBuf, int amt){
748   int wrote = 0;
749   assert( id->isOpen );
750   assert( amt>0 );
751   SimulateIOError(SQLITE_IOERR);
752   SimulateDiskfullError;
753   TIMER_START;
754   while( amt>0 && (wrote = write(id->h, pBuf, amt))>0 ){
755     amt -= wrote;
756     pBuf = &((char*)pBuf)[wrote];
757   }
758   TIMER_END;
759   TRACE5("WRITE   %-3d %5d %7d %d\n", id->h, wrote, last_page, TIMER_ELAPSED);
760   SEEK(0);
761   if( amt>0 ){
762     return SQLITE_FULL;
763   }
764   return SQLITE_OK;
765 }
766 
767 /*
768 ** Move the read/write pointer in a file.
769 */
770 int sqlite3OsSeek(OsFile *id, i64 offset){
771   assert( id->isOpen );
772   SEEK(offset/1024 + 1);
773   lseek(id->h, offset, SEEK_SET);
774   return SQLITE_OK;
775 }
776 
777 #ifdef SQLITE_TEST
/*
** Counters for full syncs and for syncs of any kind.  They are used
** during testing to verify that syncs and fullsyncs are occurring at
** the right times.
*/
int sqlite3_fullsync_count = 0;
int sqlite3_sync_count = 0;
784 #endif
785 
786 
/*
** The fsync() system call does not work as advertised on many
** unix systems.  This procedure is an attempt to make it work better.
**
** If F_FULLFSYNC is available and fullSync is true, fcntl(F_FULLFSYNC)
** is tried first and fsync() is used only if that fails.  In all other
** cases a plain fsync() is done.  The return value is 0 on success and
** non-zero on failure.
**
** The SQLITE_NO_SYNC macro disables all syncing.  This is useful
** for testing when we want to run through the test suite quickly.
** You are strongly advised *not* to deploy with SQLITE_NO_SYNC
** enabled, however, since with SQLITE_NO_SYNC enabled, an OS crash
** or power failure will likely corrupt the database file.
*/
static int full_fsync(int fd, int fullSync){
  int rc;

  /* Under SQLITE_TEST, count every call and every FULLSYNC request so
  ** that tests can verify this procedure gets the right arguments. */
#ifdef SQLITE_TEST
  sqlite3_sync_count++;
  if( fullSync ) sqlite3_fullsync_count++;
#endif

#if defined(SQLITE_NO_SYNC)
  /* Syncing has been compiled out */
  rc = SQLITE_OK;
#elif defined(F_FULLFSYNC)
  /* rc==1 when no FULLSYNC is wanted, which selects the plain fsync() */
  rc = fullSync ? fcntl(fd, F_FULLFSYNC, 0) : 1;
  if( rc ) rc = fsync(fd);
#else
  rc = fsync(fd);
#endif

  return rc;
}
833 
834 /*
835 ** Make sure all writes to a particular file are committed to disk.
836 **
837 ** Under Unix, also make sure that the directory entry for the file
838 ** has been created by fsync-ing the directory that contains the file.
839 ** If we do not do this and we encounter a power failure, the directory
840 ** entry for the journal might not exist after we reboot.  The next
841 ** SQLite to access the file will not know that the journal exists (because
842 ** the directory entry for the journal was never created) and the transaction
843 ** will not roll back - possibly leading to database corruption.
844 */
845 int sqlite3OsSync(OsFile *id){
846   assert( id->isOpen );
847   SimulateIOError(SQLITE_IOERR);
848   TRACE2("SYNC    %-3d\n", id->h);
849   if( full_fsync(id->h, id->fullSync) ){
850     return SQLITE_IOERR;
851   }
852   if( id->dirfd>=0 ){
853     TRACE2("DIRSYNC %-3d\n", id->dirfd);
854     full_fsync(id->dirfd, id->fullSync);
855     close(id->dirfd);  /* Only need to sync once, so close the directory */
856     id->dirfd = -1;    /* when we are done. */
857   }
858   return SQLITE_OK;
859 }
860 
861 /*
862 ** Sync the directory zDirname. This is a no-op on operating systems other
863 ** than UNIX.
864 **
865 ** This is used to make sure the master journal file has truely been deleted
866 ** before making changes to individual journals on a multi-database commit.
867 ** The F_FULLFSYNC option is not needed here.
868 */
869 int sqlite3OsSyncDirectory(const char *zDirname){
870   int fd;
871   int r;
872   SimulateIOError(SQLITE_IOERR);
873   fd = open(zDirname, O_RDONLY|O_BINARY, 0);
874   TRACE3("DIRSYNC %-3d (%s)\n", fd, zDirname);
875   if( fd<0 ){
876     return SQLITE_CANTOPEN;
877   }
878   r = fsync(fd);
879   close(fd);
880   return ((r==0)?SQLITE_OK:SQLITE_IOERR);
881 }
882 
883 /*
884 ** Truncate an open file to a specified size
885 */
886 int sqlite3OsTruncate(OsFile *id, i64 nByte){
887   assert( id->isOpen );
888   SimulateIOError(SQLITE_IOERR);
889   return ftruncate(id->h, nByte)==0 ? SQLITE_OK : SQLITE_IOERR;
890 }
891 
892 /*
893 ** Determine the current size of a file in bytes
894 */
895 int sqlite3OsFileSize(OsFile *id, i64 *pSize){
896   struct stat buf;
897   assert( id->isOpen );
898   SimulateIOError(SQLITE_IOERR);
899   if( fstat(id->h, &buf)!=0 ){
900     return SQLITE_IOERR;
901   }
902   *pSize = buf.st_size;
903   return SQLITE_OK;
904 }
905 
906 /*
907 ** This routine checks if there is a RESERVED lock held on the specified
908 ** file by this or any other process. If such a lock is held, return
909 ** non-zero.  If the file is unlocked or holds only SHARED locks, then
910 ** return zero.
911 */
912 int sqlite3OsCheckReservedLock(OsFile *id){
913   int r = 0;
914 
915   assert( id->isOpen );
916   if( CHECK_THREADID(id) ) return SQLITE_MISUSE;
917   sqlite3OsEnterMutex(); /* Needed because id->pLock is shared across threads */
918 
919   /* Check if a thread in this process holds such a lock */
920   if( id->pLock->locktype>SHARED_LOCK ){
921     r = 1;
922   }
923 
924   /* Otherwise see if some other process holds it.
925   */
926   if( !r ){
927     struct flock lock;
928     lock.l_whence = SEEK_SET;
929     lock.l_start = RESERVED_BYTE;
930     lock.l_len = 1;
931     lock.l_type = F_WRLCK;
932     fcntl(id->h, F_GETLK, &lock);
933     if( lock.l_type!=F_UNLCK ){
934       r = 1;
935     }
936   }
937 
938   sqlite3OsLeaveMutex();
939   TRACE3("TEST WR-LOCK %d %d\n", id->h, r);
940 
941   return r;
942 }
943 
#ifdef SQLITE_DEBUG
/*
** Debug-only helper used when printing trace output: map an integer
** lock-type to a human readable name.  Any value that is not one of
** the five known lock types yields the string "ERROR".
*/
static const char * locktypeName(int locktype){
  if( locktype==NO_LOCK )        return "NONE";
  if( locktype==SHARED_LOCK )    return "SHARED";
  if( locktype==RESERVED_LOCK )  return "RESERVED";
  if( locktype==PENDING_LOCK )   return "PENDING";
  if( locktype==EXCLUSIVE_LOCK ) return "EXCLUSIVE";
  return "ERROR";
}
#endif
961 
962 /*
963 ** Lock the file with the lock specified by parameter locktype - one
964 ** of the following:
965 **
966 **     (1) SHARED_LOCK
967 **     (2) RESERVED_LOCK
968 **     (3) PENDING_LOCK
969 **     (4) EXCLUSIVE_LOCK
970 **
971 ** Sometimes when requesting one lock state, additional lock states
972 ** are inserted in between.  The locking might fail on one of the later
973 ** transitions leaving the lock state different from what it started but
974 ** still short of its goal.  The following chart shows the allowed
975 ** transitions and the inserted intermediate states:
976 **
977 **    UNLOCKED -> SHARED
978 **    SHARED -> RESERVED
979 **    SHARED -> (PENDING) -> EXCLUSIVE
980 **    RESERVED -> (PENDING) -> EXCLUSIVE
981 **    PENDING -> EXCLUSIVE
982 **
983 ** This routine will only increase a lock.  Use the sqlite3OsUnlock()
984 ** routine to lower a locking level.
985 */
986 int sqlite3OsLock(OsFile *id, int locktype){
987   /* The following describes the implementation of the various locks and
988   ** lock transitions in terms of the POSIX advisory shared and exclusive
989   ** lock primitives (called read-locks and write-locks below, to avoid
990   ** confusion with SQLite lock names). The algorithms are complicated
991   ** slightly in order to be compatible with windows systems simultaneously
992   ** accessing the same database file, in case that is ever required.
993   **
994   ** Symbols defined in os.h indentify the 'pending byte' and the 'reserved
995   ** byte', each single bytes at well known offsets, and the 'shared byte
996   ** range', a range of 510 bytes at a well known offset.
997   **
998   ** To obtain a SHARED lock, a read-lock is obtained on the 'pending
999   ** byte'.  If this is successful, a random byte from the 'shared byte
1000   ** range' is read-locked and the lock on the 'pending byte' released.
1001   **
1002   ** A process may only obtain a RESERVED lock after it has a SHARED lock.
1003   ** A RESERVED lock is implemented by grabbing a write-lock on the
1004   ** 'reserved byte'.
1005   **
1006   ** A process may only obtain a PENDING lock after it has obtained a
1007   ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
1008   ** on the 'pending byte'. This ensures that no new SHARED locks can be
1009   ** obtained, but existing SHARED locks are allowed to persist. A process
1010   ** does not have to obtain a RESERVED lock on the way to a PENDING lock.
1011   ** This property is used by the algorithm for rolling back a journal file
1012   ** after a crash.
1013   **
1014   ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
1015   ** implemented by obtaining a write-lock on the entire 'shared byte
1016   ** range'. Since all other locks require a read-lock on one of the bytes
1017   ** within this range, this ensures that no other locks are held on the
1018   ** database.
1019   **
1020   ** The reason a single byte cannot be used instead of the 'shared byte
1021   ** range' is that some versions of windows do not support read-locks. By
1022   ** locking a random byte from a range, concurrent SHARED locks may exist
1023   ** even if the locking primitive used is always a write-lock.
1024   */
1025   int rc = SQLITE_OK;
1026   struct lockInfo *pLock = id->pLock;
1027   struct flock lock;
1028   int s;
1029 
1030   assert( id->isOpen );
1031   TRACE7("LOCK    %d %s was %s(%s,%d) pid=%d\n", id->h, locktypeName(locktype),
1032       locktypeName(id->locktype), locktypeName(pLock->locktype), pLock->cnt
1033       ,getpid() );
1034   if( CHECK_THREADID(id) ) return SQLITE_MISUSE;
1035 
1036   /* If there is already a lock of this type or more restrictive on the
1037   ** OsFile, do nothing. Don't use the end_lock: exit path, as
1038   ** sqlite3OsEnterMutex() hasn't been called yet.
1039   */
1040   if( id->locktype>=locktype ){
1041     TRACE3("LOCK    %d %s ok (already held)\n", id->h, locktypeName(locktype));
1042     return SQLITE_OK;
1043   }
1044 
1045   /* Make sure the locking sequence is correct
1046   */
1047   assert( id->locktype!=NO_LOCK || locktype==SHARED_LOCK );
1048   assert( locktype!=PENDING_LOCK );
1049   assert( locktype!=RESERVED_LOCK || id->locktype==SHARED_LOCK );
1050 
1051   /* This mutex is needed because id->pLock is shared across threads
1052   */
1053   sqlite3OsEnterMutex();
1054 
1055   /* If some thread using this PID has a lock via a different OsFile*
1056   ** handle that precludes the requested lock, return BUSY.
1057   */
1058   if( (id->locktype!=pLock->locktype &&
1059           (pLock->locktype>=PENDING_LOCK || locktype>SHARED_LOCK))
1060   ){
1061     rc = SQLITE_BUSY;
1062     goto end_lock;
1063   }
1064 
1065   /* If a SHARED lock is requested, and some thread using this PID already
1066   ** has a SHARED or RESERVED lock, then increment reference counts and
1067   ** return SQLITE_OK.
1068   */
1069   if( locktype==SHARED_LOCK &&
1070       (pLock->locktype==SHARED_LOCK || pLock->locktype==RESERVED_LOCK) ){
1071     assert( locktype==SHARED_LOCK );
1072     assert( id->locktype==0 );
1073     assert( pLock->cnt>0 );
1074     id->locktype = SHARED_LOCK;
1075     pLock->cnt++;
1076     id->pOpen->nLock++;
1077     goto end_lock;
1078   }
1079 
1080   lock.l_len = 1L;
1081 
1082   lock.l_whence = SEEK_SET;
1083 
1084   /* A PENDING lock is needed before acquiring a SHARED lock and before
1085   ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
1086   ** be released.
1087   */
1088   if( locktype==SHARED_LOCK
1089       || (locktype==EXCLUSIVE_LOCK && id->locktype<PENDING_LOCK)
1090   ){
1091     lock.l_type = (locktype==SHARED_LOCK?F_RDLCK:F_WRLCK);
1092     lock.l_start = PENDING_BYTE;
1093     s = fcntl(id->h, F_SETLK, &lock);
1094     if( s ){
1095       rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
1096       goto end_lock;
1097     }
1098   }
1099 
1100 
1101   /* If control gets to this point, then actually go ahead and make
1102   ** operating system calls for the specified lock.
1103   */
1104   if( locktype==SHARED_LOCK ){
1105     assert( pLock->cnt==0 );
1106     assert( pLock->locktype==0 );
1107 
1108     /* Now get the read-lock */
1109     lock.l_start = SHARED_FIRST;
1110     lock.l_len = SHARED_SIZE;
1111     s = fcntl(id->h, F_SETLK, &lock);
1112 
1113     /* Drop the temporary PENDING lock */
1114     lock.l_start = PENDING_BYTE;
1115     lock.l_len = 1L;
1116     lock.l_type = F_UNLCK;
1117     if( fcntl(id->h, F_SETLK, &lock)!=0 ){
1118       rc = SQLITE_IOERR;  /* This should never happen */
1119       goto end_lock;
1120     }
1121     if( s ){
1122       rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
1123     }else{
1124       id->locktype = SHARED_LOCK;
1125       id->pOpen->nLock++;
1126       pLock->cnt = 1;
1127     }
1128   }else if( locktype==EXCLUSIVE_LOCK && pLock->cnt>1 ){
1129     /* We are trying for an exclusive lock but another thread in this
1130     ** same process is still holding a shared lock. */
1131     rc = SQLITE_BUSY;
1132   }else{
1133     /* The request was for a RESERVED or EXCLUSIVE lock.  It is
1134     ** assumed that there is a SHARED or greater lock on the file
1135     ** already.
1136     */
1137     assert( 0!=id->locktype );
1138     lock.l_type = F_WRLCK;
1139     switch( locktype ){
1140       case RESERVED_LOCK:
1141         lock.l_start = RESERVED_BYTE;
1142         break;
1143       case EXCLUSIVE_LOCK:
1144         lock.l_start = SHARED_FIRST;
1145         lock.l_len = SHARED_SIZE;
1146         break;
1147       default:
1148         assert(0);
1149     }
1150     s = fcntl(id->h, F_SETLK, &lock);
1151     if( s ){
1152       rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
1153     }
1154   }
1155 
1156   if( rc==SQLITE_OK ){
1157     id->locktype = locktype;
1158     pLock->locktype = locktype;
1159   }else if( locktype==EXCLUSIVE_LOCK ){
1160     id->locktype = PENDING_LOCK;
1161     pLock->locktype = PENDING_LOCK;
1162   }
1163 
1164 end_lock:
1165   sqlite3OsLeaveMutex();
1166   TRACE4("LOCK    %d %s %s\n", id->h, locktypeName(locktype),
1167       rc==SQLITE_OK ? "ok" : "failed");
1168   return rc;
1169 }
1170 
1171 /*
1172 ** Lower the locking level on file descriptor id to locktype.  locktype
1173 ** must be either NO_LOCK or SHARED_LOCK.
1174 **
1175 ** If the locking level of the file descriptor is already at or below
1176 ** the requested locking level, this routine is a no-op.
1177 **
1178 ** It is not possible for this routine to fail if the second argument
1179 ** is NO_LOCK.  If the second argument is SHARED_LOCK, this routine
1180 ** might return SQLITE_IOERR instead of SQLITE_OK.
1181 */
1182 int sqlite3OsUnlock(OsFile *id, int locktype){
1183   struct lockInfo *pLock;
1184   struct flock lock;
1185   int rc = SQLITE_OK;
1186 
1187   assert( id->isOpen );
1188   TRACE7("UNLOCK  %d %d was %d(%d,%d) pid=%d\n", id->h, locktype, id->locktype,
1189       id->pLock->locktype, id->pLock->cnt, getpid());
1190   if( CHECK_THREADID(id) ) return SQLITE_MISUSE;
1191 
1192   assert( locktype<=SHARED_LOCK );
1193   if( id->locktype<=locktype ){
1194     return SQLITE_OK;
1195   }
1196   sqlite3OsEnterMutex();
1197   pLock = id->pLock;
1198   assert( pLock->cnt!=0 );
1199   if( id->locktype>SHARED_LOCK ){
1200     assert( pLock->locktype==id->locktype );
1201     if( locktype==SHARED_LOCK ){
1202       lock.l_type = F_RDLCK;
1203       lock.l_whence = SEEK_SET;
1204       lock.l_start = SHARED_FIRST;
1205       lock.l_len = SHARED_SIZE;
1206       if( fcntl(id->h, F_SETLK, &lock)!=0 ){
1207         /* This should never happen */
1208         rc = SQLITE_IOERR;
1209       }
1210     }
1211     lock.l_type = F_UNLCK;
1212     lock.l_whence = SEEK_SET;
1213     lock.l_start = PENDING_BYTE;
1214     lock.l_len = 2L;  assert( PENDING_BYTE+1==RESERVED_BYTE );
1215     if( fcntl(id->h, F_SETLK, &lock)==0 ){
1216       pLock->locktype = SHARED_LOCK;
1217     }else{
1218       rc = SQLITE_IOERR;  /* This should never happen */
1219     }
1220   }
1221   if( locktype==NO_LOCK ){
1222     struct openCnt *pOpen;
1223 
1224     /* Decrement the shared lock counter.  Release the lock using an
1225     ** OS call only when all threads in this same process have released
1226     ** the lock.
1227     */
1228     pLock->cnt--;
1229     if( pLock->cnt==0 ){
1230       lock.l_type = F_UNLCK;
1231       lock.l_whence = SEEK_SET;
1232       lock.l_start = lock.l_len = 0L;
1233       if( fcntl(id->h, F_SETLK, &lock)==0 ){
1234         pLock->locktype = NO_LOCK;
1235       }else{
1236         rc = SQLITE_IOERR;  /* This should never happen */
1237       }
1238     }
1239 
1240     /* Decrement the count of locks against this same file.  When the
1241     ** count reaches zero, close any other file descriptors whose close
1242     ** was deferred because of outstanding locks.
1243     */
1244     pOpen = id->pOpen;
1245     pOpen->nLock--;
1246     assert( pOpen->nLock>=0 );
1247     if( pOpen->nLock==0 && pOpen->nPending>0 ){
1248       int i;
1249       for(i=0; i<pOpen->nPending; i++){
1250         close(pOpen->aPending[i]);
1251       }
1252       sqliteFree(pOpen->aPending);
1253       pOpen->nPending = 0;
1254       pOpen->aPending = 0;
1255     }
1256   }
1257   sqlite3OsLeaveMutex();
1258   id->locktype = locktype;
1259   return rc;
1260 }
1261 
1262 /*
1263 ** Close a file.
1264 */
1265 int sqlite3OsClose(OsFile *id){
1266   if( !id->isOpen ) return SQLITE_OK;
1267   if( CHECK_THREADID(id) ) return SQLITE_MISUSE;
1268   sqlite3OsUnlock(id, NO_LOCK);
1269   if( id->dirfd>=0 ) close(id->dirfd);
1270   id->dirfd = -1;
1271   sqlite3OsEnterMutex();
1272   if( id->pOpen->nLock ){
1273     /* If there are outstanding locks, do not actually close the file just
1274     ** yet because that would clear those locks.  Instead, add the file
1275     ** descriptor to pOpen->aPending.  It will be automatically closed when
1276     ** the last lock is cleared.
1277     */
1278     int *aNew;
1279     struct openCnt *pOpen = id->pOpen;
1280     pOpen->nPending++;
1281     aNew = sqliteRealloc( pOpen->aPending, pOpen->nPending*sizeof(int) );
1282     if( aNew==0 ){
1283       /* If a malloc fails, just leak the file descriptor */
1284     }else{
1285       pOpen->aPending = aNew;
1286       pOpen->aPending[pOpen->nPending-1] = id->h;
1287     }
1288   }else{
1289     /* There are no outstanding locks so we can close the file immediately */
1290     close(id->h);
1291   }
1292   releaseLockInfo(id->pLock);
1293   releaseOpenCnt(id->pOpen);
1294   sqlite3OsLeaveMutex();
1295   id->isOpen = 0;
1296   TRACE2("CLOSE   %-3d\n", id->h);
1297   OpenCounter(-1);
1298   return SQLITE_OK;
1299 }
1300 
/*
** Convert a pathname into a full pathname.  The result is built with
** sqlite3SetString() and returned to the caller, who is responsible
** for freeing it once it is no longer needed.
**
** A name that already begins with '/' is copied unchanged.  Any other
** name is appended to the current working directory with a '/' between
** the two.
*/
char *sqlite3OsFullPathname(const char *zRelative){
  char *zFull = 0;
  if( zRelative[0]!='/' ){
    char zCwd[5000];
    char *zDir;
    zCwd[0] = 0;
    /* NOTE(review): getcwd() can return NULL; that value is handed
    ** straight to sqlite3SetString() - confirm how it treats it. */
    zDir = getcwd(zCwd, sizeof(zCwd));
    sqlite3SetString(&zFull, zDir, "/", zRelative, (char*)0);
  }else{
    sqlite3SetString(&zFull, zRelative, (char*)0);
  }
  return zFull;
}
1319 
1320 
1321 #endif /* SQLITE_OMIT_DISKIO */
/***************************************************************************
** Everything above deals with file I/O.  Everything that follows deals
** with other miscellaneous aspects of the operating system interface
****************************************************************************/
1326 
1327 
1328 /*
1329 ** Get information to seed the random number generator.  The seed
1330 ** is written into the buffer zBuf[256].  The calling function must
1331 ** supply a sufficiently large buffer.
1332 */
1333 int sqlite3OsRandomSeed(char *zBuf){
1334   /* We have to initialize zBuf to prevent valgrind from reporting
1335   ** errors.  The reports issued by valgrind are incorrect - we would
1336   ** prefer that the randomness be increased by making use of the
1337   ** uninitialized space in zBuf - but valgrind errors tend to worry
1338   ** some users.  Rather than argue, it seems easier just to initialize
1339   ** the whole array and silence valgrind, even if that means less randomness
1340   ** in the random seed.
1341   **
1342   ** When testing, initializing zBuf[] to zero is all we do.  That means
1343   ** that we always use the same random number sequence.* This makes the
1344   ** tests repeatable.
1345   */
1346   memset(zBuf, 0, 256);
1347 #if !defined(SQLITE_TEST)
1348   {
1349     int pid, fd;
1350     fd = open("/dev/urandom", O_RDONLY);
1351     if( fd<0 ){
1352       time((time_t*)zBuf);
1353       pid = getpid();
1354       memcpy(&zBuf[sizeof(time_t)], &pid, sizeof(pid));
1355     }else{
1356       read(fd, zBuf, 256);
1357       close(fd);
1358     }
1359   }
1360 #endif
1361   return SQLITE_OK;
1362 }
1363 
/*
** Pause for roughly ms milliseconds and return the number of
** milliseconds requested from the underlying sleep call.
**
** When HAVE_USLEEP is defined and non-zero, usleep() is used and the
** return value is ms itself.  Otherwise sleep() is used, the request is
** rounded up to whole seconds, and the rounded figure (in milliseconds)
** is returned.
*/
int sqlite3OsSleep(int ms){
#if defined(HAVE_USLEEP) && HAVE_USLEEP
  int nMicro = ms*1000;
  usleep(nMicro);
  return ms;
#else
  int nSec = (ms+999)/1000;
  sleep(nSec);
  return 1000*nSec;
#endif
}
1376 
/*
** State used for thread synchronization.  The pthread mutex exists only
** in SQLITE_UNIX_THREADS builds; inMutex is a flag that is 1 between a
** call to sqlite3OsEnterMutex() and the matching sqlite3OsLeaveMutex()
** and is used by the assert() checks in those two routines.
*/
#ifdef SQLITE_UNIX_THREADS
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
static int inMutex = 0;

/*
** These two routines provide mutual exclusion for multi-threaded
** processes: only one thread at a time may run the code bracketed by
** EnterMutex() and LeaveMutex().
**
** SQLite uses just this one mutex.  There is little critical code, and
** what there is runs quickly and without blocking.
*/
void sqlite3OsEnterMutex(){
#ifdef SQLITE_UNIX_THREADS
  pthread_mutex_lock(&mutex);
#endif
  /* The mutex must not already be marked as held on entry. */
  assert( inMutex==0 );
  inMutex = 1;
}
void sqlite3OsLeaveMutex(){
  /* Clear the flag before the lock itself is released. */
  assert( inMutex!=0 );
  inMutex = 0;
#ifdef SQLITE_UNIX_THREADS
  pthread_mutex_unlock(&mutex);
#endif
}
1407 
/*
** The following variable, if set to a non-zero value, becomes the result
** returned from sqlite3OsCurrentTime().  This is used for testing.
*/
#ifdef SQLITE_TEST
int sqlite3_current_time = 0;
#endif

/*
** Find the current time (in Universal Coordinated Time).  Write the
** current time and date as a Julian Day number into *prNow and
** return 0.  Return 1 if the time and date cannot be found, in which
** case *prNow is left unchanged.
**
** In SQLITE_TEST builds a non-zero sqlite3_current_time (seconds since
** the Unix epoch) is used in place of the system clock.
*/
int sqlite3OsCurrentTime(double *prNow){
  time_t t;
#ifdef SQLITE_TEST
  if( sqlite3_current_time ){
    *prNow = sqlite3_current_time/86400.0 + 2440587.5;
    return 0;
  }
#endif
  /* time() reports failure by returning (time_t)-1. */
  if( time(&t)==(time_t)-1 ){
    return 1;
  }
  /* 86400 seconds per day; 2440587.5 is the Julian Day number of the
  ** Unix epoch (1970-01-01 00:00:00 UTC). */
  *prNow = t/86400.0 + 2440587.5;
  return 0;
}
1432 
1433 #endif /* OS_UNIX */
1434