blob: 0ca78c5738f45d61db361b819ea73a70f957f602 [file] [log] [blame]
danielk1977a3f06592009-04-23 14:58:39 +00001/*
2** 2005 December 14
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
13** $Id: sqlite3async.c,v 1.1 2009/04/23 14:58:40 danielk1977 Exp $
14**
15** This file contains an example implementation of an asynchronous IO
16** backend for SQLite.
17**
18** WHAT IS ASYNCHRONOUS I/O?
19**
20** With asynchronous I/O, write requests are handled by a separate thread
21** running in the background. This means that the thread that initiates
22** a database write does not have to wait for (sometimes slow) disk I/O
23** to occur. The write seems to happen very quickly, though in reality
24** it is happening at its usual slow pace in the background.
25**
26** Asynchronous I/O appears to give better responsiveness, but at a price.
27** You lose the Durable property. With the default I/O backend of SQLite,
28** once a write completes, you know that the information you wrote is
29** safely on disk. With the asynchronous I/O, this is not the case. If
30** your program crashes or if a power loss occurs after the database
31** write but before the asynchronous write thread has completed, then the
32** database change might never make it to disk and the next user of the
33** database might not see your change.
34**
35** You lose Durability with asynchronous I/O, but you still retain the
** other parts of ACID: Atomic, Consistent, and Isolated. Many
** applications get along fine without the Durability.
38**
39** HOW IT WORKS
40**
41** Asynchronous I/O works by creating a special SQLite "vfs" structure
42** and registering it with sqlite3_vfs_register(). When files opened via
43** this vfs are written to (using sqlite3OsWrite()), the data is not
44** written directly to disk, but is placed in the "write-queue" to be
45** handled by the background thread.
46**
47** When files opened with the asynchronous vfs are read from
48** (using sqlite3OsRead()), the data is read from the file on
49** disk and the write-queue, so that from the point of view of
50** the vfs reader the OsWrite() appears to have already completed.
51**
52** The special vfs is registered (and unregistered) by calls to
53** function asyncEnable() (see below).
54**
55** LIMITATIONS
56**
57** This demonstration code is deliberately kept simple in order to keep
58** the main ideas clear and easy to understand. Real applications that
59** want to do asynchronous I/O might want to add additional capabilities.
60** For example, in this demonstration if writes are happening at a steady
61** stream that exceeds the I/O capability of the background writer thread,
62** the queue of pending write operations will grow without bound until we
63** run out of memory. Users of this technique may want to keep track of
64** the quantity of pending writes and stop accepting new write requests
65** when the buffer gets to be too big.
66**
67** LOCKING + CONCURRENCY
68**
69** Multiple connections from within a single process that use this
70** implementation of asynchronous IO may access a single database
71** file concurrently. From the point of view of the user, if all
72** connections are from within a single process, there is no difference
73** between the concurrency offered by "normal" SQLite and SQLite
74** using the asynchronous backend.
75**
76** If connections from within multiple processes may access the
77** database file, the ENABLE_FILE_LOCKING symbol (see below) must be
78** defined. If it is not defined, then no locks are established on
79** the database file. In this case, if multiple processes access
80** the database file, corruption will quickly result.
81**
82** If ENABLE_FILE_LOCKING is defined (the default), then connections
83** from within multiple processes may access a single database file
84** without risking corruption. However concurrency is reduced as
85** follows:
86**
87** * When a connection using asynchronous IO begins a database
88** transaction, the database is locked immediately. However the
89** lock is not released until after all relevant operations
90** in the write-queue have been flushed to disk. This means
91** (for example) that the database may remain locked for some
92** time after a "COMMIT" or "ROLLBACK" is issued.
93**
94** * If an application using asynchronous IO executes transactions
95** in quick succession, other database users may be effectively
96** locked out of the database. This is because when a BEGIN
97** is executed, a database lock is established immediately. But
98** when the corresponding COMMIT or ROLLBACK occurs, the lock
99** is not released until the relevant part of the write-queue
100** has been flushed through. As a result, if a COMMIT is followed
101** by a BEGIN before the write-queue is flushed through, the database
** is never unlocked, preventing other processes from accessing
103** the database.
104**
105** Defining ENABLE_FILE_LOCKING when using an NFS or other remote
106** file-system may slow things down, as synchronous round-trips to the
107** server may be required to establish database file locks.
108*/
109
110#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
111
112#include "sqlite3async.h"
113
114#define ENABLE_FILE_LOCKING
115
116#ifndef SQLITE_AMALGAMATION
117# include "sqliteInt.h"
118# include <assert.h>
119# include <string.h>
120#endif
121
122/* Useful macros used in several places */
123#define MIN(x,y) ((x)<(y)?(x):(y))
124#define MAX(x,y) ((x)>(y)?(x):(y))
125
126/* Forward references */
127typedef struct AsyncWrite AsyncWrite;
128typedef struct AsyncFile AsyncFile;
129typedef struct AsyncFileData AsyncFileData;
130typedef struct AsyncFileLock AsyncFileLock;
131typedef struct AsyncLock AsyncLock;
132
/*
** Debug tracing. Set sqlite3async_trace to a non-zero value to have the
** ASYNC_TRACE() messages written to stderr.
**
** ASYNC_TRACE() takes a parenthesised printf-style argument list, e.g.
**
**     ASYNC_TRACE(("PUSH %p\n", p));
**
** The macro is wrapped in do{...}while(0) so that it always behaves as a
** single statement, even when used as the body of an if/else.
*/
static int sqlite3async_trace = 0;
# define ASYNC_TRACE(X) do{ if( sqlite3async_trace ) asyncTrace X; }while(0)
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  /* sqlite3_vmprintf() returns NULL if memory cannot be allocated. Passing
  ** NULL for a %s conversion is undefined behaviour, so in that case the
  ** trace message is silently dropped. */
  if( z ){
    fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
    sqlite3_free(z);
  }
}
145
146/*
147** THREAD SAFETY NOTES
148**
149** Basic rules:
150**
151** * Both read and write access to the global write-op queue must be
152** protected by the async.queueMutex. As are the async.ioError and
153** async.nFile variables.
154**
155** * The async.pLock list and all AsyncLock and AsyncFileLock
156** structures must be protected by the async.lockMutex mutex.
157**
158** * The file handles from the underlying system are not assumed to
159** be thread safe.
160**
161** * See the last two paragraphs under "The Writer Thread" for
162** an assumption to do with file-handle synchronization by the Os.
163**
164** Deadlock prevention:
165**
** There are three mutexes used by the system: the "writer" mutex,
** the "queue" mutex and the "lock" mutex. Rules are:
**
** * It is illegal to block on the writer mutex when any other mutex
** is held, and
**
** * It is illegal to block on the queue mutex when the lock mutex
** is held.
**
** i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
176**
177** File system operations (invoked by SQLite thread):
178**
179** xOpen
180** xDelete
181** xFileExists
182**
183** File handle operations (invoked by SQLite thread):
184**
185** asyncWrite, asyncClose, asyncTruncate, asyncSync
186**
187** The operations above add an entry to the global write-op list. They
188** prepare the entry, acquire the async.queueMutex momentarily while
189** list pointers are manipulated to insert the new entry, then release
190** the mutex and signal the writer thread to wake up in case it happens
191** to be asleep.
192**
193**
194** asyncRead, asyncFileSize.
195**
** Read operations. Both of these first read from the underlying file,
** then adjust their result based on pending writes in the
** write-op queue. So async.queueMutex is held for the duration
** of these operations to prevent other threads from changing the
** queue in mid operation.
201**
202**
203** asyncLock, asyncUnlock, asyncCheckReservedLock
204**
205** These primitives implement in-process locking using a hash table
206** on the file name. Files are locked correctly for connections coming
207** from the same process. But other processes cannot see these locks
208** and will therefore not honor them.
209**
210**
211** The writer thread:
212**
** The async.writerMutex is used to make sure there is only
** a single writer thread running at a time.
215**
216** Inside the writer thread is a loop that works like this:
217**
218** WHILE (write-op list is not empty)
219** Do IO operation at head of write-op list
220** Remove entry from head of write-op list
221** END WHILE
222**
223** The async.queueMutex is always held during the <write-op list is
224** not empty> test, and when the entry is removed from the head
225** of the write-op list. Sometimes it is held for the interim
226** period (while the IO is performed), and sometimes it is
227** relinquished. It is relinquished if (a) the IO op is an
228** ASYNC_CLOSE or (b) when the file handle was opened, two of
229** the underlying systems handles were opened on the same
230** file-system entry.
231**
232** If condition (b) above is true, then one file-handle
233** (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
234** file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
235** threads to perform write() operations. This means that read
236** operations are not blocked by asynchronous writes (although
237** asynchronous writes may still be blocked by reads).
238**
239** This assumes that the OS keeps two handles open on the same file
240** properly in sync. That is, any read operation that starts after a
241** write operation on the same file system entry has completed returns
242** data consistent with the write. We also assume that if one thread
243** reads a file while another is writing it all bytes other than the
244** ones actually being written contain valid data.
245**
246** If the above assumptions are not true, set the preprocessor symbol
247** SQLITE_ASYNC_TWO_FILEHANDLES to 0.
248*/
249
250
251#ifndef NDEBUG
252# define TESTONLY( X ) X
253#else
254# define TESTONLY( X )
255#endif
256
257/*
258** There are two definitions of the following functions. One for pthreads
259** compatible systems and one for Win32. These functions isolate the OS
260** specific code required by each platform.
261**
262** The system uses three mutexes and a single condition variable. To
263** block on a mutex, async_mutex_enter() is called. The parameter passed
264** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
265** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
266** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
267** called with a parameter identifying the mutex being unlocked. Mutexes
268** are not recursive - it is an error to call async_mutex_enter() to
269** lock a mutex that is already locked, or to call async_mutex_leave()
270** to unlock a mutex that is not currently locked.
271**
272** The async_cond_wait() and async_cond_signal() functions are modelled
273** on the pthreads functions with similar names. The first parameter to
274** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
275** is called the mutex identified by the second parameter must be held.
276** The mutex is unlocked, and the calling thread simultaneously begins
277** waiting for the condition variable to be signalled by another thread.
278** After another thread signals the condition variable, the calling
279** thread stops waiting, locks mutex eMutex and returns. The
280** async_cond_signal() function is used to signal the condition variable.
281** It is assumed that the mutex used by the thread calling async_cond_wait()
282** is held by the caller of async_cond_signal() (otherwise there would be
283** a race condition).
284**
285** It is guaranteed that no other thread will call async_cond_wait() when
286** there is already a thread waiting on the condition variable.
287**
** The async_sched_yield() function is called to suggest to the operating
** system that it would be a good time to shift the current thread off the
** CPU. The system will still work if a platform implements this function
** as a no-op, but it might be marginally more efficient if it really
** yields.
293*/
294static void async_mutex_enter(int eMutex);
295static void async_mutex_leave(int eMutex);
296static void async_cond_wait(int eCond, int eMutex);
297static void async_cond_signal(int eCond);
298static void async_sched_yield(void);
299
300/*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
304**
305** For pthreads builds, both of these functions are no-ops. For win32,
306** they provide an opportunity to initialize and finalize the required
307** mutex and condition variables.
308**
309** If async_os_initialize() returns other than zero, then the initialization
310** fails and SQLITE_ERROR is returned to the user.
311*/
312static int async_os_initialize(void);
313static void async_os_shutdown(void);
314
315/* Values for use as the 'eMutex' argument of the above functions. The
316** integer values assigned to these constants are important for assert()
317** statements that verify that mutexes are locked in the correct order.
318** Specifically, it is unsafe to try to lock mutex N while holding a lock
319** on mutex M if (M<=N).
320*/
321#define ASYNC_MUTEX_LOCK 0
322#define ASYNC_MUTEX_QUEUE 1
323#define ASYNC_MUTEX_WRITER 2
324
325/* Values for use as the 'eCond' argument of the above functions. */
326#define ASYNC_COND_QUEUE 0
327
328/*************************************************************************
329** Start of OS specific code.
330*/
331#if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
332
333/* The following block contains the win32 specific code. */
334
335#define mutex_held(X) (GetCurrentThreadId()==primitives.aHolder[X])
336
337static struct AsyncPrimitives {
338 int isInit;
339 DWORD aHolder[3];
340 CRITICAL_SECTION aMutex[3];
341 HANDLE aCond[1];
342} primitives = { 0 };
343
344static int async_os_initialize(void){
345 if( !primitives.isInit ){
346 primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
347 if( primitives.aCond[0]==NULL ){
348 return 1;
349 }
350 InitializeCriticalSection(&primitives.aMutex[0]);
351 InitializeCriticalSection(&primitives.aMutex[1]);
352 InitializeCriticalSection(&primitives.aMutex[2]);
353 primitives.isInit = 1;
354 }
355 return 0;
356}
357static void async_os_shutdown(void){
358 if( primitives.isInit ){
359 DeleteCriticalSection(&primitives.aMutex[0]);
360 DeleteCriticalSection(&primitives.aMutex[1]);
361 DeleteCriticalSection(&primitives.aMutex[2]);
362 CloseHandle(primitives.aCond[0]);
363 primitives.isInit = 0;
364 }
365}
366
367/* The following block contains the Win32 specific code. */
368static void async_mutex_enter(int eMutex){
369 assert( eMutex==0 || eMutex==1 || eMutex==2 );
370 assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
371 assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
372 assert( eMutex!=0 || (!mutex_held(0)) );
373 EnterCriticalSection(&primitives.aMutex[eMutex]);
374 TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
375}
376static void async_mutex_leave(int eMutex){
377 assert( eMutex==0 || eMutex==1 || eMutex==2 );
378 assert( mutex_held(eMutex) );
379 TESTONLY( primitives.aHolder[eMutex] = 0; )
380 LeaveCriticalSection(&primitives.aMutex[eMutex]);
381}
382static void async_cond_wait(int eCond, int eMutex){
383 ResetEvent(primitives.aCond[eCond]);
384 async_mutex_leave(eMutex);
385 WaitForSingleObject(primitives.aCond[eCond], INFINITE);
386 async_mutex_enter(eMutex);
387}
388static void async_cond_signal(int eCond){
389 assert( mutex_held(ASYNC_MUTEX_QUEUE) );
390 SetEvent(primitives.aCond[eCond]);
391}
/*
** Win32 implementation of async_sched_yield(): hint to the operating
** system that this would be a good time to run some other thread.
**
** Sleep(0) gives up the remainder of the calling thread's time slice to
** any other thread that is ready to run, and returns immediately if
** there is none.
*/
static void async_sched_yield(void){
  Sleep(0);
}
395#else
396
397/* The following block contains the pthreads specific code. */
398#include <pthread.h>
399#include <sched.h>
400
/* True if the calling thread is the one recorded as holding mutex X. */
#define mutex_held(X) pthread_equal(primitives.aHolder[X], pthread_self())

/*
** The pthreads mutexes and condition variable are statically initialized,
** so there is nothing to set up or tear down at run time.
** async_os_initialize() always returns 0.
*/
static int async_os_initialize(void){
  return 0;
}
static void async_os_shutdown(void){
}
405
/* Synchronization objects used by the pthreads build. */
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];    /* Indexed by the ASYNC_MUTEX_XXX values */
  pthread_cond_t aCond[1];      /* Indexed by the ASYNC_COND_XXX values */
  pthread_t aHolder[3];         /* Thread recorded for each aMutex[] entry */
} primitives = {
  {
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER
  },
  { PTHREAD_COND_INITIALIZER },
  { 0, 0, 0 }
};
418
419static void async_mutex_enter(int eMutex){
420 assert( eMutex==0 || eMutex==1 || eMutex==2 );
421 assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
422 assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
423 assert( eMutex!=0 || (!mutex_held(0)) );
424 pthread_mutex_lock(&primitives.aMutex[eMutex]);
425 TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
426}
427static void async_mutex_leave(int eMutex){
428 assert( eMutex==0 || eMutex==1 || eMutex==2 );
429 assert( mutex_held(eMutex) );
430 TESTONLY( primitives.aHolder[eMutex] = 0; )
431 pthread_mutex_unlock(&primitives.aMutex[eMutex]);
432}
433static void async_cond_wait(int eCond, int eMutex){
434 assert( eMutex==0 || eMutex==1 || eMutex==2 );
435 assert( mutex_held(eMutex) );
436 TESTONLY( primitives.aHolder[eMutex] = 0; )
437 pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
438 TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
439}
440static void async_cond_signal(int eCond){
441 assert( mutex_held(ASYNC_MUTEX_QUEUE) );
442 pthread_cond_signal(&primitives.aCond[eCond]);
443}
/*
** Pthreads implementation of async_sched_yield(): a thin wrapper around
** sched_yield(). The return value of sched_yield() is ignored.
*/
static void async_sched_yield(void){
  (void)sched_yield();
}
447#endif
448/*
449** End of OS specific code.
450*************************************************************************/
451
452#define assert_mutex_is_held(X) assert( mutex_held(X) )
453
454
455#ifndef SQLITE_ASYNC_TWO_FILEHANDLES
456/* #define SQLITE_ASYNC_TWO_FILEHANDLES 0 */
457#define SQLITE_ASYNC_TWO_FILEHANDLES 1
458#endif
459
460/*
461** State information is held in the static variable "async" defined
462** as the following structure.
463**
464** Both async.ioError and async.nFile are protected by async.queueMutex.
465*/
466static struct TestAsyncStaticData {
467 AsyncWrite *pQueueFirst; /* Next write operation to be processed */
468 AsyncWrite *pQueueLast; /* Last write operation on the list */
469 AsyncLock *pLock; /* Linked list of all AsyncLock structures */
470 volatile int ioDelay; /* Extra delay between write operations */
471 volatile int eHalt; /* One of the SQLITEASYNC_HALT_XXX values */
472 int ioError; /* True if an IO error has occurred */
473 int nFile; /* Number of open files (from sqlite pov) */
474} async = { 0,0,0,0,0,0,0 };
475
476/* Possible values of AsyncWrite.op */
477#define ASYNC_NOOP 0
478#define ASYNC_WRITE 1
479#define ASYNC_SYNC 2
480#define ASYNC_TRUNCATE 3
481#define ASYNC_CLOSE 4
482#define ASYNC_DELETE 5
483#define ASYNC_OPENEXCLUSIVE 6
484#define ASYNC_UNLOCK 7
485
/*
** Printable names of the ASYNC_XXX opcodes, indexed by opcode value. Used
** for debugging output only. Keep this table in step with the ASYNC_XXX
** macros!
*/
static const char *azOpcodeName[] = {
  "NOOP",       /* ASYNC_NOOP */
  "WRITE",      /* ASYNC_WRITE */
  "SYNC",       /* ASYNC_SYNC */
  "TRUNCATE",   /* ASYNC_TRUNCATE */
  "CLOSE",      /* ASYNC_CLOSE */
  "DELETE",     /* ASYNC_DELETE */
  "OPENEX",     /* ASYNC_OPENEXCLUSIVE */
  "UNLOCK"      /* ASYNC_UNLOCK */
};
492
493/*
494** Entries on the write-op queue are instances of the AsyncWrite
495** structure, defined here.
496**
497** The interpretation of the iOffset and nByte variables varies depending
498** on the value of AsyncWrite.op:
499**
500** ASYNC_NOOP:
501** No values used.
502**
503** ASYNC_WRITE:
504** iOffset -> Offset in file to write to.
505** nByte -> Number of bytes of data to write (pointed to by zBuf).
506**
507** ASYNC_SYNC:
508** nByte -> flags to pass to sqlite3OsSync().
509**
510** ASYNC_TRUNCATE:
511** iOffset -> Size to truncate file to.
512** nByte -> Unused.
513**
514** ASYNC_CLOSE:
515** iOffset -> Unused.
516** nByte -> Unused.
517**
518** ASYNC_DELETE:
519** iOffset -> Contains the "syncDir" flag.
520** nByte -> Number of bytes of zBuf points to (file name).
521**
522** ASYNC_OPENEXCLUSIVE:
523** iOffset -> Value of "delflag".
524** nByte -> Number of bytes of zBuf points to (file name).
525**
526** ASYNC_UNLOCK:
527** nByte -> Argument to sqlite3OsUnlock().
528**
529**
530** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
531** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
532** single blob, so is deleted when sqlite3_free() is called on the parent
533** structure.
534*/
535struct AsyncWrite {
536 AsyncFileData *pFileData; /* File to write data to or sync */
537 int op; /* One of ASYNC_xxx etc. */
538 sqlite_int64 iOffset; /* See above */
539 int nByte; /* See above */
540 char *zBuf; /* Data to write to file (or NULL if op!=ASYNC_WRITE) */
541 AsyncWrite *pNext; /* Next write operation (to any file) */
542};
543
544/*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and stored in the linked list headed by
** async.pLock. Entries are identified by the full pathname of the
** opened file.
549**
550** AsyncLock.pList points to the head of a linked list of AsyncFileLock
551** structures, one for each handle currently open on the file.
552**
553** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
554** not passed to the sqlite3OsOpen() call), or if ENABLE_FILE_LOCKING is
555** not defined at compile time, variables AsyncLock.pFile and
556** AsyncLock.eLock are never used. Otherwise, pFile is a file handle
557** opened on the file in question and used to obtain the file-system
558** locks required by database connections within this process.
559**
560** See comments above the asyncLock() function for more details on
561** the implementation of database locking used by this backend.
562*/
563struct AsyncLock {
564 char *zFile;
565 int nFile;
566 sqlite3_file *pFile;
567 int eLock;
568 AsyncFileLock *pList;
569 AsyncLock *pNext; /* Next in linked list headed by async.pLock */
570};
571
572/*
573** An instance of the following structure is allocated along with each
574** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
575** file was opened with the SQLITE_OPEN_MAIN_DB.
576*/
577struct AsyncFileLock {
578 int eLock; /* Internally visible lock state (sqlite pov) */
579 int eAsyncLock; /* Lock-state with write-queue unlock */
580 AsyncFileLock *pNext;
581};
582
583/*
584** The AsyncFile structure is a subclass of sqlite3_file used for
585** asynchronous IO.
586**
587** All of the actual data for the structure is stored in the structure
588** pointed to by AsyncFile.pData, which is allocated as part of the
589** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
590** lifetime of the AsyncFile structure is ended by the caller after OsClose()
591** is called, but the data in AsyncFileData may be required by the
592** writer thread after that point.
593*/
594struct AsyncFile {
595 sqlite3_io_methods *pMethod;
596 AsyncFileData *pData;
597};
598struct AsyncFileData {
599 char *zName; /* Underlying OS filename - used for debugging */
600 int nName; /* Number of characters in zName */
601 sqlite3_file *pBaseRead; /* Read handle to the underlying Os file */
602 sqlite3_file *pBaseWrite; /* Write handle to the underlying Os file */
603 AsyncFileLock lock; /* Lock state for this handle */
604 AsyncLock *pLock; /* AsyncLock object for this file system entry */
605 AsyncWrite closeOp; /* Preallocated close operation */
606};
607
608/*
609** Add an entry to the end of the global write-op list. pWrite should point
610** to an AsyncWrite structure allocated using sqlite3_malloc(). The writer
611** thread will call sqlite3_free() to free the structure after the specified
612** operation has been completed.
613**
614** Once an AsyncWrite structure has been added to the list, it becomes the
615** property of the writer thread and must not be read or modified by the
616** caller.
617*/
618static void addAsyncWrite(AsyncWrite *pWrite){
619 /* We must hold the queue mutex in order to modify the queue pointers */
620 if( pWrite->op!=ASYNC_UNLOCK ){
621 async_mutex_enter(ASYNC_MUTEX_QUEUE);
622 }
623
624 /* Add the record to the end of the write-op queue */
625 assert( !pWrite->pNext );
626 if( async.pQueueLast ){
627 assert( async.pQueueFirst );
628 async.pQueueLast->pNext = pWrite;
629 }else{
630 async.pQueueFirst = pWrite;
631 }
632 async.pQueueLast = pWrite;
633 ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
634 pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
635
636 if( pWrite->op==ASYNC_CLOSE ){
637 async.nFile--;
638 }
639
640 /* The writer thread might have been idle because there was nothing
641 ** on the write-op queue for it to do. So wake it up. */
642 async_cond_signal(ASYNC_COND_QUEUE);
643
644 /* Drop the queue mutex */
645 if( pWrite->op!=ASYNC_UNLOCK ){
646 async_mutex_leave(ASYNC_MUTEX_QUEUE);
647 }
648}
649
650/*
651** Increment async.nFile in a thread-safe manner.
652*/
653static void incrOpenFileCount(void){
654 /* We must hold the queue mutex in order to modify async.nFile */
655 async_mutex_enter(ASYNC_MUTEX_QUEUE);
656 if( async.nFile==0 ){
657 async.ioError = SQLITE_OK;
658 }
659 async.nFile++;
660 async_mutex_leave(ASYNC_MUTEX_QUEUE);
661}
662
663/*
664** This is a utility function to allocate and populate a new AsyncWrite
665** structure and insert it (via addAsyncWrite() ) into the global list.
666*/
667static int addNewAsyncWrite(
668 AsyncFileData *pFileData,
669 int op,
670 sqlite3_int64 iOffset,
671 int nByte,
672 const char *zByte
673){
674 AsyncWrite *p;
675 if( op!=ASYNC_CLOSE && async.ioError ){
676 return async.ioError;
677 }
678 p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
679 if( !p ){
680 /* The upper layer does not expect operations like OsWrite() to
681 ** return SQLITE_NOMEM. This is partly because under normal conditions
682 ** SQLite is required to do rollback without calling malloc(). So
683 ** if malloc() fails here, treat it as an I/O error. The above
684 ** layer knows how to handle that.
685 */
686 return SQLITE_IOERR;
687 }
688 p->op = op;
689 p->iOffset = iOffset;
690 p->nByte = nByte;
691 p->pFileData = pFileData;
692 p->pNext = 0;
693 if( zByte ){
694 p->zBuf = (char *)&p[1];
695 memcpy(p->zBuf, zByte, nByte);
696 }else{
697 p->zBuf = 0;
698 }
699 addAsyncWrite(p);
700 return SQLITE_OK;
701}
702
703/*
704** Close the file. This just adds an entry to the write-op list, the file is
705** not actually closed.
706*/
707static int asyncClose(sqlite3_file *pFile){
708 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
709
710 /* Unlock the file, if it is locked */
711 async_mutex_enter(ASYNC_MUTEX_LOCK);
712 p->lock.eLock = 0;
713 async_mutex_leave(ASYNC_MUTEX_LOCK);
714
715 addAsyncWrite(&p->closeOp);
716 return SQLITE_OK;
717}
718
719/*
720** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
721** writing to the underlying file, this function adds an entry to the end of
722** the global AsyncWrite list. Either SQLITE_OK or SQLITE_NOMEM may be
723** returned.
724*/
725static int asyncWrite(
726 sqlite3_file *pFile,
727 const void *pBuf,
728 int amt,
729 sqlite3_int64 iOff
730){
731 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
732 return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
733}
734
735/*
736** Read data from the file. First we read from the filesystem, then adjust
737** the contents of the buffer based on ASYNC_WRITE operations in the
738** write-op queue.
739**
740** This method holds the mutex from start to finish.
741*/
742static int asyncRead(
743 sqlite3_file *pFile,
744 void *zOut,
745 int iAmt,
746 sqlite3_int64 iOffset
747){
748 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
749 int rc = SQLITE_OK;
750 sqlite3_int64 filesize;
751 int nRead;
752 sqlite3_file *pBase = p->pBaseRead;
753
754 /* Grab the write queue mutex for the duration of the call */
755 async_mutex_enter(ASYNC_MUTEX_QUEUE);
756
757 /* If an I/O error has previously occurred in this virtual file
758 ** system, then all subsequent operations fail.
759 */
760 if( async.ioError!=SQLITE_OK ){
761 rc = async.ioError;
762 goto asyncread_out;
763 }
764
765 if( pBase->pMethods ){
766 rc = pBase->pMethods->xFileSize(pBase, &filesize);
767 if( rc!=SQLITE_OK ){
768 goto asyncread_out;
769 }
770 nRead = MIN(filesize - iOffset, iAmt);
771 if( nRead>0 ){
772 rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
773 ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
774 }
775 }
776
777 if( rc==SQLITE_OK ){
778 AsyncWrite *pWrite;
779 char *zName = p->zName;
780
781 for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
782 if( pWrite->op==ASYNC_WRITE && (
783 (pWrite->pFileData==p) ||
784 (zName && pWrite->pFileData->zName==zName)
785 )){
786 int iBeginOut = (pWrite->iOffset-iOffset);
787 int iBeginIn = -iBeginOut;
788 int nCopy;
789
790 if( iBeginIn<0 ) iBeginIn = 0;
791 if( iBeginOut<0 ) iBeginOut = 0;
792 nCopy = MIN(pWrite->nByte-iBeginIn, iAmt-iBeginOut);
793
794 if( nCopy>0 ){
795 memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
796 ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
797 }
798 }
799 }
800 }
801
802asyncread_out:
803 async_mutex_leave(ASYNC_MUTEX_QUEUE);
804 return rc;
805}
806
807/*
808** Truncate the file to nByte bytes in length. This just adds an entry to
809** the write-op list, no IO actually takes place.
810*/
811static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
812 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
813 return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
814}
815
816/*
817** Sync the file. This just adds an entry to the write-op list, the
818** sync() is done later by sqlite3_async_flush().
819*/
820static int asyncSync(sqlite3_file *pFile, int flags){
821 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
822 return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
823}
824
825/*
826** Read the size of the file. First we read the size of the file system
827** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
828** currently in the write-op list.
829**
830** This method holds the mutex from start to finish.
831*/
832int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
833 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
834 int rc = SQLITE_OK;
835 sqlite3_int64 s = 0;
836 sqlite3_file *pBase;
837
838 async_mutex_enter(ASYNC_MUTEX_QUEUE);
839
840 /* Read the filesystem size from the base file. If pBaseRead is NULL, this
841 ** means the file hasn't been opened yet. In this case all relevant data
842 ** must be in the write-op queue anyway, so we can omit reading from the
843 ** file-system.
844 */
845 pBase = p->pBaseRead;
846 if( pBase->pMethods ){
847 rc = pBase->pMethods->xFileSize(pBase, &s);
848 }
849
850 if( rc==SQLITE_OK ){
851 AsyncWrite *pWrite;
852 for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
853 if( pWrite->op==ASYNC_DELETE
854 && p->zName
855 && strcmp(p->zName, pWrite->zBuf)==0
856 ){
857 s = 0;
858 }else if( pWrite->pFileData && (
859 (pWrite->pFileData==p)
860 || (p->zName && pWrite->pFileData->zName==p->zName)
861 )){
862 switch( pWrite->op ){
863 case ASYNC_WRITE:
864 s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
865 break;
866 case ASYNC_TRUNCATE:
867 s = MIN(s, pWrite->iOffset);
868 break;
869 }
870 }
871 }
872 *piSize = s;
873 }
874 async_mutex_leave(ASYNC_MUTEX_QUEUE);
875 return rc;
876}
877
878/*
879** Lock or unlock the actual file-system entry.
880*/
881static int getFileLock(AsyncLock *pLock){
882 int rc = SQLITE_OK;
883 AsyncFileLock *pIter;
884 int eRequired = 0;
885
886 if( pLock->pFile ){
887 for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
888 assert(pIter->eAsyncLock>=pIter->eLock);
889 if( pIter->eAsyncLock>eRequired ){
890 eRequired = pIter->eAsyncLock;
891 assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
892 }
893 }
894
895 if( eRequired>pLock->eLock ){
896 rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
897 if( rc==SQLITE_OK ){
898 pLock->eLock = eRequired;
899 }
900 }
901 else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
902 rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
903 if( rc==SQLITE_OK ){
904 pLock->eLock = eRequired;
905 }
906 }
907 }
908
909 return rc;
910}
911
912/*
913** Return the AsyncLock structure from the global async.pLock list
914** associated with the file-system entry identified by path zName
915** (a string of nName bytes). If no such structure exists, return 0.
916*/
917static AsyncLock *findLock(const char *zName, int nName){
918 AsyncLock *p = async.pLock;
919 while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
920 p = p->pNext;
921 }
922 return p;
923}
924
925/*
926** The following two methods - asyncLock() and asyncUnlock() - are used
927** to obtain and release locks on database files opened with the
928** asynchronous backend.
929*/
930static int asyncLock(sqlite3_file *pFile, int eLock){
931 int rc = SQLITE_OK;
932 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
933
934 if( p->zName ){
935 async_mutex_enter(ASYNC_MUTEX_LOCK);
936 if( p->lock.eLock<eLock ){
937 AsyncLock *pLock = p->pLock;
938 AsyncFileLock *pIter;
939 assert(pLock && pLock->pList);
940 for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
941 if( pIter!=&p->lock && (
942 (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
943 (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
944 (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
945 (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
946 )){
947 rc = SQLITE_BUSY;
948 }
949 }
950 if( rc==SQLITE_OK ){
951 p->lock.eLock = eLock;
952 p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
953 }
954 assert(p->lock.eAsyncLock>=p->lock.eLock);
955 if( rc==SQLITE_OK ){
956 rc = getFileLock(pLock);
957 }
958 }
959 async_mutex_leave(ASYNC_MUTEX_LOCK);
960 }
961
962 ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
963 return rc;
964}
965static int asyncUnlock(sqlite3_file *pFile, int eLock){
966 int rc = SQLITE_OK;
967 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
968 if( p->zName ){
969 AsyncFileLock *pLock = &p->lock;
970 async_mutex_enter(ASYNC_MUTEX_QUEUE);
971 async_mutex_enter(ASYNC_MUTEX_LOCK);
972 pLock->eLock = MIN(pLock->eLock, eLock);
973 rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
974 async_mutex_leave(ASYNC_MUTEX_LOCK);
975 async_mutex_leave(ASYNC_MUTEX_QUEUE);
976 }
977 return rc;
978}
979
980/*
981** This function is called when the pager layer first opens a database file
982** and is checking for a hot-journal.
983*/
984static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
985 int ret = 0;
986 AsyncFileLock *pIter;
987 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
988
989 async_mutex_enter(ASYNC_MUTEX_LOCK);
990 for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
991 if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
992 ret = 1;
993 }
994 }
995 async_mutex_leave(ASYNC_MUTEX_LOCK);
996
997 ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
998 *pResOut = ret;
999 return SQLITE_OK;
1000}
1001
1002/*
1003** sqlite3_file_control() implementation.
1004*/
1005static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
1006 switch( op ){
1007 case SQLITE_FCNTL_LOCKSTATE: {
1008 async_mutex_enter(ASYNC_MUTEX_LOCK);
1009 *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
1010 async_mutex_leave(ASYNC_MUTEX_LOCK);
1011 return SQLITE_OK;
1012 }
1013 }
1014 return SQLITE_ERROR;
1015}
1016
1017/*
1018** Return the device characteristics and sector-size of the device. It
1019** is not tricky to implement these correctly, as this backend might
1020** not have an open file handle at this point.
1021*/
1022static int asyncSectorSize(sqlite3_file *pFile){
1023 return 512;
1024}
1025static int asyncDeviceCharacteristics(sqlite3_file *pFile){
1026 return 0;
1027}
1028
1029static int unlinkAsyncFile(AsyncFileData *pData){
1030 AsyncFileLock **ppIter;
1031 int rc = SQLITE_OK;
1032
1033 if( pData->zName ){
1034 AsyncLock *pLock = pData->pLock;
1035 for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
1036 if( (*ppIter)==&pData->lock ){
1037 *ppIter = pData->lock.pNext;
1038 break;
1039 }
1040 }
1041 if( !pLock->pList ){
1042 AsyncLock **pp;
1043 if( pLock->pFile ){
1044 pLock->pFile->pMethods->xClose(pLock->pFile);
1045 }
1046 for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
1047 *pp = pLock->pNext;
1048 sqlite3_free(pLock);
1049 }else{
1050 rc = getFileLock(pLock);
1051 }
1052 }
1053
1054 return rc;
1055}
1056
1057/*
1058** The parameter passed to this function is a copy of a 'flags' parameter
1059** passed to this modules xOpen() method. This function returns true
1060** if the file should be opened asynchronously, or false if it should
1061** be opened immediately.
1062**
1063** If the file is to be opened asynchronously, then asyncOpen() will add
1064** an entry to the event queue and the file will not actually be opened
1065** until the event is processed. Otherwise, the file is opened directly
1066** by the caller.
1067*/
1068static int doAsynchronousOpen(int flags){
1069 return (flags&SQLITE_OPEN_CREATE) && (
1070 (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1071 (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1072 (flags&SQLITE_OPEN_DELETEONCLOSE)
1073 );
1074}
1075
1076/*
1077** Open a file.
1078*/
1079static int asyncOpen(
1080 sqlite3_vfs *pAsyncVfs,
1081 const char *zName,
1082 sqlite3_file *pFile,
1083 int flags,
1084 int *pOutFlags
1085){
1086 static sqlite3_io_methods async_methods = {
1087 1, /* iVersion */
1088 asyncClose, /* xClose */
1089 asyncRead, /* xRead */
1090 asyncWrite, /* xWrite */
1091 asyncTruncate, /* xTruncate */
1092 asyncSync, /* xSync */
1093 asyncFileSize, /* xFileSize */
1094 asyncLock, /* xLock */
1095 asyncUnlock, /* xUnlock */
1096 asyncCheckReservedLock, /* xCheckReservedLock */
1097 asyncFileControl, /* xFileControl */
1098 asyncSectorSize, /* xSectorSize */
1099 asyncDeviceCharacteristics /* xDeviceCharacteristics */
1100 };
1101
1102 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1103 AsyncFile *p = (AsyncFile *)pFile;
1104 int nName = 0;
1105 int rc = SQLITE_OK;
1106 int nByte;
1107 AsyncFileData *pData;
1108 AsyncLock *pLock = 0;
1109 char *z;
1110 int isAsyncOpen = doAsynchronousOpen(flags);
1111
1112 /* If zName is NULL, then the upper layer is requesting an anonymous file */
1113 if( zName ){
1114 nName = strlen(zName)+1;
1115 }
1116
1117 nByte = (
1118 sizeof(AsyncFileData) + /* AsyncFileData structure */
1119 2 * pVfs->szOsFile + /* AsyncFileData.pBaseRead and pBaseWrite */
1120 nName /* AsyncFileData.zName */
1121 );
1122 z = sqlite3_malloc(nByte);
1123 if( !z ){
1124 return SQLITE_NOMEM;
1125 }
1126 memset(z, 0, nByte);
1127 pData = (AsyncFileData*)z;
1128 z += sizeof(pData[0]);
1129 pData->pBaseRead = (sqlite3_file*)z;
1130 z += pVfs->szOsFile;
1131 pData->pBaseWrite = (sqlite3_file*)z;
1132 pData->closeOp.pFileData = pData;
1133 pData->closeOp.op = ASYNC_CLOSE;
1134
1135 if( zName ){
1136 z += pVfs->szOsFile;
1137 pData->zName = z;
1138 pData->nName = nName;
1139 memcpy(pData->zName, zName, nName);
1140 }
1141
1142 if( !isAsyncOpen ){
1143 int flagsout;
1144 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
1145 if( rc==SQLITE_OK && (flagsout&SQLITE_OPEN_READWRITE) ){
1146 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1147 }
1148 if( pOutFlags ){
1149 *pOutFlags = flagsout;
1150 }
1151 }
1152
1153 async_mutex_enter(ASYNC_MUTEX_LOCK);
1154
1155 if( zName && rc==SQLITE_OK ){
1156 pLock = findLock(pData->zName, pData->nName);
1157 if( !pLock ){
1158 int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1159 pLock = (AsyncLock *)sqlite3_malloc(nByte);
1160 if( pLock ){
1161 memset(pLock, 0, nByte);
1162#ifdef ENABLE_FILE_LOCKING
1163 if( flags&SQLITE_OPEN_MAIN_DB ){
1164 pLock->pFile = (sqlite3_file *)&pLock[1];
1165 rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1166 if( rc!=SQLITE_OK ){
1167 sqlite3_free(pLock);
1168 pLock = 0;
1169 }
1170 }
1171#endif
1172 if( pLock ){
1173 pLock->nFile = pData->nName;
1174 pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1175 memcpy(pLock->zFile, pData->zName, pLock->nFile);
1176 pLock->pNext = async.pLock;
1177 async.pLock = pLock;
1178 }
1179 }else{
1180 rc = SQLITE_NOMEM;
1181 }
1182 }
1183 }
1184
1185 if( rc==SQLITE_OK ){
1186 p->pMethod = &async_methods;
1187 p->pData = pData;
1188
1189 /* Link AsyncFileData.lock into the linked list of
1190 ** AsyncFileLock structures for this file.
1191 */
1192 if( zName ){
1193 pData->lock.pNext = pLock->pList;
1194 pLock->pList = &pData->lock;
1195 pData->zName = pLock->zFile;
1196 }
1197 }else{
1198 if( pData->pBaseRead->pMethods ){
1199 pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1200 }
1201 if( pData->pBaseWrite->pMethods ){
1202 pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1203 }
1204 sqlite3_free(pData);
1205 }
1206
1207 async_mutex_leave(ASYNC_MUTEX_LOCK);
1208
1209 if( rc==SQLITE_OK ){
1210 incrOpenFileCount();
1211 pData->pLock = pLock;
1212 }
1213
1214 if( rc==SQLITE_OK && isAsyncOpen ){
1215 rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1216 if( rc==SQLITE_OK ){
1217 if( pOutFlags ) *pOutFlags = flags;
1218 }else{
1219 async_mutex_enter(ASYNC_MUTEX_LOCK);
1220 unlinkAsyncFile(pData);
1221 async_mutex_leave(ASYNC_MUTEX_LOCK);
1222 sqlite3_free(pData);
1223 }
1224 }
1225 if( rc!=SQLITE_OK ){
1226 p->pMethod = 0;
1227 }
1228 return rc;
1229}
1230
1231/*
1232** Implementation of sqlite3OsDelete. Add an entry to the end of the
1233** write-op queue to perform the delete.
1234*/
1235static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
1236 return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, strlen(z)+1, z);
1237}
1238
1239/*
1240** Implementation of sqlite3OsAccess. This method holds the mutex from
1241** start to finish.
1242*/
1243static int asyncAccess(
1244 sqlite3_vfs *pAsyncVfs,
1245 const char *zName,
1246 int flags,
1247 int *pResOut
1248){
1249 int rc;
1250 int ret;
1251 AsyncWrite *p;
1252 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1253
1254 assert(flags==SQLITE_ACCESS_READWRITE
1255 || flags==SQLITE_ACCESS_READ
1256 || flags==SQLITE_ACCESS_EXISTS
1257 );
1258
1259 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1260 rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1261 if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1262 for(p=async.pQueueFirst; p; p = p->pNext){
1263 if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1264 ret = 0;
1265 }else if( p->op==ASYNC_OPENEXCLUSIVE
1266 && p->pFileData->zName
1267 && 0==strcmp(p->pFileData->zName, zName)
1268 ){
1269 ret = 1;
1270 }
1271 }
1272 }
1273 ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1274 flags==SQLITE_ACCESS_READWRITE?"read-write":
1275 flags==SQLITE_ACCESS_READ?"read":"exists"
1276 , zName, ret)
1277 );
1278 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1279 *pResOut = ret;
1280 return rc;
1281}
1282
1283/*
1284** Fill in zPathOut with the full path to the file identified by zPath.
1285*/
1286static int asyncFullPathname(
1287 sqlite3_vfs *pAsyncVfs,
1288 const char *zPath,
1289 int nPathOut,
1290 char *zPathOut
1291){
1292 int rc;
1293 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1294 rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1295
1296 /* Because of the way intra-process file locking works, this backend
1297 ** needs to return a canonical path. The following block assumes the
1298 ** file-system uses unix style paths.
1299 */
1300 if( rc==SQLITE_OK ){
1301 int i, j;
1302 int n = nPathOut;
1303 char *z = zPathOut;
1304 while( n>1 && z[n-1]=='/' ){ n--; }
1305 for(i=j=0; i<n; i++){
1306 if( z[i]=='/' ){
1307 if( z[i+1]=='/' ) continue;
1308 if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1309 i += 1;
1310 continue;
1311 }
1312 if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1313 while( j>0 && z[j-1]!='/' ){ j--; }
1314 if( j>0 ){ j--; }
1315 i += 2;
1316 continue;
1317 }
1318 }
1319 z[j++] = z[i];
1320 }
1321 z[j] = 0;
1322 }
1323
1324 return rc;
1325}
1326static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1327 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1328 return pVfs->xDlOpen(pVfs, zPath);
1329}
1330static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1331 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1332 pVfs->xDlError(pVfs, nByte, zErrMsg);
1333}
1334static void (*asyncDlSym(
1335 sqlite3_vfs *pAsyncVfs,
1336 void *pHandle,
1337 const char *zSymbol
1338))(void){
1339 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1340 return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1341}
1342static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1343 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1344 pVfs->xDlClose(pVfs, pHandle);
1345}
1346static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1347 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1348 return pVfs->xRandomness(pVfs, nByte, zBufOut);
1349}
1350static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1351 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1352 return pVfs->xSleep(pVfs, nMicro);
1353}
1354static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1355 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1356 return pVfs->xCurrentTime(pVfs, pTimeOut);
1357}
1358
/*
** The VFS object for the asynchronous backend.  mxPathname and pAppData
** start out as zero and are filled in by sqlite3async_initialize();
** pAppData then points at the parent VFS that does the real work.
*/
static sqlite3_vfs async_vfs = {
  1,                    /* iVersion */
  sizeof(AsyncFile),    /* szOsFile */
  0,                    /* mxPathname */
  0,                    /* pNext */
  SQLITEASYNC_VFSNAME,  /* zName */
  0,                    /* pAppData */
  asyncOpen,            /* xOpen */
  asyncDelete,          /* xDelete */
  asyncAccess,          /* xAccess */
  asyncFullPathname,    /* xFullPathname */
  asyncDlOpen,          /* xDlOpen */
  asyncDlError,         /* xDlError */
  asyncDlSym,           /* xDlSym */
  asyncDlClose,         /* xDlClose */
  asyncRandomness,      /* xRandomness */
  asyncSleep,           /* xSleep */
  asyncCurrentTime      /* xCurrentTime */
};
1378
1379/*
1380** This procedure runs in a separate thread, reading messages off of the
1381** write queue and processing them one by one.
1382**
1383** If async.writerHaltNow is true, then this procedure exits
1384** after processing a single message.
1385**
1386** If async.writerHaltWhenIdle is true, then this procedure exits when
1387** the write queue is empty.
1388**
1389** If both of the above variables are false, this procedure runs
1390** indefinately, waiting for operations to be added to the write queue
1391** and processing them in the order in which they arrive.
1392**
1393** An artifical delay of async.ioDelay milliseconds is inserted before
1394** each write operation in order to simulate the effect of a slow disk.
1395**
1396** Only one instance of this procedure may be running at a time.
1397*/
1398static void asyncWriterThread(void){
1399 sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1400 AsyncWrite *p = 0;
1401 int rc = SQLITE_OK;
1402 int holdingMutex = 0;
1403
1404 async_mutex_enter(ASYNC_MUTEX_WRITER);
1405
1406 while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1407 int doNotFree = 0;
1408 sqlite3_file *pBase = 0;
1409
1410 if( !holdingMutex ){
1411 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1412 }
1413 while( (p = async.pQueueFirst)==0 ){
1414 if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1415 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1416 break;
1417 }else{
1418 ASYNC_TRACE(("IDLE\n"));
1419 async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1420 ASYNC_TRACE(("WAKEUP\n"));
1421 }
1422 }
1423 if( p==0 ) break;
1424 holdingMutex = 1;
1425
1426 /* Right now this thread is holding the mutex on the write-op queue.
1427 ** Variable 'p' points to the first entry in the write-op queue. In
1428 ** the general case, we hold on to the mutex for the entire body of
1429 ** the loop.
1430 **
1431 ** However in the cases enumerated below, we relinquish the mutex,
1432 ** perform the IO, and then re-request the mutex before removing 'p' from
1433 ** the head of the write-op queue. The idea is to increase concurrency with
1434 ** sqlite threads.
1435 **
1436 ** * An ASYNC_CLOSE operation.
1437 ** * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1438 ** the mutex, call the underlying xOpenExclusive() function, then
1439 ** re-aquire the mutex before seting the AsyncFile.pBaseRead
1440 ** variable.
1441 ** * ASYNC_SYNC and ASYNC_WRITE operations, if
1442 ** SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1443 ** file-handles are open for the particular file being "synced".
1444 */
1445 if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1446 p->op = ASYNC_NOOP;
1447 }
1448 if( p->pFileData ){
1449 pBase = p->pFileData->pBaseWrite;
1450 if(
1451 p->op==ASYNC_CLOSE ||
1452 p->op==ASYNC_OPENEXCLUSIVE ||
1453 (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1454 ){
1455 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1456 holdingMutex = 0;
1457 }
1458 if( !pBase->pMethods ){
1459 pBase = p->pFileData->pBaseRead;
1460 }
1461 }
1462
1463 switch( p->op ){
1464 case ASYNC_NOOP:
1465 break;
1466
1467 case ASYNC_WRITE:
1468 assert( pBase );
1469 ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1470 p->pFileData->zName, p->nByte, p->iOffset));
1471 rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1472 break;
1473
1474 case ASYNC_SYNC:
1475 assert( pBase );
1476 ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1477 rc = pBase->pMethods->xSync(pBase, p->nByte);
1478 break;
1479
1480 case ASYNC_TRUNCATE:
1481 assert( pBase );
1482 ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1483 p->pFileData->zName, p->iOffset));
1484 rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1485 break;
1486
1487 case ASYNC_CLOSE: {
1488 AsyncFileData *pData = p->pFileData;
1489 ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1490 if( pData->pBaseWrite->pMethods ){
1491 pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1492 }
1493 if( pData->pBaseRead->pMethods ){
1494 pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1495 }
1496
1497 /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1498 ** structures for this file. Obtain the async.lockMutex mutex
1499 ** before doing so.
1500 */
1501 async_mutex_enter(ASYNC_MUTEX_LOCK);
1502 rc = unlinkAsyncFile(pData);
1503 async_mutex_leave(ASYNC_MUTEX_LOCK);
1504
1505 if( !holdingMutex ){
1506 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1507 holdingMutex = 1;
1508 }
1509 assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1510 async.pQueueFirst = p->pNext;
1511 sqlite3_free(pData);
1512 doNotFree = 1;
1513 break;
1514 }
1515
1516 case ASYNC_UNLOCK: {
1517 AsyncWrite *pIter;
1518 AsyncFileData *pData = p->pFileData;
1519 int eLock = p->nByte;
1520
1521 /* When a file is locked by SQLite using the async backend, it is
1522 ** locked within the 'real' file-system synchronously. When it is
1523 ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1524 ** unlock the file asynchronously. The design of the async backend
1525 ** requires that the 'real' file-system file be locked from the
1526 ** time that SQLite first locks it (and probably reads from it)
1527 ** until all asynchronous write events that were scheduled before
1528 ** SQLite unlocked the file have been processed.
1529 **
1530 ** This is more complex if SQLite locks and unlocks the file multiple
1531 ** times in quick succession. For example, if SQLite does:
1532 **
1533 ** lock, write, unlock, lock, write, unlock
1534 **
1535 ** Each "lock" operation locks the file immediately. Each "write"
1536 ** and "unlock" operation adds an event to the event queue. If the
1537 ** second "lock" operation is performed before the first "unlock"
1538 ** operation has been processed asynchronously, then the first
1539 ** "unlock" cannot be safely processed as is, since this would mean
1540 ** the file was unlocked when the second "write" operation is
1541 ** processed. To work around this, when processing an ASYNC_UNLOCK
1542 ** operation, SQLite:
1543 **
1544 ** 1) Unlocks the file to the minimum of the argument passed to
1545 ** the xUnlock() call and the current lock from SQLite's point
1546 ** of view, and
1547 **
1548 ** 2) Only unlocks the file at all if this event is the last
1549 ** ASYNC_UNLOCK event on this file in the write-queue.
1550 */
1551 assert( holdingMutex==1 );
1552 assert( async.pQueueFirst==p );
1553 for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1554 if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1555 }
1556 if( !pIter ){
1557 async_mutex_enter(ASYNC_MUTEX_LOCK);
1558 pData->lock.eAsyncLock = MIN(
1559 pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1560 );
1561 assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1562 rc = getFileLock(pData->pLock);
1563 async_mutex_leave(ASYNC_MUTEX_LOCK);
1564 }
1565 break;
1566 }
1567
1568 case ASYNC_DELETE:
1569 ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1570 rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1571 break;
1572
1573 case ASYNC_OPENEXCLUSIVE: {
1574 int flags = (int)p->iOffset;
1575 AsyncFileData *pData = p->pFileData;
1576 ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1577 assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1578 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1579 assert( holdingMutex==0 );
1580 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1581 holdingMutex = 1;
1582 break;
1583 }
1584
1585 default: assert(!"Illegal value for AsyncWrite.op");
1586 }
1587
1588 /* If we didn't hang on to the mutex during the IO op, obtain it now
1589 ** so that the AsyncWrite structure can be safely removed from the
1590 ** global write-op queue.
1591 */
1592 if( !holdingMutex ){
1593 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1594 holdingMutex = 1;
1595 }
1596 /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1597 if( p==async.pQueueLast ){
1598 async.pQueueLast = 0;
1599 }
1600 if( !doNotFree ){
1601 assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1602 async.pQueueFirst = p->pNext;
1603 sqlite3_free(p);
1604 }
1605 assert( holdingMutex );
1606
1607 /* An IO error has occurred. We cannot report the error back to the
1608 ** connection that requested the I/O since the error happened
1609 ** asynchronously. The connection has already moved on. There
1610 ** really is nobody to report the error to.
1611 **
1612 ** The file for which the error occurred may have been a database or
1613 ** journal file. Regardless, none of the currently queued operations
1614 ** associated with the same database should now be performed. Nor should
1615 ** any subsequently requested IO on either a database or journal file
1616 ** handle for the same database be accepted until the main database
1617 ** file handle has been closed and reopened.
1618 **
1619 ** Furthermore, no further IO should be queued or performed on any file
1620 ** handle associated with a database that may have been part of a
1621 ** multi-file transaction that included the database associated with
1622 ** the IO error (i.e. a database ATTACHed to the same handle at some
1623 ** point in time).
1624 */
1625 if( rc!=SQLITE_OK ){
1626 async.ioError = rc;
1627 }
1628
1629 if( async.ioError && !async.pQueueFirst ){
1630 async_mutex_enter(ASYNC_MUTEX_LOCK);
1631 if( 0==async.pLock ){
1632 async.ioError = SQLITE_OK;
1633 }
1634 async_mutex_leave(ASYNC_MUTEX_LOCK);
1635 }
1636
1637 /* Drop the queue mutex before continuing to the next write operation
1638 ** in order to give other threads a chance to work with the write queue.
1639 */
1640 if( !async.pQueueFirst || !async.ioError ){
1641 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1642 holdingMutex = 0;
1643 if( async.ioDelay>0 ){
1644 pVfs->xSleep(pVfs, async.ioDelay);
1645 }else{
1646 async_sched_yield();
1647 }
1648 }
1649 }
1650
1651 async_mutex_leave(ASYNC_MUTEX_WRITER);
1652 return;
1653}
1654
1655/*
1656** Install the asynchronous VFS.
1657*/
1658int sqlite3async_initialize(const char *zParent, int isDefault){
1659 int rc = SQLITE_OK;
1660 if( async_vfs.pAppData==0 ){
1661 sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1662 if( !pParent || async_os_initialize() ){
1663 rc = SQLITE_ERROR;
1664 }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1665 async_os_shutdown();
1666 }else{
1667 async_vfs.pAppData = (void *)pParent;
1668 async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1669 }
1670 }
1671 return rc;
1672}
1673
1674/*
1675** Uninstall the asynchronous VFS.
1676*/
1677void sqlite3async_shutdown(void){
1678 if( async_vfs.pAppData ){
1679 async_os_shutdown();
1680 sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1681 async_vfs.pAppData = 0;
1682 }
1683}
1684
/*
** Process events on the write-queue by running asyncWriterThread() in the
** calling thread.  This returns when asyncWriterThread() returns.
*/
void sqlite3async_run(void){
  asyncWriterThread();
  return;
}
1691
1692/*
1693** Control/configure the asynchronous IO system.
1694*/
1695int sqlite3async_control(int op, ...){
1696 va_list ap;
1697 va_start(ap, op);
1698 switch( op ){
1699 case SQLITEASYNC_HALT: {
1700 int eWhen = va_arg(ap, int);
1701 if( eWhen!=SQLITEASYNC_HALT_NEVER
1702 && eWhen!=SQLITEASYNC_HALT_NOW
1703 && eWhen!=SQLITEASYNC_HALT_IDLE
1704 ){
1705 return SQLITE_ERROR;
1706 }
1707 async.eHalt = eWhen;
1708 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1709 async_cond_signal(ASYNC_COND_QUEUE);
1710 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1711 break;
1712 }
1713
1714 case SQLITEASYNC_DELAY: {
1715 int iDelay = va_arg(ap, int);
1716 async.ioDelay = iDelay;
1717 break;
1718 }
1719
1720 case SQLITEASYNC_GET_HALT: {
1721 int *peWhen = va_arg(ap, int *);
1722 *peWhen = async.eHalt;
1723 break;
1724 }
1725 case SQLITEASYNC_GET_DELAY: {
1726 int *piDelay = va_arg(ap, int *);
1727 *piDelay = async.ioDelay;
1728 break;
1729 }
1730
1731 default:
1732 return SQLITE_ERROR;
1733 }
1734 return SQLITE_OK;
1735}
1736
1737#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1738