blob: 65399a57a839d0ef9d9472643d6d54ac5b06eba1 [file] [log] [blame]
danielk1977a3f06592009-04-23 14:58:39 +00001/*
2** 2005 December 14
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
danielk19775368f292009-07-18 11:52:04 +000013** $Id: sqlite3async.c,v 1.7 2009/07/18 11:52:04 danielk1977 Exp $
danielk1977a3f06592009-04-23 14:58:39 +000014**
danielk1977debcfd22009-04-24 09:27:16 +000015** This file contains the implementation of an asynchronous IO backend
16** for SQLite.
danielk1977a3f06592009-04-23 14:58:39 +000017*/
18
19#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO)
20
21#include "sqlite3async.h"
shaneeb4ac062009-04-30 17:45:33 +000022#include "sqlite3.h"
danielk19774598b8e2009-04-24 10:13:05 +000023#include <stdarg.h>
24#include <string.h>
25#include <assert.h>
danielk1977a3f06592009-04-23 14:58:39 +000026
danielk1977a3f06592009-04-23 14:58:39 +000027/* Useful macros used in several places */
28#define MIN(x,y) ((x)<(y)?(x):(y))
29#define MAX(x,y) ((x)>(y)?(x):(y))
30
shaneeb4ac062009-04-30 17:45:33 +000031#ifndef SQLITE_AMALGAMATION
32/* Macro to mark parameters as unused and silence compiler warnings. */
33#define UNUSED_PARAMETER(x) (void)(x)
34#endif
35
danielk1977a3f06592009-04-23 14:58:39 +000036/* Forward references */
37typedef struct AsyncWrite AsyncWrite;
38typedef struct AsyncFile AsyncFile;
39typedef struct AsyncFileData AsyncFileData;
40typedef struct AsyncFileLock AsyncFileLock;
41typedef struct AsyncLock AsyncLock;
42
43/* Enable for debugging */
danielk19774598b8e2009-04-24 10:13:05 +000044#ifndef NDEBUG
45#include <stdio.h>
/* Trace output is produced only while this flag is non-zero. */
static int sqlite3async_trace = 0;
# define ASYNC_TRACE(X) if( sqlite3async_trace ) asyncTrace X

/*
** Format a printf-style trace message with sqlite3_vmprintf() and write
** it to stderr, prefixed by a placeholder thread id of 0.
**
** sqlite3_vmprintf() returns NULL if it cannot allocate memory for the
** formatted string. In that case the message is silently dropped rather
** than passing a NULL pointer to the "%s" conversion of fprintf().
*/
static void asyncTrace(const char *zFormat, ...){
  char *z;
  va_list ap;
  va_start(ap, zFormat);
  z = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
  if( z ){
    fprintf(stderr, "[%d] %s", 0 /* (int)pthread_self() */, z);
    sqlite3_free(z);
  }
}
shanea3628d12009-04-29 18:11:59 +000057#else
58# define ASYNC_TRACE(X)
danielk19774598b8e2009-04-24 10:13:05 +000059#endif
danielk1977a3f06592009-04-23 14:58:39 +000060
61/*
62** THREAD SAFETY NOTES
63**
64** Basic rules:
65**
66** * Both read and write access to the global write-op queue must be
67** protected by the async.queueMutex. As are the async.ioError and
68** async.nFile variables.
69**
70** * The async.pLock list and all AsyncLock and AsyncFileLock
71** structures must be protected by the async.lockMutex mutex.
72**
73** * The file handles from the underlying system are not assumed to
74** be thread safe.
75**
76** * See the last two paragraphs under "The Writer Thread" for
77** an assumption to do with file-handle synchronization by the Os.
78**
79** Deadlock prevention:
80**
** There are three mutexes used by the system: the "writer" mutex,
82** the "queue" mutex and the "lock" mutex. Rules are:
83**
84** * It is illegal to block on the writer mutex when any other mutex
85** are held, and
86**
87** * It is illegal to block on the queue mutex when the lock mutex
88** is held.
89**
** i.e. mutexes must be grabbed in the order "writer", "queue", "lock".
91**
92** File system operations (invoked by SQLite thread):
93**
94** xOpen
95** xDelete
96** xFileExists
97**
98** File handle operations (invoked by SQLite thread):
99**
100** asyncWrite, asyncClose, asyncTruncate, asyncSync
101**
102** The operations above add an entry to the global write-op list. They
103** prepare the entry, acquire the async.queueMutex momentarily while
104** list pointers are manipulated to insert the new entry, then release
105** the mutex and signal the writer thread to wake up in case it happens
106** to be asleep.
107**
108**
109** asyncRead, asyncFileSize.
110**
** Read operations. Both of these read from the underlying file
** first, then adjust their result based on pending writes in the
** write-op queue. So async.queueMutex is held for the duration
** of these operations to prevent other threads from changing the
** queue in mid operation.
116**
117**
118** asyncLock, asyncUnlock, asyncCheckReservedLock
119**
** These primitives implement in-process locking using a list of
** AsyncLock structures keyed on the file name. Files are locked
** correctly for connections coming
122** from the same process. But other processes cannot see these locks
123** and will therefore not honor them.
124**
125**
126** The writer thread:
127**
** The async.writerMutex is used to make sure there is only
** a single writer thread running at a time.
130**
131** Inside the writer thread is a loop that works like this:
132**
133** WHILE (write-op list is not empty)
134** Do IO operation at head of write-op list
135** Remove entry from head of write-op list
136** END WHILE
137**
138** The async.queueMutex is always held during the <write-op list is
139** not empty> test, and when the entry is removed from the head
140** of the write-op list. Sometimes it is held for the interim
141** period (while the IO is performed), and sometimes it is
142** relinquished. It is relinquished if (a) the IO op is an
143** ASYNC_CLOSE or (b) when the file handle was opened, two of
144** the underlying systems handles were opened on the same
145** file-system entry.
146**
147** If condition (b) above is true, then one file-handle
148** (AsyncFile.pBaseRead) is used exclusively by sqlite threads to read the
149** file, the other (AsyncFile.pBaseWrite) by sqlite3_async_flush()
150** threads to perform write() operations. This means that read
151** operations are not blocked by asynchronous writes (although
152** asynchronous writes may still be blocked by reads).
153**
154** This assumes that the OS keeps two handles open on the same file
155** properly in sync. That is, any read operation that starts after a
156** write operation on the same file system entry has completed returns
157** data consistent with the write. We also assume that if one thread
158** reads a file while another is writing it all bytes other than the
159** ones actually being written contain valid data.
160**
161** If the above assumptions are not true, set the preprocessor symbol
162** SQLITE_ASYNC_TWO_FILEHANDLES to 0.
163*/
164
165
166#ifndef NDEBUG
167# define TESTONLY( X ) X
168#else
169# define TESTONLY( X )
170#endif
171
172/*
danielk1977debcfd22009-04-24 09:27:16 +0000173** PORTING FUNCTIONS
174**
danielk1977a3f06592009-04-23 14:58:39 +0000175** There are two definitions of the following functions. One for pthreads
176** compatible systems and one for Win32. These functions isolate the OS
177** specific code required by each platform.
178**
179** The system uses three mutexes and a single condition variable. To
180** block on a mutex, async_mutex_enter() is called. The parameter passed
181** to async_mutex_enter(), which must be one of ASYNC_MUTEX_LOCK,
182** ASYNC_MUTEX_QUEUE or ASYNC_MUTEX_WRITER, identifies which of the three
183** mutexes to lock. Similarly, to unlock a mutex, async_mutex_leave() is
184** called with a parameter identifying the mutex being unlocked. Mutexes
185** are not recursive - it is an error to call async_mutex_enter() to
186** lock a mutex that is already locked, or to call async_mutex_leave()
187** to unlock a mutex that is not currently locked.
188**
189** The async_cond_wait() and async_cond_signal() functions are modelled
190** on the pthreads functions with similar names. The first parameter to
191** both functions is always ASYNC_COND_QUEUE. When async_cond_wait()
192** is called the mutex identified by the second parameter must be held.
193** The mutex is unlocked, and the calling thread simultaneously begins
194** waiting for the condition variable to be signalled by another thread.
195** After another thread signals the condition variable, the calling
196** thread stops waiting, locks mutex eMutex and returns. The
197** async_cond_signal() function is used to signal the condition variable.
198** It is assumed that the mutex used by the thread calling async_cond_wait()
199** is held by the caller of async_cond_signal() (otherwise there would be
200** a race condition).
201**
202** It is guaranteed that no other thread will call async_cond_wait() when
203** there is already a thread waiting on the condition variable.
204**
** The async_sched_yield() function is called to suggest to the operating
** system that it would be a good time to shift the current thread off the
** CPU. The system will still work if this function is a no-op, but it
** might be marginally more efficient if it is implemented (the win32
** version calls Sleep(0), the pthreads version calls sched_yield()).
210*/
211static void async_mutex_enter(int eMutex);
212static void async_mutex_leave(int eMutex);
213static void async_cond_wait(int eCond, int eMutex);
214static void async_cond_signal(int eCond);
215static void async_sched_yield(void);
216
217/*
** There are also two definitions of the following. async_os_initialize()
** is called when the asynchronous VFS is first installed, and
** async_os_shutdown() is called when it is uninstalled (from within
** sqlite3async_shutdown()).
221**
222** For pthreads builds, both of these functions are no-ops. For win32,
223** they provide an opportunity to initialize and finalize the required
224** mutex and condition variables.
225**
226** If async_os_initialize() returns other than zero, then the initialization
227** fails and SQLITE_ERROR is returned to the user.
228*/
229static int async_os_initialize(void);
230static void async_os_shutdown(void);
231
232/* Values for use as the 'eMutex' argument of the above functions. The
233** integer values assigned to these constants are important for assert()
234** statements that verify that mutexes are locked in the correct order.
235** Specifically, it is unsafe to try to lock mutex N while holding a lock
236** on mutex M if (M<=N).
237*/
238#define ASYNC_MUTEX_LOCK 0
239#define ASYNC_MUTEX_QUEUE 1
240#define ASYNC_MUTEX_WRITER 2
241
242/* Values for use as the 'eCond' argument of the above functions. */
243#define ASYNC_COND_QUEUE 0
244
245/*************************************************************************
246** Start of OS specific code.
247*/
248#if SQLITE_OS_WIN || defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__)
249
shaneeb4ac062009-04-30 17:45:33 +0000250#include <windows.h>
251
danielk1977a3f06592009-04-23 14:58:39 +0000252/* The following block contains the win32 specific code. */
253
254#define mutex_held(X) (GetCurrentThreadId()==primitives.aHolder[X])
255
256static struct AsyncPrimitives {
257 int isInit;
258 DWORD aHolder[3];
259 CRITICAL_SECTION aMutex[3];
260 HANDLE aCond[1];
261} primitives = { 0 };
262
263static int async_os_initialize(void){
264 if( !primitives.isInit ){
265 primitives.aCond[0] = CreateEvent(NULL, TRUE, FALSE, 0);
266 if( primitives.aCond[0]==NULL ){
267 return 1;
268 }
269 InitializeCriticalSection(&primitives.aMutex[0]);
270 InitializeCriticalSection(&primitives.aMutex[1]);
271 InitializeCriticalSection(&primitives.aMutex[2]);
272 primitives.isInit = 1;
273 }
274 return 0;
275}
276static void async_os_shutdown(void){
277 if( primitives.isInit ){
278 DeleteCriticalSection(&primitives.aMutex[0]);
279 DeleteCriticalSection(&primitives.aMutex[1]);
280 DeleteCriticalSection(&primitives.aMutex[2]);
281 CloseHandle(primitives.aCond[0]);
282 primitives.isInit = 0;
283 }
284}
285
286/* The following block contains the Win32 specific code. */
287static void async_mutex_enter(int eMutex){
288 assert( eMutex==0 || eMutex==1 || eMutex==2 );
289 assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
290 assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
291 assert( eMutex!=0 || (!mutex_held(0)) );
292 EnterCriticalSection(&primitives.aMutex[eMutex]);
293 TESTONLY( primitives.aHolder[eMutex] = GetCurrentThreadId(); )
294}
295static void async_mutex_leave(int eMutex){
296 assert( eMutex==0 || eMutex==1 || eMutex==2 );
297 assert( mutex_held(eMutex) );
298 TESTONLY( primitives.aHolder[eMutex] = 0; )
299 LeaveCriticalSection(&primitives.aMutex[eMutex]);
300}
301static void async_cond_wait(int eCond, int eMutex){
302 ResetEvent(primitives.aCond[eCond]);
303 async_mutex_leave(eMutex);
304 WaitForSingleObject(primitives.aCond[eCond], INFINITE);
305 async_mutex_enter(eMutex);
306}
307static void async_cond_signal(int eCond){
308 assert( mutex_held(ASYNC_MUTEX_QUEUE) );
309 SetEvent(primitives.aCond[eCond]);
310}
311static void async_sched_yield(void){
shanea3628d12009-04-29 18:11:59 +0000312 Sleep(0);
danielk1977a3f06592009-04-23 14:58:39 +0000313}
314#else
315
316/* The following block contains the pthreads specific code. */
317#include <pthread.h>
318#include <sched.h>
319
320#define mutex_held(X) pthread_equal(primitives.aHolder[X], pthread_self())
321
/*
** Pthreads: the primitives are statically initialized, so there is
** nothing to set up. Always returns 0 (success).
*/
static int async_os_initialize(void){
  return 0;
}
/*
** Pthreads: nothing to tear down.
*/
static void async_os_shutdown(void){
  /* no-op */
}
324
/*
** Pthreads synchronization objects shared by the porting functions
** below. Everything is statically initialized.
*/
static struct AsyncPrimitives {
  pthread_mutex_t aMutex[3];    /* Indexed by the ASYNC_MUTEX_XXX values */
  pthread_cond_t aCond[1];      /* Indexed by the ASYNC_COND_XXX values */
  pthread_t aHolder[3];         /* Thread holding each mutex, or 0.
                                ** Maintained only when NDEBUG is undefined */
} primitives = {
  /* aMutex[] */
  { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER },
  /* aCond[] */
  { PTHREAD_COND_INITIALIZER },
  /* aHolder[] */
  { 0, 0, 0 }
};
337
338static void async_mutex_enter(int eMutex){
339 assert( eMutex==0 || eMutex==1 || eMutex==2 );
340 assert( eMutex!=2 || (!mutex_held(0) && !mutex_held(1) && !mutex_held(2)) );
341 assert( eMutex!=1 || (!mutex_held(0) && !mutex_held(1)) );
342 assert( eMutex!=0 || (!mutex_held(0)) );
343 pthread_mutex_lock(&primitives.aMutex[eMutex]);
344 TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
345}
346static void async_mutex_leave(int eMutex){
347 assert( eMutex==0 || eMutex==1 || eMutex==2 );
348 assert( mutex_held(eMutex) );
349 TESTONLY( primitives.aHolder[eMutex] = 0; )
350 pthread_mutex_unlock(&primitives.aMutex[eMutex]);
351}
352static void async_cond_wait(int eCond, int eMutex){
353 assert( eMutex==0 || eMutex==1 || eMutex==2 );
354 assert( mutex_held(eMutex) );
355 TESTONLY( primitives.aHolder[eMutex] = 0; )
356 pthread_cond_wait(&primitives.aCond[eCond], &primitives.aMutex[eMutex]);
357 TESTONLY( primitives.aHolder[eMutex] = pthread_self(); )
358}
359static void async_cond_signal(int eCond){
360 assert( mutex_held(ASYNC_MUTEX_QUEUE) );
361 pthread_cond_signal(&primitives.aCond[eCond]);
362}
/*
** Pthreads: hint to the scheduler that the current thread may be moved
** off the CPU. The result of sched_yield() is deliberately ignored.
*/
static void async_sched_yield(void){
  (void)sched_yield();
}
366#endif
367/*
368** End of OS specific code.
369*************************************************************************/
370
371#define assert_mutex_is_held(X) assert( mutex_held(X) )
372
373
374#ifndef SQLITE_ASYNC_TWO_FILEHANDLES
375/* #define SQLITE_ASYNC_TWO_FILEHANDLES 0 */
376#define SQLITE_ASYNC_TWO_FILEHANDLES 1
377#endif
378
379/*
380** State information is held in the static variable "async" defined
381** as the following structure.
382**
383** Both async.ioError and async.nFile are protected by async.queueMutex.
384*/
385static struct TestAsyncStaticData {
386 AsyncWrite *pQueueFirst; /* Next write operation to be processed */
387 AsyncWrite *pQueueLast; /* Last write operation on the list */
388 AsyncLock *pLock; /* Linked list of all AsyncLock structures */
389 volatile int ioDelay; /* Extra delay between write operations */
390 volatile int eHalt; /* One of the SQLITEASYNC_HALT_XXX values */
danielk19774598b8e2009-04-24 10:13:05 +0000391 volatile int bLockFiles; /* Current value of "lockfiles" parameter */
danielk1977a3f06592009-04-23 14:58:39 +0000392 int ioError; /* True if an IO error has occurred */
393 int nFile; /* Number of open files (from sqlite pov) */
danielk19774598b8e2009-04-24 10:13:05 +0000394} async = { 0,0,0,0,0,1,0,0 };
danielk1977a3f06592009-04-23 14:58:39 +0000395
396/* Possible values of AsyncWrite.op */
397#define ASYNC_NOOP 0
398#define ASYNC_WRITE 1
399#define ASYNC_SYNC 2
400#define ASYNC_TRUNCATE 3
401#define ASYNC_CLOSE 4
402#define ASYNC_DELETE 5
403#define ASYNC_OPENEXCLUSIVE 6
404#define ASYNC_UNLOCK 7
405
406/* Names of opcodes. Used for debugging only.
407** Make sure these stay in sync with the macros above!
408*/
/* Printable opcode names, indexed by the ASYNC_XXX opcode value. */
static const char *azOpcodeName[] = {
  "NOOP",        /* 0: ASYNC_NOOP */
  "WRITE",       /* 1: ASYNC_WRITE */
  "SYNC",        /* 2: ASYNC_SYNC */
  "TRUNCATE",    /* 3: ASYNC_TRUNCATE */
  "CLOSE",       /* 4: ASYNC_CLOSE */
  "DELETE",      /* 5: ASYNC_DELETE */
  "OPENEX",      /* 6: ASYNC_OPENEXCLUSIVE */
  "UNLOCK"       /* 7: ASYNC_UNLOCK */
};
412
413/*
414** Entries on the write-op queue are instances of the AsyncWrite
415** structure, defined here.
416**
417** The interpretation of the iOffset and nByte variables varies depending
418** on the value of AsyncWrite.op:
419**
420** ASYNC_NOOP:
421** No values used.
422**
423** ASYNC_WRITE:
424** iOffset -> Offset in file to write to.
425** nByte -> Number of bytes of data to write (pointed to by zBuf).
426**
427** ASYNC_SYNC:
428** nByte -> flags to pass to sqlite3OsSync().
429**
430** ASYNC_TRUNCATE:
431** iOffset -> Size to truncate file to.
432** nByte -> Unused.
433**
434** ASYNC_CLOSE:
435** iOffset -> Unused.
436** nByte -> Unused.
437**
438** ASYNC_DELETE:
439** iOffset -> Contains the "syncDir" flag.
440** nByte -> Number of bytes of zBuf points to (file name).
441**
442** ASYNC_OPENEXCLUSIVE:
443** iOffset -> Value of "delflag".
444** nByte -> Number of bytes of zBuf points to (file name).
445**
446** ASYNC_UNLOCK:
447** nByte -> Argument to sqlite3OsUnlock().
448**
449**
450** For an ASYNC_WRITE operation, zBuf points to the data to write to the file.
451** This space is sqlite3_malloc()d along with the AsyncWrite structure in a
452** single blob, so is deleted when sqlite3_free() is called on the parent
453** structure.
454*/
455struct AsyncWrite {
456 AsyncFileData *pFileData; /* File to write data to or sync */
457 int op; /* One of ASYNC_xxx etc. */
458 sqlite_int64 iOffset; /* See above */
459 int nByte; /* See above */
460 char *zBuf; /* Data to write to file (or NULL if op!=ASYNC_WRITE) */
461 AsyncWrite *pNext; /* Next write operation (to any file) */
462};
463
464/*
** An instance of this structure is created for each distinct open file
** (i.e. if two handles are opened on the one file, only one of these
** structures is allocated) and stored in the linked list headed by
** async.pLock. Entries in the list are identified by the full pathnames
** of the opened files.
469**
470** AsyncLock.pList points to the head of a linked list of AsyncFileLock
471** structures, one for each handle currently open on the file.
472**
473** If the opened file is not a main-database (the SQLITE_OPEN_MAIN_DB is
danielk19774598b8e2009-04-24 10:13:05 +0000474** not passed to the sqlite3OsOpen() call), or if async.bLockFiles is
475** false, variables AsyncLock.pFile and AsyncLock.eLock are never used.
476** Otherwise, pFile is a file handle opened on the file in question and
477** used to obtain the file-system locks required by database connections
478** within this process.
danielk1977a3f06592009-04-23 14:58:39 +0000479**
480** See comments above the asyncLock() function for more details on
481** the implementation of database locking used by this backend.
482*/
483struct AsyncLock {
484 char *zFile;
485 int nFile;
486 sqlite3_file *pFile;
487 int eLock;
488 AsyncFileLock *pList;
489 AsyncLock *pNext; /* Next in linked list headed by async.pLock */
490};
491
492/*
493** An instance of the following structure is allocated along with each
494** AsyncFileData structure (see AsyncFileData.lock), but is only used if the
495** file was opened with the SQLITE_OPEN_MAIN_DB.
496*/
497struct AsyncFileLock {
498 int eLock; /* Internally visible lock state (sqlite pov) */
499 int eAsyncLock; /* Lock-state with write-queue unlock */
500 AsyncFileLock *pNext;
501};
502
503/*
504** The AsyncFile structure is a subclass of sqlite3_file used for
505** asynchronous IO.
506**
507** All of the actual data for the structure is stored in the structure
508** pointed to by AsyncFile.pData, which is allocated as part of the
509** sqlite3OsOpen() using sqlite3_malloc(). The reason for this is that the
510** lifetime of the AsyncFile structure is ended by the caller after OsClose()
511** is called, but the data in AsyncFileData may be required by the
512** writer thread after that point.
513*/
514struct AsyncFile {
515 sqlite3_io_methods *pMethod;
516 AsyncFileData *pData;
517};
518struct AsyncFileData {
519 char *zName; /* Underlying OS filename - used for debugging */
520 int nName; /* Number of characters in zName */
521 sqlite3_file *pBaseRead; /* Read handle to the underlying Os file */
522 sqlite3_file *pBaseWrite; /* Write handle to the underlying Os file */
523 AsyncFileLock lock; /* Lock state for this handle */
524 AsyncLock *pLock; /* AsyncLock object for this file system entry */
525 AsyncWrite closeOp; /* Preallocated close operation */
526};
527
528/*
529** Add an entry to the end of the global write-op list. pWrite should point
530** to an AsyncWrite structure allocated using sqlite3_malloc(). The writer
531** thread will call sqlite3_free() to free the structure after the specified
532** operation has been completed.
533**
534** Once an AsyncWrite structure has been added to the list, it becomes the
535** property of the writer thread and must not be read or modified by the
536** caller.
537*/
538static void addAsyncWrite(AsyncWrite *pWrite){
539 /* We must hold the queue mutex in order to modify the queue pointers */
540 if( pWrite->op!=ASYNC_UNLOCK ){
541 async_mutex_enter(ASYNC_MUTEX_QUEUE);
542 }
543
544 /* Add the record to the end of the write-op queue */
545 assert( !pWrite->pNext );
546 if( async.pQueueLast ){
547 assert( async.pQueueFirst );
548 async.pQueueLast->pNext = pWrite;
549 }else{
550 async.pQueueFirst = pWrite;
551 }
552 async.pQueueLast = pWrite;
553 ASYNC_TRACE(("PUSH %p (%s %s %d)\n", pWrite, azOpcodeName[pWrite->op],
554 pWrite->pFileData ? pWrite->pFileData->zName : "-", pWrite->iOffset));
555
556 if( pWrite->op==ASYNC_CLOSE ){
557 async.nFile--;
558 }
559
560 /* The writer thread might have been idle because there was nothing
561 ** on the write-op queue for it to do. So wake it up. */
562 async_cond_signal(ASYNC_COND_QUEUE);
563
564 /* Drop the queue mutex */
565 if( pWrite->op!=ASYNC_UNLOCK ){
566 async_mutex_leave(ASYNC_MUTEX_QUEUE);
567 }
568}
569
570/*
571** Increment async.nFile in a thread-safe manner.
572*/
573static void incrOpenFileCount(void){
574 /* We must hold the queue mutex in order to modify async.nFile */
575 async_mutex_enter(ASYNC_MUTEX_QUEUE);
576 if( async.nFile==0 ){
577 async.ioError = SQLITE_OK;
578 }
579 async.nFile++;
580 async_mutex_leave(ASYNC_MUTEX_QUEUE);
581}
582
583/*
584** This is a utility function to allocate and populate a new AsyncWrite
585** structure and insert it (via addAsyncWrite() ) into the global list.
586*/
587static int addNewAsyncWrite(
588 AsyncFileData *pFileData,
589 int op,
590 sqlite3_int64 iOffset,
591 int nByte,
592 const char *zByte
593){
594 AsyncWrite *p;
595 if( op!=ASYNC_CLOSE && async.ioError ){
596 return async.ioError;
597 }
598 p = sqlite3_malloc(sizeof(AsyncWrite) + (zByte?nByte:0));
599 if( !p ){
600 /* The upper layer does not expect operations like OsWrite() to
601 ** return SQLITE_NOMEM. This is partly because under normal conditions
602 ** SQLite is required to do rollback without calling malloc(). So
603 ** if malloc() fails here, treat it as an I/O error. The above
604 ** layer knows how to handle that.
605 */
606 return SQLITE_IOERR;
607 }
608 p->op = op;
609 p->iOffset = iOffset;
610 p->nByte = nByte;
611 p->pFileData = pFileData;
612 p->pNext = 0;
613 if( zByte ){
614 p->zBuf = (char *)&p[1];
615 memcpy(p->zBuf, zByte, nByte);
616 }else{
617 p->zBuf = 0;
618 }
619 addAsyncWrite(p);
620 return SQLITE_OK;
621}
622
623/*
624** Close the file. This just adds an entry to the write-op list, the file is
625** not actually closed.
626*/
627static int asyncClose(sqlite3_file *pFile){
628 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
629
630 /* Unlock the file, if it is locked */
631 async_mutex_enter(ASYNC_MUTEX_LOCK);
632 p->lock.eLock = 0;
633 async_mutex_leave(ASYNC_MUTEX_LOCK);
634
635 addAsyncWrite(&p->closeOp);
636 return SQLITE_OK;
637}
638
639/*
** Implementation of sqlite3OsWrite() for asynchronous files. Instead of
** writing to the underlying file, this function adds an entry to the end of
** the global AsyncWrite list. Returns SQLITE_OK on success, the stored
** async.ioError code if an earlier I/O error has been recorded, or
** SQLITE_IOERR if the queue entry cannot be allocated.
644*/
645static int asyncWrite(
646 sqlite3_file *pFile,
647 const void *pBuf,
648 int amt,
649 sqlite3_int64 iOff
650){
651 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
652 return addNewAsyncWrite(p, ASYNC_WRITE, iOff, amt, pBuf);
653}
654
655/*
656** Read data from the file. First we read from the filesystem, then adjust
657** the contents of the buffer based on ASYNC_WRITE operations in the
658** write-op queue.
659**
660** This method holds the mutex from start to finish.
661*/
662static int asyncRead(
663 sqlite3_file *pFile,
664 void *zOut,
665 int iAmt,
666 sqlite3_int64 iOffset
667){
668 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
669 int rc = SQLITE_OK;
dan78f1e532010-07-07 11:05:21 +0000670 sqlite3_int64 filesize = 0;
danielk1977a3f06592009-04-23 14:58:39 +0000671 sqlite3_file *pBase = p->pBaseRead;
danfd3b2222009-10-19 07:50:25 +0000672 sqlite3_int64 iAmt64 = (sqlite3_int64)iAmt;
danielk1977a3f06592009-04-23 14:58:39 +0000673
674 /* Grab the write queue mutex for the duration of the call */
675 async_mutex_enter(ASYNC_MUTEX_QUEUE);
676
677 /* If an I/O error has previously occurred in this virtual file
678 ** system, then all subsequent operations fail.
679 */
680 if( async.ioError!=SQLITE_OK ){
681 rc = async.ioError;
682 goto asyncread_out;
683 }
684
685 if( pBase->pMethods ){
danfd3b2222009-10-19 07:50:25 +0000686 sqlite3_int64 nRead;
danielk1977a3f06592009-04-23 14:58:39 +0000687 rc = pBase->pMethods->xFileSize(pBase, &filesize);
688 if( rc!=SQLITE_OK ){
689 goto asyncread_out;
690 }
danfd3b2222009-10-19 07:50:25 +0000691 nRead = MIN(filesize - iOffset, iAmt64);
danielk1977a3f06592009-04-23 14:58:39 +0000692 if( nRead>0 ){
693 rc = pBase->pMethods->xRead(pBase, zOut, nRead, iOffset);
694 ASYNC_TRACE(("READ %s %d bytes at %d\n", p->zName, nRead, iOffset));
695 }
696 }
697
698 if( rc==SQLITE_OK ){
699 AsyncWrite *pWrite;
700 char *zName = p->zName;
701
702 for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
703 if( pWrite->op==ASYNC_WRITE && (
704 (pWrite->pFileData==p) ||
705 (zName && pWrite->pFileData->zName==zName)
706 )){
danfd3b2222009-10-19 07:50:25 +0000707 sqlite3_int64 nCopy;
708 sqlite3_int64 nByte64 = (sqlite3_int64)pWrite->nByte;
dan78f1e532010-07-07 11:05:21 +0000709 filesize = MAX(filesize, pWrite->iOffset+nByte64);
danfd3b2222009-10-19 07:50:25 +0000710
711 /* Set variable iBeginIn to the offset in buffer pWrite->zBuf[] from
712 ** which data should be copied. Set iBeginOut to the offset within
713 ** the output buffer to which data should be copied. If either of
714 ** these offsets is a negative number, set them to 0.
715 */
shanea3628d12009-04-29 18:11:59 +0000716 sqlite3_int64 iBeginOut = (pWrite->iOffset-iOffset);
717 sqlite3_int64 iBeginIn = -iBeginOut;
danielk1977a3f06592009-04-23 14:58:39 +0000718 if( iBeginIn<0 ) iBeginIn = 0;
719 if( iBeginOut<0 ) iBeginOut = 0;
danielk1977a3f06592009-04-23 14:58:39 +0000720
danfd3b2222009-10-19 07:50:25 +0000721 nCopy = MIN(nByte64-iBeginIn, iAmt64-iBeginOut);
danielk1977a3f06592009-04-23 14:58:39 +0000722 if( nCopy>0 ){
723 memcpy(&((char *)zOut)[iBeginOut], &pWrite->zBuf[iBeginIn], nCopy);
724 ASYNC_TRACE(("OVERREAD %d bytes at %d\n", nCopy, iBeginOut+iOffset));
725 }
726 }
727 }
728 }
729
730asyncread_out:
731 async_mutex_leave(ASYNC_MUTEX_QUEUE);
dan78f1e532010-07-07 11:05:21 +0000732 if( rc==SQLITE_OK && filesize<(iOffset+iAmt) ){
733 rc = SQLITE_IOERR_SHORT_READ;
734 }
danielk1977a3f06592009-04-23 14:58:39 +0000735 return rc;
736}
737
738/*
739** Truncate the file to nByte bytes in length. This just adds an entry to
740** the write-op list, no IO actually takes place.
741*/
742static int asyncTruncate(sqlite3_file *pFile, sqlite3_int64 nByte){
743 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
744 return addNewAsyncWrite(p, ASYNC_TRUNCATE, nByte, 0, 0);
745}
746
747/*
748** Sync the file. This just adds an entry to the write-op list, the
749** sync() is done later by sqlite3_async_flush().
750*/
751static int asyncSync(sqlite3_file *pFile, int flags){
752 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
753 return addNewAsyncWrite(p, ASYNC_SYNC, 0, flags, 0);
754}
755
756/*
757** Read the size of the file. First we read the size of the file system
758** entry, then adjust for any ASYNC_WRITE or ASYNC_TRUNCATE operations
759** currently in the write-op list.
760**
761** This method holds the mutex from start to finish.
762*/
763int asyncFileSize(sqlite3_file *pFile, sqlite3_int64 *piSize){
764 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
765 int rc = SQLITE_OK;
766 sqlite3_int64 s = 0;
767 sqlite3_file *pBase;
768
769 async_mutex_enter(ASYNC_MUTEX_QUEUE);
770
shaneeb4ac062009-04-30 17:45:33 +0000771 /* Read the filesystem size from the base file. If pMethods is NULL, this
danielk1977a3f06592009-04-23 14:58:39 +0000772 ** means the file hasn't been opened yet. In this case all relevant data
773 ** must be in the write-op queue anyway, so we can omit reading from the
774 ** file-system.
775 */
776 pBase = p->pBaseRead;
777 if( pBase->pMethods ){
778 rc = pBase->pMethods->xFileSize(pBase, &s);
779 }
780
781 if( rc==SQLITE_OK ){
782 AsyncWrite *pWrite;
783 for(pWrite=async.pQueueFirst; pWrite; pWrite = pWrite->pNext){
784 if( pWrite->op==ASYNC_DELETE
785 && p->zName
786 && strcmp(p->zName, pWrite->zBuf)==0
787 ){
788 s = 0;
789 }else if( pWrite->pFileData && (
790 (pWrite->pFileData==p)
791 || (p->zName && pWrite->pFileData->zName==p->zName)
792 )){
793 switch( pWrite->op ){
794 case ASYNC_WRITE:
795 s = MAX(pWrite->iOffset + (sqlite3_int64)(pWrite->nByte), s);
796 break;
797 case ASYNC_TRUNCATE:
798 s = MIN(s, pWrite->iOffset);
799 break;
800 }
801 }
802 }
803 *piSize = s;
804 }
805 async_mutex_leave(ASYNC_MUTEX_QUEUE);
806 return rc;
807}
808
809/*
810** Lock or unlock the actual file-system entry.
811*/
812static int getFileLock(AsyncLock *pLock){
813 int rc = SQLITE_OK;
814 AsyncFileLock *pIter;
815 int eRequired = 0;
816
817 if( pLock->pFile ){
818 for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
819 assert(pIter->eAsyncLock>=pIter->eLock);
820 if( pIter->eAsyncLock>eRequired ){
821 eRequired = pIter->eAsyncLock;
822 assert(eRequired>=0 && eRequired<=SQLITE_LOCK_EXCLUSIVE);
823 }
824 }
825
826 if( eRequired>pLock->eLock ){
827 rc = pLock->pFile->pMethods->xLock(pLock->pFile, eRequired);
828 if( rc==SQLITE_OK ){
829 pLock->eLock = eRequired;
830 }
831 }
832 else if( eRequired<pLock->eLock && eRequired<=SQLITE_LOCK_SHARED ){
833 rc = pLock->pFile->pMethods->xUnlock(pLock->pFile, eRequired);
834 if( rc==SQLITE_OK ){
835 pLock->eLock = eRequired;
836 }
837 }
838 }
839
840 return rc;
841}
842
843/*
844** Return the AsyncLock structure from the global async.pLock list
845** associated with the file-system entry identified by path zName
846** (a string of nName bytes). If no such structure exists, return 0.
847*/
848static AsyncLock *findLock(const char *zName, int nName){
849 AsyncLock *p = async.pLock;
850 while( p && (p->nFile!=nName || memcmp(p->zFile, zName, nName)) ){
851 p = p->pNext;
852 }
853 return p;
854}
855
856/*
857** The following two methods - asyncLock() and asyncUnlock() - are used
858** to obtain and release locks on database files opened with the
859** asynchronous backend.
860*/
861static int asyncLock(sqlite3_file *pFile, int eLock){
862 int rc = SQLITE_OK;
863 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
864
865 if( p->zName ){
866 async_mutex_enter(ASYNC_MUTEX_LOCK);
867 if( p->lock.eLock<eLock ){
868 AsyncLock *pLock = p->pLock;
869 AsyncFileLock *pIter;
870 assert(pLock && pLock->pList);
871 for(pIter=pLock->pList; pIter; pIter=pIter->pNext){
872 if( pIter!=&p->lock && (
873 (eLock==SQLITE_LOCK_EXCLUSIVE && pIter->eLock>=SQLITE_LOCK_SHARED) ||
874 (eLock==SQLITE_LOCK_PENDING && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
875 (eLock==SQLITE_LOCK_RESERVED && pIter->eLock>=SQLITE_LOCK_RESERVED) ||
876 (eLock==SQLITE_LOCK_SHARED && pIter->eLock>=SQLITE_LOCK_PENDING)
877 )){
878 rc = SQLITE_BUSY;
879 }
880 }
881 if( rc==SQLITE_OK ){
882 p->lock.eLock = eLock;
883 p->lock.eAsyncLock = MAX(p->lock.eAsyncLock, eLock);
884 }
885 assert(p->lock.eAsyncLock>=p->lock.eLock);
886 if( rc==SQLITE_OK ){
887 rc = getFileLock(pLock);
888 }
889 }
890 async_mutex_leave(ASYNC_MUTEX_LOCK);
891 }
892
893 ASYNC_TRACE(("LOCK %d (%s) rc=%d\n", eLock, p->zName, rc));
894 return rc;
895}
896static int asyncUnlock(sqlite3_file *pFile, int eLock){
897 int rc = SQLITE_OK;
898 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
899 if( p->zName ){
900 AsyncFileLock *pLock = &p->lock;
901 async_mutex_enter(ASYNC_MUTEX_QUEUE);
902 async_mutex_enter(ASYNC_MUTEX_LOCK);
903 pLock->eLock = MIN(pLock->eLock, eLock);
904 rc = addNewAsyncWrite(p, ASYNC_UNLOCK, 0, eLock, 0);
905 async_mutex_leave(ASYNC_MUTEX_LOCK);
906 async_mutex_leave(ASYNC_MUTEX_QUEUE);
907 }
908 return rc;
909}
910
911/*
912** This function is called when the pager layer first opens a database file
913** and is checking for a hot-journal.
914*/
915static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){
916 int ret = 0;
917 AsyncFileLock *pIter;
918 AsyncFileData *p = ((AsyncFile *)pFile)->pData;
919
920 async_mutex_enter(ASYNC_MUTEX_LOCK);
921 for(pIter=p->pLock->pList; pIter; pIter=pIter->pNext){
922 if( pIter->eLock>=SQLITE_LOCK_RESERVED ){
923 ret = 1;
shanea3628d12009-04-29 18:11:59 +0000924 break;
danielk1977a3f06592009-04-23 14:58:39 +0000925 }
926 }
927 async_mutex_leave(ASYNC_MUTEX_LOCK);
928
929 ASYNC_TRACE(("CHECK-LOCK %d (%s)\n", ret, p->zName));
930 *pResOut = ret;
931 return SQLITE_OK;
932}
933
934/*
935** sqlite3_file_control() implementation.
936*/
937static int asyncFileControl(sqlite3_file *id, int op, void *pArg){
938 switch( op ){
939 case SQLITE_FCNTL_LOCKSTATE: {
940 async_mutex_enter(ASYNC_MUTEX_LOCK);
941 *(int*)pArg = ((AsyncFile*)id)->pData->lock.eLock;
942 async_mutex_leave(ASYNC_MUTEX_LOCK);
943 return SQLITE_OK;
944 }
945 }
946 return SQLITE_ERROR;
947}
948
949/*
950** Return the device characteristics and sector-size of the device. It
shanea3628d12009-04-29 18:11:59 +0000951** is tricky to implement these correctly, as this backend might
danielk1977a3f06592009-04-23 14:58:39 +0000952** not have an open file handle at this point.
953*/
954static int asyncSectorSize(sqlite3_file *pFile){
shanea3628d12009-04-29 18:11:59 +0000955 UNUSED_PARAMETER(pFile);
danielk1977a3f06592009-04-23 14:58:39 +0000956 return 512;
957}
958static int asyncDeviceCharacteristics(sqlite3_file *pFile){
shanea3628d12009-04-29 18:11:59 +0000959 UNUSED_PARAMETER(pFile);
danielk1977a3f06592009-04-23 14:58:39 +0000960 return 0;
961}
962
963static int unlinkAsyncFile(AsyncFileData *pData){
964 AsyncFileLock **ppIter;
965 int rc = SQLITE_OK;
966
967 if( pData->zName ){
968 AsyncLock *pLock = pData->pLock;
969 for(ppIter=&pLock->pList; *ppIter; ppIter=&((*ppIter)->pNext)){
970 if( (*ppIter)==&pData->lock ){
971 *ppIter = pData->lock.pNext;
972 break;
973 }
974 }
975 if( !pLock->pList ){
976 AsyncLock **pp;
977 if( pLock->pFile ){
978 pLock->pFile->pMethods->xClose(pLock->pFile);
979 }
980 for(pp=&async.pLock; *pp!=pLock; pp=&((*pp)->pNext));
981 *pp = pLock->pNext;
982 sqlite3_free(pLock);
983 }else{
984 rc = getFileLock(pLock);
985 }
986 }
987
988 return rc;
989}
990
991/*
992** The parameter passed to this function is a copy of a 'flags' parameter
993** passed to this modules xOpen() method. This function returns true
994** if the file should be opened asynchronously, or false if it should
995** be opened immediately.
996**
997** If the file is to be opened asynchronously, then asyncOpen() will add
998** an entry to the event queue and the file will not actually be opened
999** until the event is processed. Otherwise, the file is opened directly
1000** by the caller.
1001*/
1002static int doAsynchronousOpen(int flags){
1003 return (flags&SQLITE_OPEN_CREATE) && (
1004 (flags&SQLITE_OPEN_MAIN_JOURNAL) ||
1005 (flags&SQLITE_OPEN_TEMP_JOURNAL) ||
1006 (flags&SQLITE_OPEN_DELETEONCLOSE)
1007 );
1008}
1009
1010/*
1011** Open a file.
1012*/
1013static int asyncOpen(
1014 sqlite3_vfs *pAsyncVfs,
1015 const char *zName,
1016 sqlite3_file *pFile,
1017 int flags,
1018 int *pOutFlags
1019){
1020 static sqlite3_io_methods async_methods = {
1021 1, /* iVersion */
1022 asyncClose, /* xClose */
1023 asyncRead, /* xRead */
1024 asyncWrite, /* xWrite */
1025 asyncTruncate, /* xTruncate */
1026 asyncSync, /* xSync */
1027 asyncFileSize, /* xFileSize */
1028 asyncLock, /* xLock */
1029 asyncUnlock, /* xUnlock */
1030 asyncCheckReservedLock, /* xCheckReservedLock */
1031 asyncFileControl, /* xFileControl */
1032 asyncSectorSize, /* xSectorSize */
1033 asyncDeviceCharacteristics /* xDeviceCharacteristics */
1034 };
1035
1036 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1037 AsyncFile *p = (AsyncFile *)pFile;
1038 int nName = 0;
1039 int rc = SQLITE_OK;
1040 int nByte;
1041 AsyncFileData *pData;
1042 AsyncLock *pLock = 0;
1043 char *z;
1044 int isAsyncOpen = doAsynchronousOpen(flags);
1045
1046 /* If zName is NULL, then the upper layer is requesting an anonymous file */
1047 if( zName ){
shanea3628d12009-04-29 18:11:59 +00001048 nName = (int)strlen(zName)+1;
danielk1977a3f06592009-04-23 14:58:39 +00001049 }
1050
1051 nByte = (
1052 sizeof(AsyncFileData) + /* AsyncFileData structure */
1053 2 * pVfs->szOsFile + /* AsyncFileData.pBaseRead and pBaseWrite */
1054 nName /* AsyncFileData.zName */
1055 );
1056 z = sqlite3_malloc(nByte);
1057 if( !z ){
1058 return SQLITE_NOMEM;
1059 }
1060 memset(z, 0, nByte);
1061 pData = (AsyncFileData*)z;
1062 z += sizeof(pData[0]);
1063 pData->pBaseRead = (sqlite3_file*)z;
1064 z += pVfs->szOsFile;
1065 pData->pBaseWrite = (sqlite3_file*)z;
1066 pData->closeOp.pFileData = pData;
1067 pData->closeOp.op = ASYNC_CLOSE;
1068
1069 if( zName ){
1070 z += pVfs->szOsFile;
1071 pData->zName = z;
1072 pData->nName = nName;
1073 memcpy(pData->zName, zName, nName);
1074 }
1075
1076 if( !isAsyncOpen ){
1077 int flagsout;
1078 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, &flagsout);
danielk19775368f292009-07-18 11:52:04 +00001079 if( rc==SQLITE_OK
1080 && (flagsout&SQLITE_OPEN_READWRITE)
1081 && (flags&SQLITE_OPEN_EXCLUSIVE)==0
1082 ){
danielk1977a3f06592009-04-23 14:58:39 +00001083 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseWrite, flags, 0);
1084 }
1085 if( pOutFlags ){
1086 *pOutFlags = flagsout;
1087 }
1088 }
1089
1090 async_mutex_enter(ASYNC_MUTEX_LOCK);
1091
1092 if( zName && rc==SQLITE_OK ){
1093 pLock = findLock(pData->zName, pData->nName);
1094 if( !pLock ){
1095 int nByte = pVfs->szOsFile + sizeof(AsyncLock) + pData->nName + 1;
1096 pLock = (AsyncLock *)sqlite3_malloc(nByte);
1097 if( pLock ){
1098 memset(pLock, 0, nByte);
danielk19774598b8e2009-04-24 10:13:05 +00001099 if( async.bLockFiles && (flags&SQLITE_OPEN_MAIN_DB) ){
danielk1977a3f06592009-04-23 14:58:39 +00001100 pLock->pFile = (sqlite3_file *)&pLock[1];
1101 rc = pVfs->xOpen(pVfs, pData->zName, pLock->pFile, flags, 0);
1102 if( rc!=SQLITE_OK ){
1103 sqlite3_free(pLock);
1104 pLock = 0;
1105 }
1106 }
danielk1977a3f06592009-04-23 14:58:39 +00001107 if( pLock ){
1108 pLock->nFile = pData->nName;
1109 pLock->zFile = &((char *)(&pLock[1]))[pVfs->szOsFile];
1110 memcpy(pLock->zFile, pData->zName, pLock->nFile);
1111 pLock->pNext = async.pLock;
1112 async.pLock = pLock;
1113 }
1114 }else{
1115 rc = SQLITE_NOMEM;
1116 }
1117 }
1118 }
1119
1120 if( rc==SQLITE_OK ){
1121 p->pMethod = &async_methods;
1122 p->pData = pData;
1123
1124 /* Link AsyncFileData.lock into the linked list of
1125 ** AsyncFileLock structures for this file.
1126 */
1127 if( zName ){
1128 pData->lock.pNext = pLock->pList;
1129 pLock->pList = &pData->lock;
1130 pData->zName = pLock->zFile;
1131 }
1132 }else{
1133 if( pData->pBaseRead->pMethods ){
1134 pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1135 }
1136 if( pData->pBaseWrite->pMethods ){
1137 pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1138 }
1139 sqlite3_free(pData);
1140 }
1141
1142 async_mutex_leave(ASYNC_MUTEX_LOCK);
1143
1144 if( rc==SQLITE_OK ){
1145 incrOpenFileCount();
1146 pData->pLock = pLock;
1147 }
1148
1149 if( rc==SQLITE_OK && isAsyncOpen ){
1150 rc = addNewAsyncWrite(pData, ASYNC_OPENEXCLUSIVE, (sqlite3_int64)flags,0,0);
1151 if( rc==SQLITE_OK ){
1152 if( pOutFlags ) *pOutFlags = flags;
1153 }else{
1154 async_mutex_enter(ASYNC_MUTEX_LOCK);
1155 unlinkAsyncFile(pData);
1156 async_mutex_leave(ASYNC_MUTEX_LOCK);
1157 sqlite3_free(pData);
1158 }
1159 }
1160 if( rc!=SQLITE_OK ){
1161 p->pMethod = 0;
1162 }
1163 return rc;
1164}
1165
1166/*
1167** Implementation of sqlite3OsDelete. Add an entry to the end of the
1168** write-op queue to perform the delete.
1169*/
1170static int asyncDelete(sqlite3_vfs *pAsyncVfs, const char *z, int syncDir){
shanea3628d12009-04-29 18:11:59 +00001171 UNUSED_PARAMETER(pAsyncVfs);
1172 return addNewAsyncWrite(0, ASYNC_DELETE, syncDir, (int)strlen(z)+1, z);
danielk1977a3f06592009-04-23 14:58:39 +00001173}
1174
1175/*
1176** Implementation of sqlite3OsAccess. This method holds the mutex from
1177** start to finish.
1178*/
1179static int asyncAccess(
1180 sqlite3_vfs *pAsyncVfs,
1181 const char *zName,
1182 int flags,
1183 int *pResOut
1184){
1185 int rc;
1186 int ret;
1187 AsyncWrite *p;
1188 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1189
1190 assert(flags==SQLITE_ACCESS_READWRITE
1191 || flags==SQLITE_ACCESS_READ
1192 || flags==SQLITE_ACCESS_EXISTS
1193 );
1194
1195 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1196 rc = pVfs->xAccess(pVfs, zName, flags, &ret);
1197 if( rc==SQLITE_OK && flags==SQLITE_ACCESS_EXISTS ){
1198 for(p=async.pQueueFirst; p; p = p->pNext){
1199 if( p->op==ASYNC_DELETE && 0==strcmp(p->zBuf, zName) ){
1200 ret = 0;
1201 }else if( p->op==ASYNC_OPENEXCLUSIVE
1202 && p->pFileData->zName
1203 && 0==strcmp(p->pFileData->zName, zName)
1204 ){
1205 ret = 1;
1206 }
1207 }
1208 }
1209 ASYNC_TRACE(("ACCESS(%s): %s = %d\n",
1210 flags==SQLITE_ACCESS_READWRITE?"read-write":
1211 flags==SQLITE_ACCESS_READ?"read":"exists"
1212 , zName, ret)
1213 );
1214 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1215 *pResOut = ret;
1216 return rc;
1217}
1218
1219/*
1220** Fill in zPathOut with the full path to the file identified by zPath.
1221*/
1222static int asyncFullPathname(
1223 sqlite3_vfs *pAsyncVfs,
1224 const char *zPath,
1225 int nPathOut,
1226 char *zPathOut
1227){
1228 int rc;
1229 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1230 rc = pVfs->xFullPathname(pVfs, zPath, nPathOut, zPathOut);
1231
1232 /* Because of the way intra-process file locking works, this backend
1233 ** needs to return a canonical path. The following block assumes the
1234 ** file-system uses unix style paths.
1235 */
1236 if( rc==SQLITE_OK ){
1237 int i, j;
danielk1977a3f06592009-04-23 14:58:39 +00001238 char *z = zPathOut;
dan19125aa2009-12-02 18:16:56 +00001239 int n = strlen(z);
danielk1977a3f06592009-04-23 14:58:39 +00001240 while( n>1 && z[n-1]=='/' ){ n--; }
1241 for(i=j=0; i<n; i++){
1242 if( z[i]=='/' ){
1243 if( z[i+1]=='/' ) continue;
1244 if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
1245 i += 1;
1246 continue;
1247 }
1248 if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
1249 while( j>0 && z[j-1]!='/' ){ j--; }
1250 if( j>0 ){ j--; }
1251 i += 2;
1252 continue;
1253 }
1254 }
1255 z[j++] = z[i];
1256 }
1257 z[j] = 0;
1258 }
1259
1260 return rc;
1261}
1262static void *asyncDlOpen(sqlite3_vfs *pAsyncVfs, const char *zPath){
1263 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1264 return pVfs->xDlOpen(pVfs, zPath);
1265}
1266static void asyncDlError(sqlite3_vfs *pAsyncVfs, int nByte, char *zErrMsg){
1267 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1268 pVfs->xDlError(pVfs, nByte, zErrMsg);
1269}
1270static void (*asyncDlSym(
1271 sqlite3_vfs *pAsyncVfs,
1272 void *pHandle,
1273 const char *zSymbol
1274))(void){
1275 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1276 return pVfs->xDlSym(pVfs, pHandle, zSymbol);
1277}
1278static void asyncDlClose(sqlite3_vfs *pAsyncVfs, void *pHandle){
1279 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1280 pVfs->xDlClose(pVfs, pHandle);
1281}
1282static int asyncRandomness(sqlite3_vfs *pAsyncVfs, int nByte, char *zBufOut){
1283 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1284 return pVfs->xRandomness(pVfs, nByte, zBufOut);
1285}
1286static int asyncSleep(sqlite3_vfs *pAsyncVfs, int nMicro){
1287 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1288 return pVfs->xSleep(pVfs, nMicro);
1289}
1290static int asyncCurrentTime(sqlite3_vfs *pAsyncVfs, double *pTimeOut){
1291 sqlite3_vfs *pVfs = (sqlite3_vfs *)pAsyncVfs->pAppData;
1292 return pVfs->xCurrentTime(pVfs, pTimeOut);
1293}
1294
1295static sqlite3_vfs async_vfs = {
1296 1, /* iVersion */
1297 sizeof(AsyncFile), /* szOsFile */
1298 0, /* mxPathname */
1299 0, /* pNext */
1300 SQLITEASYNC_VFSNAME, /* zName */
1301 0, /* pAppData */
1302 asyncOpen, /* xOpen */
1303 asyncDelete, /* xDelete */
1304 asyncAccess, /* xAccess */
1305 asyncFullPathname, /* xFullPathname */
1306 asyncDlOpen, /* xDlOpen */
1307 asyncDlError, /* xDlError */
1308 asyncDlSym, /* xDlSym */
1309 asyncDlClose, /* xDlClose */
1310 asyncRandomness, /* xDlError */
1311 asyncSleep, /* xDlSym */
1312 asyncCurrentTime /* xDlClose */
1313};
1314
1315/*
1316** This procedure runs in a separate thread, reading messages off of the
1317** write queue and processing them one by one.
1318**
1319** If async.writerHaltNow is true, then this procedure exits
1320** after processing a single message.
1321**
1322** If async.writerHaltWhenIdle is true, then this procedure exits when
1323** the write queue is empty.
1324**
1325** If both of the above variables are false, this procedure runs
1326** indefinately, waiting for operations to be added to the write queue
1327** and processing them in the order in which they arrive.
1328**
1329** An artifical delay of async.ioDelay milliseconds is inserted before
1330** each write operation in order to simulate the effect of a slow disk.
1331**
1332** Only one instance of this procedure may be running at a time.
1333*/
1334static void asyncWriterThread(void){
1335 sqlite3_vfs *pVfs = (sqlite3_vfs *)(async_vfs.pAppData);
1336 AsyncWrite *p = 0;
1337 int rc = SQLITE_OK;
1338 int holdingMutex = 0;
1339
1340 async_mutex_enter(ASYNC_MUTEX_WRITER);
1341
1342 while( async.eHalt!=SQLITEASYNC_HALT_NOW ){
1343 int doNotFree = 0;
1344 sqlite3_file *pBase = 0;
1345
1346 if( !holdingMutex ){
1347 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1348 }
1349 while( (p = async.pQueueFirst)==0 ){
1350 if( async.eHalt!=SQLITEASYNC_HALT_NEVER ){
1351 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1352 break;
1353 }else{
1354 ASYNC_TRACE(("IDLE\n"));
1355 async_cond_wait(ASYNC_COND_QUEUE, ASYNC_MUTEX_QUEUE);
1356 ASYNC_TRACE(("WAKEUP\n"));
1357 }
1358 }
1359 if( p==0 ) break;
1360 holdingMutex = 1;
1361
1362 /* Right now this thread is holding the mutex on the write-op queue.
1363 ** Variable 'p' points to the first entry in the write-op queue. In
1364 ** the general case, we hold on to the mutex for the entire body of
1365 ** the loop.
1366 **
1367 ** However in the cases enumerated below, we relinquish the mutex,
1368 ** perform the IO, and then re-request the mutex before removing 'p' from
1369 ** the head of the write-op queue. The idea is to increase concurrency with
1370 ** sqlite threads.
1371 **
1372 ** * An ASYNC_CLOSE operation.
1373 ** * An ASYNC_OPENEXCLUSIVE operation. For this one, we relinquish
1374 ** the mutex, call the underlying xOpenExclusive() function, then
1375 ** re-aquire the mutex before seting the AsyncFile.pBaseRead
1376 ** variable.
1377 ** * ASYNC_SYNC and ASYNC_WRITE operations, if
1378 ** SQLITE_ASYNC_TWO_FILEHANDLES was set at compile time and two
1379 ** file-handles are open for the particular file being "synced".
1380 */
1381 if( async.ioError!=SQLITE_OK && p->op!=ASYNC_CLOSE ){
1382 p->op = ASYNC_NOOP;
1383 }
1384 if( p->pFileData ){
1385 pBase = p->pFileData->pBaseWrite;
1386 if(
1387 p->op==ASYNC_CLOSE ||
1388 p->op==ASYNC_OPENEXCLUSIVE ||
1389 (pBase->pMethods && (p->op==ASYNC_SYNC || p->op==ASYNC_WRITE) )
1390 ){
1391 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1392 holdingMutex = 0;
1393 }
1394 if( !pBase->pMethods ){
1395 pBase = p->pFileData->pBaseRead;
1396 }
1397 }
1398
1399 switch( p->op ){
1400 case ASYNC_NOOP:
1401 break;
1402
1403 case ASYNC_WRITE:
1404 assert( pBase );
1405 ASYNC_TRACE(("WRITE %s %d bytes at %d\n",
1406 p->pFileData->zName, p->nByte, p->iOffset));
1407 rc = pBase->pMethods->xWrite(pBase, (void *)(p->zBuf), p->nByte, p->iOffset);
1408 break;
1409
1410 case ASYNC_SYNC:
1411 assert( pBase );
1412 ASYNC_TRACE(("SYNC %s\n", p->pFileData->zName));
1413 rc = pBase->pMethods->xSync(pBase, p->nByte);
1414 break;
1415
1416 case ASYNC_TRUNCATE:
1417 assert( pBase );
1418 ASYNC_TRACE(("TRUNCATE %s to %d bytes\n",
1419 p->pFileData->zName, p->iOffset));
1420 rc = pBase->pMethods->xTruncate(pBase, p->iOffset);
1421 break;
1422
1423 case ASYNC_CLOSE: {
1424 AsyncFileData *pData = p->pFileData;
1425 ASYNC_TRACE(("CLOSE %s\n", p->pFileData->zName));
1426 if( pData->pBaseWrite->pMethods ){
1427 pData->pBaseWrite->pMethods->xClose(pData->pBaseWrite);
1428 }
1429 if( pData->pBaseRead->pMethods ){
1430 pData->pBaseRead->pMethods->xClose(pData->pBaseRead);
1431 }
1432
1433 /* Unlink AsyncFileData.lock from the linked list of AsyncFileLock
1434 ** structures for this file. Obtain the async.lockMutex mutex
1435 ** before doing so.
1436 */
1437 async_mutex_enter(ASYNC_MUTEX_LOCK);
1438 rc = unlinkAsyncFile(pData);
1439 async_mutex_leave(ASYNC_MUTEX_LOCK);
1440
1441 if( !holdingMutex ){
1442 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1443 holdingMutex = 1;
1444 }
1445 assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1446 async.pQueueFirst = p->pNext;
1447 sqlite3_free(pData);
1448 doNotFree = 1;
1449 break;
1450 }
1451
1452 case ASYNC_UNLOCK: {
1453 AsyncWrite *pIter;
1454 AsyncFileData *pData = p->pFileData;
1455 int eLock = p->nByte;
1456
1457 /* When a file is locked by SQLite using the async backend, it is
1458 ** locked within the 'real' file-system synchronously. When it is
1459 ** unlocked, an ASYNC_UNLOCK event is added to the write-queue to
1460 ** unlock the file asynchronously. The design of the async backend
1461 ** requires that the 'real' file-system file be locked from the
1462 ** time that SQLite first locks it (and probably reads from it)
1463 ** until all asynchronous write events that were scheduled before
1464 ** SQLite unlocked the file have been processed.
1465 **
1466 ** This is more complex if SQLite locks and unlocks the file multiple
1467 ** times in quick succession. For example, if SQLite does:
1468 **
1469 ** lock, write, unlock, lock, write, unlock
1470 **
1471 ** Each "lock" operation locks the file immediately. Each "write"
1472 ** and "unlock" operation adds an event to the event queue. If the
1473 ** second "lock" operation is performed before the first "unlock"
1474 ** operation has been processed asynchronously, then the first
1475 ** "unlock" cannot be safely processed as is, since this would mean
1476 ** the file was unlocked when the second "write" operation is
1477 ** processed. To work around this, when processing an ASYNC_UNLOCK
1478 ** operation, SQLite:
1479 **
1480 ** 1) Unlocks the file to the minimum of the argument passed to
1481 ** the xUnlock() call and the current lock from SQLite's point
1482 ** of view, and
1483 **
1484 ** 2) Only unlocks the file at all if this event is the last
1485 ** ASYNC_UNLOCK event on this file in the write-queue.
1486 */
1487 assert( holdingMutex==1 );
1488 assert( async.pQueueFirst==p );
1489 for(pIter=async.pQueueFirst->pNext; pIter; pIter=pIter->pNext){
1490 if( pIter->pFileData==pData && pIter->op==ASYNC_UNLOCK ) break;
1491 }
1492 if( !pIter ){
1493 async_mutex_enter(ASYNC_MUTEX_LOCK);
1494 pData->lock.eAsyncLock = MIN(
1495 pData->lock.eAsyncLock, MAX(pData->lock.eLock, eLock)
1496 );
1497 assert(pData->lock.eAsyncLock>=pData->lock.eLock);
1498 rc = getFileLock(pData->pLock);
1499 async_mutex_leave(ASYNC_MUTEX_LOCK);
1500 }
1501 break;
1502 }
1503
1504 case ASYNC_DELETE:
1505 ASYNC_TRACE(("DELETE %s\n", p->zBuf));
1506 rc = pVfs->xDelete(pVfs, p->zBuf, (int)p->iOffset);
1507 break;
1508
1509 case ASYNC_OPENEXCLUSIVE: {
1510 int flags = (int)p->iOffset;
1511 AsyncFileData *pData = p->pFileData;
1512 ASYNC_TRACE(("OPEN %s flags=%d\n", p->zBuf, (int)p->iOffset));
1513 assert(pData->pBaseRead->pMethods==0 && pData->pBaseWrite->pMethods==0);
1514 rc = pVfs->xOpen(pVfs, pData->zName, pData->pBaseRead, flags, 0);
1515 assert( holdingMutex==0 );
1516 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1517 holdingMutex = 1;
1518 break;
1519 }
1520
1521 default: assert(!"Illegal value for AsyncWrite.op");
1522 }
1523
1524 /* If we didn't hang on to the mutex during the IO op, obtain it now
1525 ** so that the AsyncWrite structure can be safely removed from the
1526 ** global write-op queue.
1527 */
1528 if( !holdingMutex ){
1529 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1530 holdingMutex = 1;
1531 }
1532 /* ASYNC_TRACE(("UNLINK %p\n", p)); */
1533 if( p==async.pQueueLast ){
1534 async.pQueueLast = 0;
1535 }
1536 if( !doNotFree ){
1537 assert_mutex_is_held(ASYNC_MUTEX_QUEUE);
1538 async.pQueueFirst = p->pNext;
1539 sqlite3_free(p);
1540 }
1541 assert( holdingMutex );
1542
1543 /* An IO error has occurred. We cannot report the error back to the
1544 ** connection that requested the I/O since the error happened
1545 ** asynchronously. The connection has already moved on. There
1546 ** really is nobody to report the error to.
1547 **
1548 ** The file for which the error occurred may have been a database or
1549 ** journal file. Regardless, none of the currently queued operations
1550 ** associated with the same database should now be performed. Nor should
1551 ** any subsequently requested IO on either a database or journal file
1552 ** handle for the same database be accepted until the main database
1553 ** file handle has been closed and reopened.
1554 **
1555 ** Furthermore, no further IO should be queued or performed on any file
1556 ** handle associated with a database that may have been part of a
1557 ** multi-file transaction that included the database associated with
1558 ** the IO error (i.e. a database ATTACHed to the same handle at some
1559 ** point in time).
1560 */
1561 if( rc!=SQLITE_OK ){
1562 async.ioError = rc;
1563 }
1564
1565 if( async.ioError && !async.pQueueFirst ){
1566 async_mutex_enter(ASYNC_MUTEX_LOCK);
1567 if( 0==async.pLock ){
1568 async.ioError = SQLITE_OK;
1569 }
1570 async_mutex_leave(ASYNC_MUTEX_LOCK);
1571 }
1572
1573 /* Drop the queue mutex before continuing to the next write operation
1574 ** in order to give other threads a chance to work with the write queue.
1575 */
1576 if( !async.pQueueFirst || !async.ioError ){
1577 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1578 holdingMutex = 0;
1579 if( async.ioDelay>0 ){
danielk19776f050aa2009-04-25 08:39:14 +00001580 pVfs->xSleep(pVfs, async.ioDelay*1000);
danielk1977a3f06592009-04-23 14:58:39 +00001581 }else{
1582 async_sched_yield();
1583 }
1584 }
1585 }
1586
1587 async_mutex_leave(ASYNC_MUTEX_WRITER);
1588 return;
1589}
1590
1591/*
1592** Install the asynchronous VFS.
1593*/
1594int sqlite3async_initialize(const char *zParent, int isDefault){
1595 int rc = SQLITE_OK;
1596 if( async_vfs.pAppData==0 ){
1597 sqlite3_vfs *pParent = sqlite3_vfs_find(zParent);
1598 if( !pParent || async_os_initialize() ){
1599 rc = SQLITE_ERROR;
1600 }else if( SQLITE_OK!=(rc = sqlite3_vfs_register(&async_vfs, isDefault)) ){
1601 async_os_shutdown();
1602 }else{
1603 async_vfs.pAppData = (void *)pParent;
1604 async_vfs.mxPathname = ((sqlite3_vfs *)async_vfs.pAppData)->mxPathname;
1605 }
1606 }
1607 return rc;
1608}
1609
1610/*
1611** Uninstall the asynchronous VFS.
1612*/
1613void sqlite3async_shutdown(void){
1614 if( async_vfs.pAppData ){
1615 async_os_shutdown();
1616 sqlite3_vfs_unregister((sqlite3_vfs *)&async_vfs);
1617 async_vfs.pAppData = 0;
1618 }
1619}
1620
1621/*
1622** Process events on the write-queue.
1623*/
1624void sqlite3async_run(void){
1625 asyncWriterThread();
1626}
1627
1628/*
1629** Control/configure the asynchronous IO system.
1630*/
1631int sqlite3async_control(int op, ...){
1632 va_list ap;
1633 va_start(ap, op);
1634 switch( op ){
1635 case SQLITEASYNC_HALT: {
1636 int eWhen = va_arg(ap, int);
1637 if( eWhen!=SQLITEASYNC_HALT_NEVER
1638 && eWhen!=SQLITEASYNC_HALT_NOW
1639 && eWhen!=SQLITEASYNC_HALT_IDLE
1640 ){
danielk19774598b8e2009-04-24 10:13:05 +00001641 return SQLITE_MISUSE;
danielk1977a3f06592009-04-23 14:58:39 +00001642 }
1643 async.eHalt = eWhen;
1644 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1645 async_cond_signal(ASYNC_COND_QUEUE);
1646 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1647 break;
1648 }
1649
1650 case SQLITEASYNC_DELAY: {
1651 int iDelay = va_arg(ap, int);
danielk19774598b8e2009-04-24 10:13:05 +00001652 if( iDelay<0 ){
1653 return SQLITE_MISUSE;
1654 }
danielk1977a3f06592009-04-23 14:58:39 +00001655 async.ioDelay = iDelay;
1656 break;
1657 }
danielk19774598b8e2009-04-24 10:13:05 +00001658
1659 case SQLITEASYNC_LOCKFILES: {
1660 int bLock = va_arg(ap, int);
1661 async_mutex_enter(ASYNC_MUTEX_QUEUE);
1662 if( async.nFile || async.pQueueFirst ){
1663 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1664 return SQLITE_MISUSE;
1665 }
1666 async.bLockFiles = bLock;
1667 async_mutex_leave(ASYNC_MUTEX_QUEUE);
1668 break;
1669 }
danielk1977a3f06592009-04-23 14:58:39 +00001670
1671 case SQLITEASYNC_GET_HALT: {
1672 int *peWhen = va_arg(ap, int *);
1673 *peWhen = async.eHalt;
1674 break;
1675 }
1676 case SQLITEASYNC_GET_DELAY: {
1677 int *piDelay = va_arg(ap, int *);
1678 *piDelay = async.ioDelay;
1679 break;
1680 }
danielk19774598b8e2009-04-24 10:13:05 +00001681 case SQLITEASYNC_GET_LOCKFILES: {
1682 int *piDelay = va_arg(ap, int *);
1683 *piDelay = async.bLockFiles;
1684 break;
1685 }
danielk1977a3f06592009-04-23 14:58:39 +00001686
1687 default:
1688 return SQLITE_ERROR;
1689 }
1690 return SQLITE_OK;
1691}
1692
1693#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
1694