blob: f0239bdb3a979078ed891949ddb55659cb8af46d [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
dan97a31352010-04-16 13:59:31 +00007/*
8** LOG FILE FORMAT
9**
10** A log file consists of a header followed by zero or more log frames.
11** The log header is 12 bytes in size and consists of the following three
12** big-endian 32-bit unsigned integer values:
13**
dan3de777f2010-04-17 12:31:37 +000014** 0: Database page size,
15** 4: Randomly selected salt value 1,
16** 8: Randomly selected salt value 2.
dan97a31352010-04-16 13:59:31 +000017**
18** Immediately following the log header are zero or more log frames. Each
** frame itself consists of a 16-byte header followed by <page-size> bytes
20** of page data. The header is broken into 4 big-endian 32-bit unsigned
21** integer values, as follows:
22**
dan3de777f2010-04-17 12:31:37 +000023** 0: Page number.
24** 4: For commit records, the size of the database image in pages
dan97a31352010-04-16 13:59:31 +000025** after the commit. For all other records, zero.
dan3de777f2010-04-17 12:31:37 +000026** 8: Checksum value 1.
dan97a31352010-04-16 13:59:31 +000027** 12: Checksum value 2.
28*/
29
30/*
danff207012010-04-24 04:49:15 +000031** LOG SUMMARY FILE FORMAT
dan97a31352010-04-16 13:59:31 +000032**
** The log-summary file consists of a header region, followed by a
** region that contains no useful data (used to apply byte-range locks
** to), followed by the data region.
36**
37** The contents of both the header and data region are specified in terms
38** of 1, 2 and 4 byte unsigned integers. All integers are stored in
39** machine-endian order.
40**
41** A log-summary file is essentially a shadow-pager map. It contains a
42** mapping from database page number to the set of locations in the log
43** file that contain versions of the database page. When a database
44** client needs to read a page of data, it first queries the log-summary
45** file to determine if the required version of the page is stored in
46** the log. If so, it is read from the log file. If not, it is read from
47** the database file.
48**
49** Whenever a transaction is appended to the log or a checkpoint transfers
50** data from the log file into the database file, the log-summary is
51** updated accordingly.
52**
53** The fields in the log-summary file header are described in the comment
54** directly above the definition of struct LogSummaryHdr (see below).
55** Immediately following the fields in the LogSummaryHdr structure is
56** an 8 byte checksum based on the contents of the header. This field is
57** not the same as the iCheck1 and iCheck2 fields of the LogSummaryHdr.
dan97a31352010-04-16 13:59:31 +000058*/
59
drhc438efd2010-04-26 00:19:45 +000060#include "wal.h"
dan7c246102010-04-12 19:00:29 +000061
62#include <unistd.h>
63#include <fcntl.h>
64#include <sys/mman.h>
65
66typedef struct LogSummaryHdr LogSummaryHdr;
67typedef struct LogSummary LogSummary;
dan4a4b01d2010-04-16 11:30:18 +000068typedef struct LogIterator LogIterator;
dan64d039e2010-04-13 19:27:31 +000069typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000070
71
72/*
73** The following structure may be used to store the same data that
74** is stored in the log-summary header.
75**
76** Member variables iCheck1 and iCheck2 contain the checksum for the
77** last frame written to the log, or 2 and 3 respectively if the log
78** is currently empty.
79*/
80struct LogSummaryHdr {
81 u32 iChange; /* Counter incremented each transaction */
82 u32 pgsz; /* Database page size in bytes */
83 u32 iLastPg; /* Address of last valid frame in log */
84 u32 nPage; /* Size of database in pages */
85 u32 iCheck1; /* Checkpoint value 1 */
86 u32 iCheck2; /* Checkpoint value 2 */
87};
88
/* Number of 32-bit fields in a serialized LogSummaryHdr object. */
#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

/*
** LOGSUMMARY_LOCK_RESERVED bytes starting at LOGSUMMARY_LOCK_OFFSET are
** set aside for file-locks. The offset skips the LogSummaryHdr fields and
** the two u32 values that follow them. Because some systems only provide
** mandatory file-locks, no data is read from or written to the part of
** the file on which locks are applied.
*/
#define LOGSUMMARY_LOCK_OFFSET   ((sizeof(LogSummaryHdr))+2*sizeof(u32))
#define LOGSUMMARY_LOCK_RESERVED 16

/* Size in bytes of the header that precedes each frame in the log file */
#define LOG_FRAME_HDRSIZE 16

/* Size in bytes of the header at the start of the log file */
#define LOG_HDRSIZE 12
dan97a31352010-04-16 13:59:31 +0000105
/*
** Return the offset of frame iFrame in the log file, assuming a database
** page size of pgsz bytes. Frames are numbered starting at 1. The offset
** returned is to the start of the log frame-header.
**
** The per-frame size is widened to i64 before multiplying so that the
** result does not overflow 32-bit arithmetic for very large log files.
*/
#define logFrameOffset(iFrame, pgsz) (                               \
  LOG_HDRSIZE + ((iFrame)-1)*(i64)((pgsz)+LOG_FRAME_HDRSIZE)         \
)
dan7c246102010-04-12 19:00:29 +0000114
/*
** When mmap() is used to access a log-summary file (shared or otherwise),
** the mapping is grown in steps of the following number of bytes.
**
** A 64 KB log-summary mapping corresponds to a log file containing over
** 13000 frames, so the mapping size does not need to be increased often.
*/
#define LOGSUMMARY_MMAP_INCREMENT (64*1024)
123
124/*
dan7c246102010-04-12 19:00:29 +0000125** There is one instance of this structure for each log-summary object
126** that this process has a connection to. They are stored in a linked
127** list starting at pLogSummary (global variable).
128**
129** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
130** directly in this implementation because the VFS does not support
131** the required blocking file-locks.
132*/
133struct LogSummary {
134 sqlite3_mutex *mutex; /* Mutex used to protect this object */
135 int nRef; /* Number of pointers to this structure */
136 int fd; /* File descriptor open on log-summary */
137 char *zPath; /* Path to associated WAL file */
dan64d039e2010-04-13 19:27:31 +0000138 LogLock *pLock; /* Linked list of locks on this object */
dan7c246102010-04-12 19:00:29 +0000139 LogSummary *pNext; /* Next in global list */
dance4f05f2010-04-22 19:14:13 +0000140
dan7c246102010-04-12 19:00:29 +0000141 int nData; /* Size of aData allocation/mapping */
142 u32 *aData; /* File body */
143};
144
dan54934f42010-04-17 18:50:27 +0000145/*
146** This module uses three different types of file-locks. All are taken
147** on the log-summary file. The three types of locks are as follows:
148**
149** MUTEX: The MUTEX lock is used as a robust inter-process mutex. It
150** is held while the log-summary header is modified, and
151** sometimes when it is read. It is also held while a new client
152** obtains the DMH lock (see below), and while log recovery is
153** being run.
154**
155** DMH: The DMH (Dead Mans Hand mechanism) lock is used to ensure
156** that log-recovery is always run following a system restart.
157** When it first opens a log-summary file, a process takes a
158** SHARED lock on the DMH region. This lock is not released until
159** the log-summary file is closed.
160**
161** The process then attempts to upgrade to an EXCLUSIVE lock. If
162** successful, then the contents of the log-summary file are deemed
163** suspect and the log-summary header zeroed. This forces the
164** first process that reads the log-summary file to run log
165** recovery. After zeroing the log-summary header, the process
166** downgrades to a SHARED lock on the DMH region.
167**
168** If the attempt to obtain the EXCLUSIVE lock fails, then the
169** process concludes that some other process is already using the
170** log-summary file, and it can therefore be trusted.
171**
172** The procedure described in the previous three paragraphs (taking
173** a SHARED lock and then upgrading to an EXCLUSIVE lock to check
174** if the process is the only one to have an open connection to the
175** log file) is protected by holding the MUTEX lock. This avoids the
176** race condition wherein the first two clients connect almost
177** simultaneously following a system restart and each prevents
178** the other from obtaining the EXCLUSIVE lock.
179**
180**
181** REGION: There are 4 different region locks, regions A, B, C and D.
182** Various EXCLUSIVE and SHARED locks on these regions are obtained
183** when a client reads, writes or checkpoints the database.
184**
185** To obtain a reader lock:
186**
187** 1. Attempt a SHARED lock on regions A and B.
188** 2. If step 1 is successful, drop the lock on region B. Or, if
189** it is unsuccessful, attempt a SHARED lock on region D.
190** 3. Repeat the above until the lock attempt in step 1 or 2 is
191** successful.
192**
193** The reader lock is released when the read transaction is finished.
194**
195** To obtain a writer lock:
196**
197** 1. Take (wait for) an EXCLUSIVE lock on regions C and D.
198**
199** The locks are released after the write transaction is finished
200** and, if any frames were committed to the log, the log-summary
201** file updated.
202**
203** To obtain a checkpointer lock:
204**
205** 1. Take (wait for) an EXCLUSIVE lock on regions B and C.
206** 2. Take (wait for) an EXCLUSIVE lock on region A.
207**
208** Step 1 waits until any existing writer has finished. And forces
209** all new readers to become "region D" readers.
210**
211** Step 2 causes the checkpointer to wait until all existing region A
212** readers have finished their transactions. Once the exclusive lock
213** on region A has been obtained, only "region D" readers exist.
214** These readers are operating on the snapshot at the head of the
215** log. As such, the log can be safely copied into the database file
216** without interfering with the readers.
217**
218** Once the checkpoint has finished and the log-summary header
219** updated (to indicate the log contents can now be ignored), all
220** locks are released.
221**
222** However, there may still exist region D readers using data in
223** the body of the log file, so the log file itself cannot be
224** truncated or overwritten until all region D readers have finished.
225** That requirement is satisfied, because writers (the clients that
226** write to the log file) require an exclusive lock on region D.
227** Which they cannot get until all region D readers have finished.
228*/
#define LOG_LOCK_MUTEX  (LOGSUMMARY_LOCK_OFFSET)   /* First reserved byte */
#define LOG_LOCK_DMH    (LOG_LOCK_MUTEX+1)         /* Byte after MUTEX */
#define LOG_LOCK_REGION (LOG_LOCK_DMH+1)           /* Byte after DMH */
dan64d039e2010-04-13 19:27:31 +0000232
/*
** Bitmasks naming the four lockable regions of a log-summary. A connection
** may hold either a SHARED or an EXCLUSIVE lock on each region. The second
** argument to logLockRegion() is an ORed combination of these values.
*/
#define LOG_REGION_A 0x01
#define LOG_REGION_B 0x02
#define LOG_REGION_C 0x04
#define LOG_REGION_D 0x08
243
/*
** Lock operations. One of these is passed as the third parameter to
** logLockRegion() and as the "op" parameter of logLockFd().
*/
#define LOG_UNLOCK  0             /* Unlock a range of bytes */
#define LOG_RDLOCK  1             /* Put a SHARED lock on a range of bytes */
#define LOG_WRLOCK  2             /* Put an EXCLUSIVE lock on a byte-range */
#define LOG_WRLOCKW 3             /* Block on EXCLUSIVE lock on a byte-range */
251
252/*
dan64d039e2010-04-13 19:27:31 +0000253** A single instance of this structure is allocated as part of each
254** connection to a database log. All structures associated with the
255** same log file are linked together into a list using LogLock.pNext
256** starting at LogSummary.pLock.
257**
258** The mLock field of the structure describes the locks (if any)
259** currently held by the connection. If a SHARED lock is held on
260** any of the four locking regions, then the associated LOG_REGION_X
261** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
262** then the (LOG_REGION_X << 8) bit is set.
263*/
264struct LogLock {
265 LogLock *pNext; /* Next lock on the same log */
266 u32 mLock; /* Mask of locks */
267};
dan7c246102010-04-12 19:00:29 +0000268
269struct Log {
270 LogSummary *pSummary; /* Log file summary data */
271 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
272 sqlite3_file *pFd; /* File handle for log file */
dan64d039e2010-04-13 19:27:31 +0000273 int isLocked; /* Non-zero if a snapshot is held open */
dan7c246102010-04-12 19:00:29 +0000274 int isWriteLocked; /* True if this is the writer connection */
dan8d22a172010-04-19 18:03:51 +0000275 u32 iCallback; /* Value to pass to log callback (or 0) */
dan7c246102010-04-12 19:00:29 +0000276 LogSummaryHdr hdr; /* Log summary header for current snapshot */
dan64d039e2010-04-13 19:27:31 +0000277 LogLock lock; /* Lock held by this connection (if any) */
dan7c246102010-04-12 19:00:29 +0000278};
279
dan64d039e2010-04-13 19:27:31 +0000280
dan7c246102010-04-12 19:00:29 +0000281/*
282** This structure is used to implement an iterator that iterates through
283** all frames in the log in database page order. Where two or more frames
284** correspond to the same database page, the iterator visits only the
285** frame most recently written to the log.
286**
287** The internals of this structure are only accessed by:
288**
dan4a4b01d2010-04-16 11:30:18 +0000289** logIteratorInit() - Create a new iterator,
290** logIteratorNext() - Step an iterator,
291** logIteratorFree() - Free an iterator.
dan7c246102010-04-12 19:00:29 +0000292**
293** This functionality is used by the checkpoint code (see logCheckpoint()).
294*/
dan4a4b01d2010-04-16 11:30:18 +0000295struct LogIterator {
296 int nSegment; /* Size of LogIterator.aSegment[] array */
dan7c246102010-04-12 19:00:29 +0000297 int nFinal; /* Elements in segment nSegment-1 */
298 struct LogSegment {
299 int iNext; /* Next aIndex index */
300 u8 *aIndex; /* Pointer to index array */
301 u32 *aDbPage; /* Pointer to db page array */
302 } aSegment[1];
303};
304
dan64d039e2010-04-13 19:27:31 +0000305
dan97a31352010-04-16 13:59:31 +0000306
dan64d039e2010-04-13 19:27:31 +0000307/*
308** List of all LogSummary objects created by this process. Protected by
309** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
310** here instead of borrowing the LRU mutex.
311*/
312#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
313static LogSummary *pLogSummary = 0;
314
dan7c246102010-04-12 19:00:29 +0000315/*
316** Generate an 8 byte checksum based on the data in array aByte[] and the
317** initial values of aCksum[0] and aCksum[1]. The checksum is written into
318** aCksum[] before returning.
dan56d95912010-04-24 19:07:29 +0000319**
320** The range of bytes to checksum is treated as an array of 32-bit
321** little-endian unsigned integers. For each integer X in the array, from
322** start to finish, do the following:
323**
324** aCksum[0] += X;
325** aCksum[1] += aCksum[0];
326**
327** For the calculation above, use 64-bit unsigned accumulators. Before
328** returning, truncate the values to 32-bits as follows:
329**
330** aCksum[0] = (u32)(aCksum[0] + (aCksum[0]>>24));
331** aCksum[1] = (u32)(aCksum[1] + (aCksum[1]>>24));
dan7c246102010-04-12 19:00:29 +0000332*/
dan7c246102010-04-12 19:00:29 +0000333static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000334 u64 sum1 = aCksum[0];
335 u64 sum2 = aCksum[1];
336 u32 *a32 = (u32 *)aByte;
337 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000338
dan7c246102010-04-12 19:00:29 +0000339 assert( (nByte&0x00000003)==0 );
340
dance4f05f2010-04-22 19:14:13 +0000341 if( SQLITE_LITTLEENDIAN ){
342#ifdef SQLITE_DEBUG
343 u8 *a = (u8 *)a32;
344 assert( *a32==(a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24)) );
345#endif
346 do {
347 sum1 += *a32;
348 sum2 += sum1;
349 } while( ++a32<aEnd );
350 }else{
351 do {
352 u8 *a = (u8*)a32;
353 sum1 += a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24);
354 sum2 += sum1;
355 } while( ++a32<aEnd );
356 }
dan7c246102010-04-12 19:00:29 +0000357
dan39c79f52010-04-15 10:58:51 +0000358 aCksum[0] = sum1 + (sum1>>24);
359 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000360}
361
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing trailing and
** duplicate '/' characters and any "./" or "../" elements in the path.
** For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** A path consisting only of '/' characters is reduced to "/". A "." or
** ".." element that is not followed by a '/' (i.e. the final element of
** the path) is left untouched.
*/
static void logNormalizePath(char *zPath){
  int i, j;                       /* Read and write offsets into z[] */
  char *z = zPath;
  int n = (int)strlen(z);

  /* Ignore trailing '/' characters, but always keep the first byte. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* Collapse "//". The i+1<n test keeps the check inside the trimmed
      ** path; without it an input such as "//" would skip its only
      ** remaining '/' and be reduced to an empty string. */
      if( i+1<n && z[i+1]=='/' ) continue;

      /* Drop a "/." element that is followed by another '/'. */
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        i += 1;
        continue;
      }

      /* For "/../", also discard the element already written before it. */
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
397
398/*
dan7c246102010-04-12 19:00:29 +0000399** Unmap the log-summary mapping and close the file-descriptor. If
400** the isTruncate argument is non-zero, truncate the log-summary file
401** region to zero bytes.
402**
403** Regardless of the value of isTruncate, close the file-descriptor
404** opened on the log-summary file.
405*/
dan3de777f2010-04-17 12:31:37 +0000406static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
dan7c246102010-04-12 19:00:29 +0000407 int rc = SQLITE_OK;
408 if( pSummary->aData ){
409 assert( pSummary->fd>0 );
410 munmap(pSummary->aData, pSummary->nData);
411 pSummary->aData = 0;
dan3de777f2010-04-17 12:31:37 +0000412 if( isUnlink ){
413 char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
414 if( !zFile ){
415 rc = SQLITE_NOMEM;
416 }
417 unlink(zFile);
418 sqlite3_free(zFile);
dan7c246102010-04-12 19:00:29 +0000419 }
420 }
421 if( pSummary->fd>0 ){
422 close(pSummary->fd);
423 pSummary->fd = -1;
424 }
425 return rc;
426}
427
dan7c246102010-04-12 19:00:29 +0000428static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
danff207012010-04-24 04:49:15 +0000429 u32 *aHdr = pSummary->aData; /* Write header here */
430 u32 *aCksum = &aHdr[LOGSUMMARY_HDR_NFIELD]; /* Write header cksum here */
431
432 assert( LOGSUMMARY_HDR_NFIELD==sizeof(LogSummaryHdr)/4 );
433 memcpy(aHdr, pHdr, sizeof(LogSummaryHdr));
434 aCksum[0] = aCksum[1] = 1;
435 logChecksumBytes((u8 *)aHdr, sizeof(LogSummaryHdr), aCksum);
dan7c246102010-04-12 19:00:29 +0000436}
437
438/*
439** This function encodes a single frame header and writes it to a buffer
440** supplied by the caller. A log frame-header is made up of a series of
441** 4-byte big-endian integers, as follows:
442**
443** 0: Database page size in bytes.
444** 4: Page number.
445** 8: New database size (for commit frames, otherwise zero).
446** 12: Frame checksum 1.
447** 16: Frame checksum 2.
448*/
449static void logEncodeFrame(
450 u32 *aCksum, /* IN/OUT: Checksum values */
451 u32 iPage, /* Database page number for frame */
452 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
453 int nData, /* Database page size (size of aData[]) */
454 u8 *aData, /* Pointer to page data (for checksum) */
455 u8 *aFrame /* OUT: Write encoded frame here */
456){
dan97a31352010-04-16 13:59:31 +0000457 assert( LOG_FRAME_HDRSIZE==16 );
dan7c246102010-04-12 19:00:29 +0000458
dan97a31352010-04-16 13:59:31 +0000459 sqlite3Put4byte(&aFrame[0], iPage);
460 sqlite3Put4byte(&aFrame[4], nTruncate);
dan7c246102010-04-12 19:00:29 +0000461
dan97a31352010-04-16 13:59:31 +0000462 logChecksumBytes(aFrame, 8, aCksum);
dan7c246102010-04-12 19:00:29 +0000463 logChecksumBytes(aData, nData, aCksum);
464
dan97a31352010-04-16 13:59:31 +0000465 sqlite3Put4byte(&aFrame[8], aCksum[0]);
466 sqlite3Put4byte(&aFrame[12], aCksum[1]);
dan7c246102010-04-12 19:00:29 +0000467}
468
469/*
470** Return 1 and populate *piPage, *pnTruncate and aCksum if the
471** frame checksum looks Ok. Otherwise return 0.
472*/
473static int logDecodeFrame(
474 u32 *aCksum, /* IN/OUT: Checksum values */
475 u32 *piPage, /* OUT: Database page number for frame */
476 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
477 int nData, /* Database page size (size of aData[]) */
478 u8 *aData, /* Pointer to page data (for checksum) */
479 u8 *aFrame /* Frame data */
480){
dan97a31352010-04-16 13:59:31 +0000481 assert( LOG_FRAME_HDRSIZE==16 );
dan4a4b01d2010-04-16 11:30:18 +0000482
dan97a31352010-04-16 13:59:31 +0000483 logChecksumBytes(aFrame, 8, aCksum);
dan7c246102010-04-12 19:00:29 +0000484 logChecksumBytes(aData, nData, aCksum);
485
dan97a31352010-04-16 13:59:31 +0000486 if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
487 || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
dan7c246102010-04-12 19:00:29 +0000488 ){
489 /* Checksum failed. */
490 return 0;
491 }
492
dan97a31352010-04-16 13:59:31 +0000493 *piPage = sqlite3Get4byte(&aFrame[0]);
494 *pnTruncate = sqlite3Get4byte(&aFrame[4]);
dan7c246102010-04-12 19:00:29 +0000495 return 1;
496}
497
498static void logMergesort8(
499 Pgno *aContent, /* Pages in log */
500 u8 *aBuffer, /* Buffer of at least *pnList items to use */
501 u8 *aList, /* IN/OUT: List to sort */
502 int *pnList /* IN/OUT: Number of elements in aList[] */
503){
504 int nList = *pnList;
505 if( nList>1 ){
506 int nLeft = nList / 2; /* Elements in left list */
507 int nRight = nList - nLeft; /* Elements in right list */
508 u8 *aLeft = aList; /* Left list */
509 u8 *aRight = &aList[nLeft]; /* Right list */
510 int iLeft = 0; /* Current index in aLeft */
511 int iRight = 0; /* Current index in aright */
512 int iOut = 0; /* Current index in output buffer */
513
514 /* TODO: Change to non-recursive version. */
515 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
516 logMergesort8(aContent, aBuffer, aRight, &nRight);
517
518 while( iRight<nRight || iLeft<nLeft ){
519 u8 logpage;
520 Pgno dbpage;
521
522 if( (iLeft<nLeft)
523 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
524 ){
525 logpage = aLeft[iLeft++];
526 }else{
527 logpage = aRight[iRight++];
528 }
529 dbpage = aContent[logpage];
530
531 aBuffer[iOut++] = logpage;
532 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
533
534 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
535 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
536 }
537 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
538 *pnList = iOut;
539 }
540
541#ifdef SQLITE_DEBUG
542 {
543 int i;
544 for(i=1; i<*pnList; i++){
545 assert( aContent[aList[i]] > aContent[aList[i-1]] );
546 }
547 }
548#endif
549}
550
551
552/*
dance4f05f2010-04-22 19:14:13 +0000553** Memory map the first nByte bytes of the summary file opened with
554** pSummary->fd at pSummary->aData. If the summary file is smaller than
555** nByte bytes in size when this function is called, ftruncate() is
556** used to expand it before it is mapped.
557**
558** It is assumed that an exclusive lock is held on the summary file
559** by the caller (to protect the ftruncate()).
560*/
561static int logSummaryMap(LogSummary *pSummary, int nByte){
562 struct stat sStat;
563 int rc;
564 int fd = pSummary->fd;
565 void *pMap;
566
567 assert( pSummary->aData==0 );
568
569 /* If the file is less than nByte bytes in size, cause it to grow. */
570 rc = fstat(fd, &sStat);
571 if( rc!=0 ) return SQLITE_IOERR;
572 if( sStat.st_size<nByte ){
573 rc = ftruncate(fd, nByte);
574 if( rc!=0 ) return SQLITE_IOERR;
575 }else{
576 nByte = sStat.st_size;
577 }
578
579 /* Map the file. */
580 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
581 if( pMap==MAP_FAILED ){
582 return SQLITE_IOERR;
583 }
584 pSummary->aData = (u32 *)pMap;
585 pSummary->nData = nByte/4;
586
587 return SQLITE_OK;
588}
589
590/*
dan7c246102010-04-12 19:00:29 +0000591** Return the index in the LogSummary.aData array that corresponds to
592** frame iFrame. The log-summary file consists of a header, followed by
593** alternating "map" and "index" blocks.
594*/
595static int logSummaryEntry(u32 iFrame){
danff207012010-04-24 04:49:15 +0000596 return (
597 (LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED)/sizeof(u32)
598 + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
599 + iFrame-1 /* Db page numbers that occur before iFrame */
600 );
dan7c246102010-04-12 19:00:29 +0000601}
602
603
604/*
605** Set an entry in the log-summary map to map log frame iFrame to db
606** page iPage. Values are always appended to the log-summary (i.e. the
607** value of iFrame is always exactly one more than the value passed to
608** the previous call), but that restriction is not enforced or asserted
609** here.
610*/
611static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
612 u32 iSlot = logSummaryEntry(iFrame);
613
dance4f05f2010-04-22 19:14:13 +0000614 if( (iSlot+128)>=pSummary->nData ){
615 int nByte = pSummary->nData*4 + LOGSUMMARY_MMAP_INCREMENT;
616
617 sqlite3_mutex_enter(pSummary->mutex);
618 munmap(pSummary->aData, pSummary->nData*4);
619 pSummary->aData = 0;
620 logSummaryMap(pSummary, nByte);
621 sqlite3_mutex_leave(pSummary->mutex);
622 }
623
dan7c246102010-04-12 19:00:29 +0000624 /* Set the log-summary entry itself */
625 pSummary->aData[iSlot] = iPage;
626
627 /* If the frame number is a multiple of 256 (frames are numbered starting
628 ** at 1), build an index of the most recently added 256 frames.
629 */
630 if( (iFrame&0x000000FF)==0 ){
631 int i; /* Iterator used while initializing aIndex */
632 u32 *aFrame; /* Pointer to array of 256 frames */
633 int nIndex; /* Number of entries in index */
634 u8 *aIndex; /* 256 bytes to build index in */
635 u8 *aTmp; /* Scratch space to use while sorting */
636
637 aFrame = &pSummary->aData[iSlot-255];
638 aIndex = (u8 *)&pSummary->aData[iSlot+1];
639 aTmp = &aIndex[256];
640
641 nIndex = 256;
642 for(i=0; i<256; i++) aIndex[i] = (u8)i;
643 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
644 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
645 }
646}
647
648
649/*
650** Recover the log-summary by reading the log file. The caller must hold
651** an exclusive lock on the log-summary file.
652*/
653static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
654 int rc; /* Return Code */
655 i64 nSize; /* Size of log file */
656 LogSummaryHdr hdr; /* Recovered log-summary header */
657
658 memset(&hdr, 0, sizeof(hdr));
659
660 rc = sqlite3OsFileSize(pFd, &nSize);
661 if( rc!=SQLITE_OK ){
662 return rc;
663 }
664
665 if( nSize>LOG_FRAME_HDRSIZE ){
666 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
667 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
668 int nFrame; /* Number of bytes at aFrame */
669 u8 *aData; /* Pointer to data part of aFrame buffer */
670 int iFrame; /* Index of last frame read */
671 i64 iOffset; /* Next offset to read from log file */
672 int nPgsz; /* Page size according to the log */
dan97a31352010-04-16 13:59:31 +0000673 u32 aCksum[2]; /* Running checksum */
dan7c246102010-04-12 19:00:29 +0000674
675 /* Read in the first frame header in the file (to determine the
676 ** database page size).
677 */
dan97a31352010-04-16 13:59:31 +0000678 rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000679 if( rc!=SQLITE_OK ){
680 return rc;
681 }
682
683 /* If the database page size is not a power of two, or is greater than
684 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
685 */
686 nPgsz = sqlite3Get4byte(&aBuf[0]);
dance4f05f2010-04-22 19:14:13 +0000687 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE || nPgsz<512 ){
dan7c246102010-04-12 19:00:29 +0000688 goto finished;
689 }
dan97a31352010-04-16 13:59:31 +0000690 aCksum[0] = sqlite3Get4byte(&aBuf[4]);
691 aCksum[1] = sqlite3Get4byte(&aBuf[8]);
dan7c246102010-04-12 19:00:29 +0000692
693 /* Malloc a buffer to read frames into. */
694 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
695 aFrame = (u8 *)sqlite3_malloc(nFrame);
696 if( !aFrame ){
697 return SQLITE_NOMEM;
698 }
699 aData = &aFrame[LOG_FRAME_HDRSIZE];
700
701 /* Read all frames from the log file. */
702 iFrame = 0;
dan97a31352010-04-16 13:59:31 +0000703 for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
dan7c246102010-04-12 19:00:29 +0000704 u32 pgno; /* Database page number for frame */
705 u32 nTruncate; /* dbsize field from frame header */
706 int isValid; /* True if this frame is valid */
707
708 /* Read and decode the next log frame. */
709 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
710 if( rc!=SQLITE_OK ) break;
711 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
712 if( !isValid ) break;
713 logSummaryAppend(pSummary, ++iFrame, pgno);
714
715 /* If nTruncate is non-zero, this is a commit record. */
716 if( nTruncate ){
717 hdr.iCheck1 = aCksum[0];
718 hdr.iCheck2 = aCksum[1];
719 hdr.iLastPg = iFrame;
720 hdr.nPage = nTruncate;
721 hdr.pgsz = nPgsz;
722 }
723 }
724
725 sqlite3_free(aFrame);
726 }else{
727 hdr.iCheck1 = 2;
728 hdr.iCheck2 = 3;
729 }
730
731finished:
732 logSummaryWriteHdr(pSummary, &hdr);
733 return rc;
734}
735
dan3de777f2010-04-17 12:31:37 +0000736/*
dan8d22a172010-04-19 18:03:51 +0000737** Place, modify or remove a lock on the log-summary file associated
738** with pSummary.
danff207012010-04-24 04:49:15 +0000739**
740** The locked byte-range should be inside the region dedicated to
741** locking. This region of the log-summary file is never read or written.
dan3de777f2010-04-17 12:31:37 +0000742*/
dan8d22a172010-04-19 18:03:51 +0000743static int logLockFd(
744 LogSummary *pSummary, /* The log-summary object to lock */
745 int iStart, /* First byte to lock */
746 int nByte, /* Number of bytes to lock */
747 int op /* LOG_UNLOCK, RDLOCK, WRLOCK or WRLOCKW */
748){
dan3de777f2010-04-17 12:31:37 +0000749 int aType[4] = {
dan8d22a172010-04-19 18:03:51 +0000750 F_UNLCK, /* LOG_UNLOCK */
751 F_RDLCK, /* LOG_RDLOCK */
752 F_WRLCK, /* LOG_WRLOCK */
753 F_WRLCK /* LOG_WRLOCKW */
dan3de777f2010-04-17 12:31:37 +0000754 };
755 int aOp[4] = {
dan8d22a172010-04-19 18:03:51 +0000756 F_SETLK, /* LOG_UNLOCK */
757 F_SETLK, /* LOG_RDLOCK */
758 F_SETLK, /* LOG_WRLOCK */
759 F_SETLKW /* LOG_WRLOCKW */
dan3de777f2010-04-17 12:31:37 +0000760 };
dan8d22a172010-04-19 18:03:51 +0000761 struct flock f; /* Locking operation */
762 int rc; /* Value returned by fcntl() */
dan3de777f2010-04-17 12:31:37 +0000763
764 assert( ArraySize(aType)==ArraySize(aOp) );
765 assert( op>=0 && op<ArraySize(aType) );
danff207012010-04-24 04:49:15 +0000766 assert( nByte>0 );
767 assert( iStart>=LOGSUMMARY_LOCK_OFFSET
768 && iStart+nByte<=LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED
769 );
770#if defined(SQLITE_DEBUG) && defined(SQLITE_OS_UNIX)
771 if( pSummary->aData ) memset(&((u8*)pSummary->aData)[iStart], op, nByte);
772#endif
dan3de777f2010-04-17 12:31:37 +0000773
774 memset(&f, 0, sizeof(f));
775 f.l_type = aType[op];
776 f.l_whence = SEEK_SET;
777 f.l_start = iStart;
778 f.l_len = nByte;
779 rc = fcntl(pSummary->fd, aOp[op], &f);
780 return (rc==0) ? SQLITE_OK : SQLITE_BUSY;
781}
782
783static int logLockRegion(Log *pLog, u32 mRegion, int op){
784 LogSummary *pSummary = pLog->pSummary;
785 LogLock *p; /* Used to iterate through in-process locks */
786 u32 mOther; /* Locks held by other connections */
787 u32 mNew; /* New mask for pLog */
788
789 assert(
790 /* Writer lock operations */
791 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
792 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
793
794 /* Normal reader lock operations */
795 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
796 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
797 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
798
799 /* Region D reader lock operations */
800 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
801 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
802 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
803
804 /* Checkpointer lock operations */
805 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
806 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
807 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
808 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
809 );
810
811 /* Assert that a connection never tries to go from an EXCLUSIVE to a
812 ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
813 ** happens though (when a region D reader upgrades to a writer).
814 */
815 assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
816
817 sqlite3_mutex_enter(pSummary->mutex);
818
819 /* Calculate a mask of logs held by all connections in this process apart
820 ** from this one. The least significant byte of the mask contains a mask
821 ** of the SHARED logs held. The next least significant byte of the mask
822 ** indicates the EXCLUSIVE locks held. For example, to test if some other
823 ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
824 ** on region C, do:
825 **
826 ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
827 ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
828 **
829 ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
830 ** corresponding bit in the SHARED mask.
831 */
832 mOther = 0;
833 for(p=pSummary->pLock; p; p=p->pNext){
834 assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
835 if( p!=&pLog->lock ){
836 mOther |= p->mLock;
837 }
838 }
839
840 /* If this call is to lock a region (not to unlock one), test if locks held
841 ** by any other connection in this process prevent the new locks from
842 ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
843 */
844 if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
845 sqlite3_mutex_leave(pSummary->mutex);
846 return SQLITE_BUSY;
847 }
848
849 /* Figure out the new log mask for this connection. */
850 switch( op ){
851 case LOG_UNLOCK:
852 mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
853 break;
854 case LOG_RDLOCK:
855 mNew = (pLog->lock.mLock | mRegion);
856 break;
857 default:
858 assert( op==LOG_WRLOCK );
859 mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
860 break;
861 }
862
863 /* Now modify the locks held on the log-summary file descriptor. This
864 ** file descriptor is shared by all log connections in this process.
865 ** Therefore:
866 **
867 ** + If one or more log connections in this process hold a SHARED lock
868 ** on a region, the file-descriptor should hold a SHARED lock on
869 ** the file region.
870 **
871 ** + If a log connection in this process holds an EXCLUSIVE lock on a
872 ** region, the file-descriptor should also hold an EXCLUSIVE lock on
873 ** the region in question.
874 **
875 ** If this is an LOG_UNLOCK operation, only regions for which no other
876 ** connection holds a lock should actually be unlocked. And if this
877 ** is a LOG_RDLOCK operation and other connections already hold all
878 ** the required SHARED locks, then no system call is required.
879 */
880 if( op==LOG_UNLOCK ){
881 mRegion = (mRegion & ~mOther);
882 }
883 if( (op==LOG_WRLOCK)
884 || (op==LOG_UNLOCK && mRegion)
885 || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
886 ){
887 struct LockMap {
888 int iStart; /* Byte offset to start locking operation */
889 int iLen; /* Length field for locking operation */
890 } aMap[] = {
danff207012010-04-24 04:49:15 +0000891 /* 0000 */ {0, 0}, /* 0001 */ {3+LOG_LOCK_REGION, 1},
892 /* 0010 */ {2+LOG_LOCK_REGION, 1}, /* 0011 */ {2+LOG_LOCK_REGION, 2},
893 /* 0100 */ {1+LOG_LOCK_REGION, 1}, /* 0101 */ {0, 0},
894 /* 0110 */ {1+LOG_LOCK_REGION, 2}, /* 0111 */ {1+LOG_LOCK_REGION, 3},
895 /* 1000 */ {0+LOG_LOCK_REGION, 1}, /* 1001 */ {0, 0},
dan3de777f2010-04-17 12:31:37 +0000896 /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
danff207012010-04-24 04:49:15 +0000897 /* 1100 */ {0+LOG_LOCK_REGION, 2}, /* 1101 */ {0, 0},
dan3de777f2010-04-17 12:31:37 +0000898 /* 1110 */ {0, 0}, /* 1111 */ {0, 0}
899 };
900 int rc; /* Return code of logLockFd() */
901
902 assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
903
904 rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
905 if( rc!=0 ){
906 sqlite3_mutex_leave(pSummary->mutex);
907 return rc;
908 }
909 }
910
911 pLog->lock.mLock = mNew;
912 sqlite3_mutex_leave(pSummary->mutex);
913 return SQLITE_OK;
914}
915
dan8d22a172010-04-19 18:03:51 +0000916/*
917** Lock the DMH region, either with an EXCLUSIVE or SHARED lock. This
918** function is never called with LOG_UNLOCK - the only way the DMH region
919** is every completely unlocked is by by closing the file descriptor.
920*/
dan3de777f2010-04-17 12:31:37 +0000921static int logLockDMH(LogSummary *pSummary, int eLock){
dan8d22a172010-04-19 18:03:51 +0000922 assert( sqlite3_mutex_held(pSummary->mutex) );
dan3de777f2010-04-17 12:31:37 +0000923 assert( eLock==LOG_RDLOCK || eLock==LOG_WRLOCK );
924 return logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
925}
926
dan8d22a172010-04-19 18:03:51 +0000927/*
928** Lock (or unlock) the MUTEX region. It is always locked using an
929** EXCLUSIVE, blocking lock.
930*/
dan3de777f2010-04-17 12:31:37 +0000931static int logLockMutex(LogSummary *pSummary, int eLock){
dan8d22a172010-04-19 18:03:51 +0000932 assert( sqlite3_mutex_held(pSummary->mutex) );
dan3de777f2010-04-17 12:31:37 +0000933 assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
934 logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
935 return SQLITE_OK;
936}
937
dan7c246102010-04-12 19:00:29 +0000938/*
939** This function intializes the connection to the log-summary identified
940** by struct pSummary.
941*/
dan3de777f2010-04-17 12:31:37 +0000942static int logSummaryInit(
943 LogSummary *pSummary, /* Log summary object to initialize */
944 sqlite3_file *pFd /* File descriptor open on log file */
945){
dan7c246102010-04-12 19:00:29 +0000946 int rc; /* Return Code */
947 char *zFile; /* File name for summary file */
948
949 assert( pSummary->fd<0 );
950 assert( pSummary->aData==0 );
951 assert( pSummary->nRef>0 );
952 assert( pSummary->zPath );
953
954 /* Open a file descriptor on the summary file. */
955 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
956 if( !zFile ){
957 return SQLITE_NOMEM;
958 }
959 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
960 sqlite3_free(zFile);
961 if( pSummary->fd<0 ){
962 return SQLITE_IOERR;
963 }
964
dan3de777f2010-04-17 12:31:37 +0000965 /* Grab an exclusive lock the summary file. Then mmap() it.
966 **
967 ** TODO: This code needs to be enhanced to support a growable mapping.
968 ** For now, just make the mapping very large to start with. The
969 ** pages should not be allocated until they are first accessed anyhow,
970 ** so using a large mapping consumes no more resources than a smaller
971 ** one would.
dan7c246102010-04-12 19:00:29 +0000972 */
dan3de777f2010-04-17 12:31:37 +0000973 assert( sqlite3_mutex_held(pSummary->mutex) );
974 rc = logLockMutex(pSummary, LOG_WRLOCKW);
dan7c246102010-04-12 19:00:29 +0000975 if( rc!=SQLITE_OK ) return rc;
dance4f05f2010-04-22 19:14:13 +0000976 rc = logSummaryMap(pSummary, LOGSUMMARY_MMAP_INCREMENT);
dan7c246102010-04-12 19:00:29 +0000977 if( rc!=SQLITE_OK ) goto out;
978
dan3de777f2010-04-17 12:31:37 +0000979 /* Try to obtain an EXCLUSIVE lock on the dead-mans-hand region. If this
980 ** is possible, the contents of the log-summary file (if any) may not
981 ** be trusted. Zero the log-summary header before continuing.
dan7c246102010-04-12 19:00:29 +0000982 */
dan3de777f2010-04-17 12:31:37 +0000983 rc = logLockDMH(pSummary, LOG_WRLOCK);
dan7c246102010-04-12 19:00:29 +0000984 if( rc==SQLITE_OK ){
dan7c246102010-04-12 19:00:29 +0000985 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
dan3de777f2010-04-17 12:31:37 +0000986 }
987 rc = logLockDMH(pSummary, LOG_RDLOCK);
988 if( rc!=SQLITE_OK ){
dan8d22a172010-04-19 18:03:51 +0000989 rc = SQLITE_IOERR;
dan7c246102010-04-12 19:00:29 +0000990 }
991
992 out:
dan3de777f2010-04-17 12:31:37 +0000993 logLockMutex(pSummary, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +0000994 return rc;
995}
996
997/*
998** Open a connection to the log file associated with database zDb. The
999** database file does not actually have to exist. zDb is used only to
1000** figure out the name of the log file to open. If the log file does not
1001** exist it is created by this call.
dan3de777f2010-04-17 12:31:37 +00001002**
1003** A SHARED lock should be held on the database file when this function
1004** is called. The purpose of this SHARED lock is to prevent any other
1005** client from unlinking the log or log-summary file. If another process
1006** were to do this just after this client opened one of these files, the
1007** system would be badly broken.
dan7c246102010-04-12 19:00:29 +00001008*/
drhc438efd2010-04-26 00:19:45 +00001009int sqlite3WalOpen(
dan7c246102010-04-12 19:00:29 +00001010 sqlite3_vfs *pVfs, /* vfs module to open log file with */
1011 const char *zDb, /* Name of database file */
1012 Log **ppLog /* OUT: Allocated Log handle */
1013){
danb9bf16b2010-04-14 11:23:30 +00001014 int rc = SQLITE_OK; /* Return Code */
dan7c246102010-04-12 19:00:29 +00001015 Log *pRet; /* Object to allocate and return */
1016 LogSummary *pSummary = 0; /* Summary object */
1017 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
1018 int flags; /* Flags passed to OsOpen() */
1019 char *zWal = 0; /* Path to WAL file */
1020 int nWal; /* Length of zWal in bytes */
1021
dan7c246102010-04-12 19:00:29 +00001022 assert( zDb );
dan7c246102010-04-12 19:00:29 +00001023
1024 /* Allocate an instance of struct Log to return. */
dan3de777f2010-04-17 12:31:37 +00001025 *ppLog = 0;
dan7c246102010-04-12 19:00:29 +00001026 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
1027 if( !pRet ) goto out;
1028 pRet->pVfs = pVfs;
1029 pRet->pFd = (sqlite3_file *)&pRet[1];
dan7c246102010-04-12 19:00:29 +00001030
1031 /* Normalize the path name. */
1032 zWal = sqlite3_mprintf("%s-wal", zDb);
1033 if( !zWal ) goto out;
1034 logNormalizePath(zWal);
dan67032392010-04-17 15:42:43 +00001035 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
dan7c246102010-04-12 19:00:29 +00001036 nWal = sqlite3Strlen30(zWal);
1037
1038 /* Enter the mutex that protects the linked-list of LogSummary structures */
1039 if( sqlite3GlobalConfig.bCoreMutex ){
1040 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
1041 }
1042 sqlite3_mutex_enter(mutex);
1043
1044 /* Search for an existing log summary object in the linked list. If one
1045 ** cannot be found, allocate and initialize a new object.
1046 */
1047 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
1048 int nPath = sqlite3Strlen30(pSummary->zPath);
1049 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
1050 }
1051 if( !pSummary ){
1052 int nByte = sizeof(LogSummary) + nWal + 1;
1053 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
1054 if( !pSummary ){
1055 rc = SQLITE_NOMEM;
1056 goto out;
1057 }
1058 if( sqlite3GlobalConfig.bCoreMutex ){
1059 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
1060 }
1061 pSummary->zPath = (char *)&pSummary[1];
1062 pSummary->fd = -1;
1063 memcpy(pSummary->zPath, zWal, nWal);
1064 pSummary->pNext = pLogSummary;
1065 pLogSummary = pSummary;
1066 }
1067 pSummary->nRef++;
1068 pRet->pSummary = pSummary;
1069
1070 /* Exit the mutex protecting the linked-list of LogSummary objects. */
1071 sqlite3_mutex_leave(mutex);
1072 mutex = 0;
1073
1074 /* Open file handle on the log file. */
1075 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
1076 if( rc!=SQLITE_OK ) goto out;
1077
1078 /* Object pSummary is shared between all connections to the database made
1079 ** by this process. So at this point it may or may not be connected to
dan3de777f2010-04-17 12:31:37 +00001080 ** the log-summary. If it is not, connect it.
dan7c246102010-04-12 19:00:29 +00001081 */
1082 sqlite3_mutex_enter(pSummary->mutex);
1083 mutex = pSummary->mutex;
1084 if( pSummary->fd<0 ){
1085 rc = logSummaryInit(pSummary, pRet->pFd);
dan7c246102010-04-12 19:00:29 +00001086 }
1087
dan64d039e2010-04-13 19:27:31 +00001088 pRet->lock.pNext = pSummary->pLock;
1089 pSummary->pLock = &pRet->lock;
1090
dan7c246102010-04-12 19:00:29 +00001091 out:
1092 sqlite3_mutex_leave(mutex);
1093 sqlite3_free(zWal);
1094 if( rc!=SQLITE_OK ){
1095 assert(0);
1096 if( pRet ){
1097 sqlite3OsClose(pRet->pFd);
1098 sqlite3_free(pRet);
1099 }
1100 assert( !pSummary || pSummary->nRef==0 );
1101 sqlite3_free(pSummary);
1102 }
1103 *ppLog = pRet;
1104 return rc;
1105}
1106
dan4a4b01d2010-04-16 11:30:18 +00001107static int logIteratorNext(
1108 LogIterator *p, /* Iterator */
dan7c246102010-04-12 19:00:29 +00001109 u32 *piPage, /* OUT: Next db page to write */
1110 u32 *piFrame /* OUT: Log frame to read from */
1111){
1112 u32 iMin = *piPage;
1113 u32 iRet = 0xFFFFFFFF;
1114 int i;
1115 int nBlock = p->nFinal;
1116
1117 for(i=p->nSegment-1; i>=0; i--){
1118 struct LogSegment *pSegment = &p->aSegment[i];
1119 while( pSegment->iNext<nBlock ){
1120 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
1121 if( iPg>iMin ){
1122 if( iPg<iRet ){
1123 iRet = iPg;
1124 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
1125 }
1126 break;
1127 }
1128 pSegment->iNext++;
1129 }
1130
1131 nBlock = 256;
1132 }
1133
1134 *piPage = iRet;
1135 return (iRet==0xFFFFFFFF);
1136}
1137
dan4a4b01d2010-04-16 11:30:18 +00001138static LogIterator *logIteratorInit(Log *pLog){
dan7c246102010-04-12 19:00:29 +00001139 u32 *aData = pLog->pSummary->aData;
dan4a4b01d2010-04-16 11:30:18 +00001140 LogIterator *p; /* Return value */
dan7c246102010-04-12 19:00:29 +00001141 int nSegment; /* Number of segments to merge */
1142 u32 iLast; /* Last frame in log */
1143 int nByte; /* Number of bytes to allocate */
1144 int i; /* Iterator variable */
1145 int nFinal; /* Number of unindexed entries */
1146 struct LogSegment *pFinal; /* Final (unindexed) segment */
1147 u8 *aTmp; /* Temp space used by merge-sort */
1148
1149 iLast = pLog->hdr.iLastPg;
1150 nSegment = (iLast >> 8) + 1;
1151 nFinal = (iLast & 0x000000FF);
1152
dan4a4b01d2010-04-16 11:30:18 +00001153 nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
1154 p = (LogIterator *)sqlite3_malloc(nByte);
dan7c246102010-04-12 19:00:29 +00001155 if( p ){
1156 memset(p, 0, nByte);
1157 p->nSegment = nSegment;
1158 p->nFinal = nFinal;
1159 }
1160
1161 for(i=0; i<nSegment-1; i++){
1162 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
1163 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
1164 }
1165 pFinal = &p->aSegment[nSegment-1];
1166
1167 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
1168 pFinal->aIndex = (u8 *)&pFinal[1];
1169 aTmp = &pFinal->aIndex[256];
1170 for(i=0; i<nFinal; i++){
1171 pFinal->aIndex[i] = i;
1172 }
1173 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
1174 p->nFinal = nFinal;
1175
1176 return p;
1177}
1178
1179/*
dan4a4b01d2010-04-16 11:30:18 +00001180** Free a log iterator allocated by logIteratorInit().
dan7c246102010-04-12 19:00:29 +00001181*/
dan4a4b01d2010-04-16 11:30:18 +00001182static void logIteratorFree(LogIterator *p){
dan7c246102010-04-12 19:00:29 +00001183 sqlite3_free(p);
1184}
1185
1186/*
1187** Checkpoint the contents of the log file.
1188*/
1189static int logCheckpoint(
1190 Log *pLog, /* Log connection */
1191 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001192 int sync_flags, /* Flags for OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001193 u8 *zBuf /* Temporary buffer to use */
1194){
1195 int rc; /* Return code */
1196 int pgsz = pLog->hdr.pgsz; /* Database page-size */
dan4a4b01d2010-04-16 11:30:18 +00001197 LogIterator *pIter = 0; /* Log iterator context */
dan7c246102010-04-12 19:00:29 +00001198 u32 iDbpage = 0; /* Next database page to write */
danb9bf16b2010-04-14 11:23:30 +00001199 u32 iFrame = 0; /* Log frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +00001200
danbb2e9c92010-04-15 13:33:18 +00001201 if( pLog->hdr.iLastPg==0 ){
1202 return SQLITE_OK;
1203 }
1204
dan7c246102010-04-12 19:00:29 +00001205 /* Allocate the iterator */
dan4a4b01d2010-04-16 11:30:18 +00001206 pIter = logIteratorInit(pLog);
dan7c246102010-04-12 19:00:29 +00001207 if( !pIter ) return SQLITE_NOMEM;
1208
1209 /* Sync the log file to disk */
danc5118782010-04-17 17:34:41 +00001210 if( sync_flags ){
1211 rc = sqlite3OsSync(pLog->pFd, sync_flags);
1212 if( rc!=SQLITE_OK ) goto out;
1213 }
dan7c246102010-04-12 19:00:29 +00001214
1215 /* Iterate through the contents of the log, copying data to the db file. */
dan4a4b01d2010-04-16 11:30:18 +00001216 while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
dan7c246102010-04-12 19:00:29 +00001217 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
dan97a31352010-04-16 13:59:31 +00001218 logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE
dan7c246102010-04-12 19:00:29 +00001219 );
1220 if( rc!=SQLITE_OK ) goto out;
1221 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
1222 if( rc!=SQLITE_OK ) goto out;
1223 }
1224
1225 /* Truncate the database file */
1226 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
1227 if( rc!=SQLITE_OK ) goto out;
1228
1229 /* Sync the database file. If successful, update the log-summary. */
danc5118782010-04-17 17:34:41 +00001230 if( sync_flags ){
1231 rc = sqlite3OsSync(pFd, sync_flags);
1232 if( rc!=SQLITE_OK ) goto out;
1233 }
dan7c246102010-04-12 19:00:29 +00001234 pLog->hdr.iLastPg = 0;
1235 pLog->hdr.iCheck1 = 2;
1236 pLog->hdr.iCheck2 = 3;
1237 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1238
1239 /* TODO: If a crash occurs and the current log is copied into the
1240 ** database there is no problem. However, if a crash occurs while
1241 ** writing the next transaction into the start of the log, such that:
1242 **
1243 ** * The first transaction currently in the log is left intact, but
1244 ** * The second (or subsequent) transaction is damaged,
1245 **
1246 ** then the database could become corrupt.
1247 **
1248 ** The easiest thing to do would be to write and sync a dummy header
1249 ** into the log at this point. Unfortunately, that turns out to be
1250 ** an unwelcome performance hit. Alternatives are...
1251 */
1252#if 0
1253 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
1254 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
1255 if( rc!=SQLITE_OK ) goto out;
1256 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1257#endif
1258
1259 out:
dan4a4b01d2010-04-16 11:30:18 +00001260 logIteratorFree(pIter);
dan7c246102010-04-12 19:00:29 +00001261 return rc;
1262}
1263
1264/*
1265** Close a connection to a log file.
1266*/
drhc438efd2010-04-26 00:19:45 +00001267int sqlite3WalClose(
dan7c246102010-04-12 19:00:29 +00001268 Log *pLog, /* Log to close */
1269 sqlite3_file *pFd, /* Database file */
danc5118782010-04-17 17:34:41 +00001270 int sync_flags, /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001271 u8 *zBuf /* Buffer of at least page-size bytes */
1272){
1273 int rc = SQLITE_OK;
1274 if( pLog ){
dan64d039e2010-04-13 19:27:31 +00001275 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +00001276 LogSummary *pSummary = pLog->pSummary;
1277 sqlite3_mutex *mutex = 0;
1278
dan64d039e2010-04-13 19:27:31 +00001279 sqlite3_mutex_enter(pSummary->mutex);
1280 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
1281 *ppL = pLog->lock.pNext;
1282 sqlite3_mutex_leave(pSummary->mutex);
1283
dan7c246102010-04-12 19:00:29 +00001284 if( sqlite3GlobalConfig.bCoreMutex ){
1285 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
1286 }
1287 sqlite3_mutex_enter(mutex);
1288
1289 /* Decrement the reference count on the log summary. If this is the last
1290 ** reference to the log summary object in this process, the object will
1291 ** be freed. If this is also the last connection to the database, then
1292 ** checkpoint the database and truncate the log and log-summary files
1293 ** to zero bytes in size.
1294 **/
1295 pSummary->nRef--;
1296 if( pSummary->nRef==0 ){
dan3de777f2010-04-17 12:31:37 +00001297 int rc;
dan7c246102010-04-12 19:00:29 +00001298 LogSummary **pp;
dan7c246102010-04-12 19:00:29 +00001299 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
1300 *pp = (*pp)->pNext;
dan3de777f2010-04-17 12:31:37 +00001301
1302 sqlite3_mutex_leave(mutex);
1303
1304 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
1305 if( rc==SQLITE_OK ){
1306
1307 /* This is the last connection to the database (including other
1308 ** processes). Do three things:
1309 **
1310 ** 1. Checkpoint the db.
1311 ** 2. Truncate the log file.
1312 ** 3. Unlink the log-summary file.
1313 */
danc5118782010-04-17 17:34:41 +00001314 rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
dan3de777f2010-04-17 12:31:37 +00001315 if( rc==SQLITE_OK ){
1316 rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0);
1317 }
1318
1319 logSummaryUnmap(pSummary, 1);
1320 }else{
1321 if( rc==SQLITE_BUSY ){
1322 rc = SQLITE_OK;
1323 }
1324 logSummaryUnmap(pSummary, 0);
1325 }
dan3de777f2010-04-17 12:31:37 +00001326
dan7c246102010-04-12 19:00:29 +00001327 sqlite3_mutex_free(pSummary->mutex);
1328 sqlite3_free(pSummary);
dan3de777f2010-04-17 12:31:37 +00001329 }else{
1330 sqlite3_mutex_leave(mutex);
dan7c246102010-04-12 19:00:29 +00001331 }
1332
dan7c246102010-04-12 19:00:29 +00001333 /* Close the connection to the log file and free the Log handle. */
1334 sqlite3OsClose(pLog->pFd);
1335 sqlite3_free(pLog);
1336 }
1337 return rc;
1338}
1339
1340/*
dan7c246102010-04-12 19:00:29 +00001341** Enter and leave the log-summary mutex. In this context, entering the
1342** log-summary mutex means:
1343**
1344** 1. Obtaining mutex pLog->pSummary->mutex, and
1345** 2. Taking an exclusive lock on the log-summary file.
1346**
1347** i.e. this mutex locks out other processes as well as other threads
1348** hosted in this address space.
1349*/
1350static int logEnterMutex(Log *pLog){
1351 LogSummary *pSummary = pLog->pSummary;
1352 int rc;
1353
1354 sqlite3_mutex_enter(pSummary->mutex);
dan3de777f2010-04-17 12:31:37 +00001355 rc = logLockMutex(pSummary, LOG_WRLOCKW);
dan7c246102010-04-12 19:00:29 +00001356 if( rc!=SQLITE_OK ){
1357 sqlite3_mutex_leave(pSummary->mutex);
1358 }
1359 return rc;
1360}
1361static void logLeaveMutex(Log *pLog){
1362 LogSummary *pSummary = pLog->pSummary;
dan3de777f2010-04-17 12:31:37 +00001363 logLockMutex(pSummary, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001364 sqlite3_mutex_leave(pSummary->mutex);
1365}
1366
1367/*
danb9bf16b2010-04-14 11:23:30 +00001368** Try to read the log-summary header. Attempt to verify the header
1369** checksum. If the checksum can be verified, copy the log-summary
1370** header into structure pLog->hdr. If the contents of pLog->hdr are
1371** modified by this and pChanged is not NULL, set *pChanged to 1.
1372** Otherwise leave *pChanged unmodified.
1373**
1374** If the checksum cannot be verified return SQLITE_ERROR.
1375*/
1376int logSummaryTryHdr(Log *pLog, int *pChanged){
1377 u32 aCksum[2] = {1, 1};
1378 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1379
1380 /* First try to read the header without a lock. Verify the checksum
1381 ** before returning. This will almost always work.
1382 */
1383 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1384 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1385 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1386 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1387 ){
1388 return SQLITE_ERROR;
1389 }
1390
1391 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1392 if( pChanged ){
1393 *pChanged = 1;
1394 }
1395 memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
1396 }
1397 return SQLITE_OK;
1398}
1399
1400/*
1401** Read the log-summary header from the log-summary file into structure
1402** pLog->hdr. If attempting to verify the header checksum fails, try
1403** to recover the log before returning.
1404**
1405** If the log-summary header is successfully read, return SQLITE_OK.
1406** Otherwise an SQLite error code.
1407*/
1408int logSummaryReadHdr(Log *pLog, int *pChanged){
1409 int rc;
1410
1411 /* First try to read the header without a lock. Verify the checksum
1412 ** before returning. This will almost always work.
1413 */
1414 if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
1415 return SQLITE_OK;
1416 }
1417
1418 /* If the first attempt to read the header failed, lock the log-summary
1419 ** file and try again. If the header checksum verification fails this
1420 ** time as well, run log recovery.
1421 */
1422 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1423 if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
1424 if( pChanged ){
1425 *pChanged = 1;
1426 }
1427 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1428 if( rc==SQLITE_OK ){
1429 rc = logSummaryTryHdr(pLog, 0);
1430 }
1431 }
1432 logLeaveMutex(pLog);
1433 }
1434
1435 return rc;
1436}
1437
1438/*
dan64d039e2010-04-13 19:27:31 +00001439** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001440**
1441** If this call obtains a new read-lock and the database contents have been
1442** modified since the most recent call to LogCloseSnapshot() on this Log
1443** connection, then *pChanged is set to 1 before returning. Otherwise, it
1444** is left unmodified. This is used by the pager layer to determine whether
1445** or not any cached pages may be safely reused.
1446*/
drhc438efd2010-04-26 00:19:45 +00001447int sqlite3WalOpenSnapshot(Log *pLog, int *pChanged){
dan7c246102010-04-12 19:00:29 +00001448 int rc = SQLITE_OK;
1449 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001450 int nAttempt;
1451
1452 /* Obtain a snapshot-lock on the log-summary file. The procedure
1453 ** for obtaining the snapshot log is:
1454 **
1455 ** 1. Attempt a SHARED lock on regions A and B.
1456 ** 2a. If step 1 is successful, drop the lock on region B.
1457 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1458 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1459 ** successful.
1460 **
1461 ** If neither of the locks can be obtained after 5 tries, presumably
1462 ** something is wrong (i.e. a process not following the locking protocol).
1463 ** Return an error code in this case.
1464 */
1465 rc = SQLITE_BUSY;
1466 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1467 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1468 if( rc==SQLITE_BUSY ){
1469 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1470 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1471 }else{
1472 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1473 pLog->isLocked = LOG_REGION_A;
1474 }
1475 }
1476 if( rc!=SQLITE_OK ){
1477 return rc;
1478 }
1479
danb9bf16b2010-04-14 11:23:30 +00001480 rc = logSummaryReadHdr(pLog, pChanged);
dan64d039e2010-04-13 19:27:31 +00001481 if( rc!=SQLITE_OK ){
1482 /* An error occured while attempting log recovery. */
drhc438efd2010-04-26 00:19:45 +00001483 sqlite3WalCloseSnapshot(pLog);
dan64d039e2010-04-13 19:27:31 +00001484 }
dan7c246102010-04-12 19:00:29 +00001485 }
1486 return rc;
1487}
1488
1489/*
1490** Unlock the current snapshot.
1491*/
drhc438efd2010-04-26 00:19:45 +00001492void sqlite3WalCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001493 if( pLog->isLocked ){
1494 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1495 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1496 }
dan7c246102010-04-12 19:00:29 +00001497 pLog->isLocked = 0;
1498}
1499
dan7c246102010-04-12 19:00:29 +00001500/*
1501** Read a page from the log, if it is present.
1502*/
drhc438efd2010-04-26 00:19:45 +00001503int sqlite3WalRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
dan7c246102010-04-12 19:00:29 +00001504 u32 iRead = 0;
1505 u32 *aData = pLog->pSummary->aData;
1506 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1507
dan39c79f52010-04-15 10:58:51 +00001508 assert( pLog->isLocked );
1509
dan7c246102010-04-12 19:00:29 +00001510 /* Do a linear search of the unindexed block of page-numbers (if any)
1511 ** at the end of the log-summary. An alternative to this would be to
1512 ** build an index in private memory each time a read transaction is
1513 ** opened on a new snapshot.
1514 */
1515 if( pLog->hdr.iLastPg ){
1516 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1517 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1518 while( *pi!=pgno && pi!=piStop ) pi--;
1519 if( pi!=piStop ){
1520 iRead = (pi-piStop) + iFrame;
1521 }
1522 }
1523 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1524
1525 while( iRead==0 && iFrame>0 ){
1526 int iLow = 0;
1527 int iHigh = 255;
1528 u32 *aFrame;
1529 u8 *aIndex;
1530
1531 iFrame -= 256;
1532 aFrame = &aData[logSummaryEntry(iFrame+1)];
1533 aIndex = (u8 *)&aFrame[256];
1534
1535 while( iLow<=iHigh ){
1536 int iTest = (iLow+iHigh)>>1;
1537 u32 iPg = aFrame[aIndex[iTest]];
1538
1539 if( iPg==pgno ){
1540 iRead = iFrame + 1 + aIndex[iTest];
1541 break;
1542 }
1543 else if( iPg<pgno ){
1544 iLow = iTest+1;
1545 }else{
1546 iHigh = iTest-1;
1547 }
1548 }
1549 }
1550 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1551
1552 /* If iRead is non-zero, then it is the log frame number that contains the
1553 ** required page. Read and return data from the log file.
1554 */
1555 if( iRead ){
dan97a31352010-04-16 13:59:31 +00001556 i64 iOffset = logFrameOffset(iRead, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
dan7c246102010-04-12 19:00:29 +00001557 *pInLog = 1;
1558 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1559 }
1560
1561 *pInLog = 0;
1562 return SQLITE_OK;
1563}
1564
1565
1566/*
1567** Set *pPgno to the size of the database file (or zero, if unknown).
1568*/
drhc438efd2010-04-26 00:19:45 +00001569void sqlite3WalDbsize(Log *pLog, Pgno *pPgno){
dan7c246102010-04-12 19:00:29 +00001570 assert( pLog->isLocked );
1571 *pPgno = pLog->hdr.nPage;
1572}
1573
1574/*
dan7c246102010-04-12 19:00:29 +00001575** This function returns SQLITE_OK if the caller may write to the database.
1576** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001577** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001578*/
drhc438efd2010-04-26 00:19:45 +00001579int sqlite3WalWriteLock(Log *pLog, int op){
dan7c246102010-04-12 19:00:29 +00001580 assert( pLog->isLocked );
1581 if( op ){
dan64d039e2010-04-13 19:27:31 +00001582
1583 /* Obtain the writer lock */
1584 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1585 if( rc!=SQLITE_OK ){
1586 return rc;
1587 }
1588
dan39c79f52010-04-15 10:58:51 +00001589 /* If this is connection is a region D reader, then the SHARED lock on
1590 ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
1591 ** held on region A. This means that if the write-transaction is committed
dan49320f82010-04-14 18:50:08 +00001592 ** and this connection downgrades to a reader, it will be left with no
dan39c79f52010-04-15 10:58:51 +00001593 ** lock at all. And so its snapshot could get clobbered by a checkpoint
dan49320f82010-04-14 18:50:08 +00001594 ** operation.
1595 **
1596 ** To stop this from happening, grab a SHARED lock on region A now.
1597 ** This should always be successful, as the only time a client holds
1598 ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
1599 ** lock on region C (a checkpointer does this). This is not possible,
1600 ** as this connection currently has the EXCLUSIVE lock on region C.
dan02bb5962010-04-14 15:49:40 +00001601 */
dan49320f82010-04-14 18:50:08 +00001602 if( pLog->isLocked==LOG_REGION_D ){
1603 logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
1604 pLog->isLocked = LOG_REGION_A;
1605 }
dan02bb5962010-04-14 15:49:40 +00001606
dan39c79f52010-04-15 10:58:51 +00001607 /* If this connection is not reading the most recent database snapshot,
1608 ** it is not possible to write to the database. In this case release
1609 ** the write locks and return SQLITE_BUSY.
1610 */
dan7c246102010-04-12 19:00:29 +00001611 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
dan49320f82010-04-14 18:50:08 +00001612 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001613 return SQLITE_BUSY;
1614 }
1615 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001616
dan7c246102010-04-12 19:00:29 +00001617 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001618 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001619 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1620 pLog->isWriteLocked = 0;
1621 }
1622 return SQLITE_OK;
1623}
1624
dan74d6cd82010-04-24 18:44:05 +00001625/*
1626** The log handle passed to this function must be holding the write-lock.
1627**
1628** If any data has been written (but not committed) to the log file, this
1629** function moves the write-pointer back to the start of the transaction.
1630**
1631** Additionally, the callback function is invoked for each frame written
1632** to the log since the start of the transaction. If the callback returns
1633** other than SQLITE_OK, it is not invoked again and the error code is
1634** returned to the caller.
1635**
1636** Otherwise, if the callback function does not return an error, this
1637** function returns SQLITE_OK.
1638*/
drhc438efd2010-04-26 00:19:45 +00001639int sqlite3WalUndo(Log *pLog, int (*xUndo)(void *, Pgno), void *pUndoCtx){
dan74d6cd82010-04-24 18:44:05 +00001640 int rc = SQLITE_OK;
1641 Pgno iMax = pLog->hdr.iLastPg;
1642 Pgno iFrame;
1643
1644 assert( pLog->isWriteLocked );
1645 logSummaryReadHdr(pLog, 0);
1646 for(iFrame=pLog->hdr.iLastPg+1; iFrame<=iMax && rc==SQLITE_OK; iFrame++){
1647 rc = xUndo(pUndoCtx, pLog->pSummary->aData[logSummaryEntry(iFrame)]);
1648 }
1649 return rc;
1650}
1651
dan7c246102010-04-12 19:00:29 +00001652/*
dan3306c4a2010-04-23 19:15:00 +00001653** Return true if data has been written but not committed to the log file.
1654*/
drhc438efd2010-04-26 00:19:45 +00001655int sqlite3WalDirty(Log *pLog){
dan3306c4a2010-04-23 19:15:00 +00001656 assert( pLog->isWriteLocked );
1657 return( pLog->hdr.iLastPg!=((LogSummaryHdr*)pLog->pSummary->aData)->iLastPg );
1658}
1659
1660/*
dan7c246102010-04-12 19:00:29 +00001661** Write a set of frames to the log. The caller must hold at least a
1662** RESERVED lock on the database file.
1663*/
drhc438efd2010-04-26 00:19:45 +00001664int sqlite3WalFrames(
dan7c246102010-04-12 19:00:29 +00001665 Log *pLog, /* Log handle to write to */
1666 int nPgsz, /* Database page-size in bytes */
1667 PgHdr *pList, /* List of dirty pages to write */
1668 Pgno nTruncate, /* Database size after this commit */
1669 int isCommit, /* True if this is a commit */
danc5118782010-04-17 17:34:41 +00001670 int sync_flags /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001671){
dan7c246102010-04-12 19:00:29 +00001672 int rc; /* Used to catch return codes */
1673 u32 iFrame; /* Next frame address */
dan97a31352010-04-16 13:59:31 +00001674 u8 aFrame[LOG_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
dan7c246102010-04-12 19:00:29 +00001675 PgHdr *p; /* Iterator to run through pList with. */
dan97a31352010-04-16 13:59:31 +00001676 u32 aCksum[2]; /* Checksums */
dan7c246102010-04-12 19:00:29 +00001677 PgHdr *pLast; /* Last frame in list */
1678 int nLast = 0; /* Number of extra copies of last page */
1679
dan56d95912010-04-24 19:07:29 +00001680 assert( LOG_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
dan7c246102010-04-12 19:00:29 +00001681 assert( pList );
1682
dan97a31352010-04-16 13:59:31 +00001683 /* If this is the first frame written into the log, write the log
1684 ** header to the start of the log file. See comments at the top of
1685 ** this file for a description of the log-header format.
1686 */
1687 assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
1688 iFrame = pLog->hdr.iLastPg;
1689 if( iFrame==0 ){
1690 sqlite3Put4byte(aFrame, nPgsz);
1691 sqlite3_randomness(8, &aFrame[4]);
1692 pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
1693 pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
1694 rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
1695 if( rc!=SQLITE_OK ){
1696 return rc;
1697 }
1698 }
1699
dan7c246102010-04-12 19:00:29 +00001700 aCksum[0] = pLog->hdr.iCheck1;
1701 aCksum[1] = pLog->hdr.iCheck2;
1702
1703 /* Write the log file. */
dan7c246102010-04-12 19:00:29 +00001704 for(p=pList; p; p=p->pDirty){
1705 u32 nDbsize; /* Db-size field for frame header */
1706 i64 iOffset; /* Write offset in log file */
1707
dan97a31352010-04-16 13:59:31 +00001708 iOffset = logFrameOffset(++iFrame, nPgsz);
dan7c246102010-04-12 19:00:29 +00001709
1710 /* Populate and write the frame header */
1711 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1712 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1713 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1714 if( rc!=SQLITE_OK ){
1715 return rc;
1716 }
1717
1718 /* Write the page data */
1719 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1720 if( rc!=SQLITE_OK ){
1721 return rc;
1722 }
1723 pLast = p;
1724 }
1725
1726 /* Sync the log file if the 'isSync' flag was specified. */
danc5118782010-04-17 17:34:41 +00001727 if( sync_flags ){
dan7c246102010-04-12 19:00:29 +00001728 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
dan67032392010-04-17 15:42:43 +00001729 i64 iOffset = logFrameOffset(iFrame+1, nPgsz);
1730
1731 assert( isCommit );
dan7c246102010-04-12 19:00:29 +00001732
1733 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1734 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1735 }
1736 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1737 while( iOffset<iSegment ){
1738 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1739 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1740 if( rc!=SQLITE_OK ){
1741 return rc;
1742 }
1743
1744 iOffset += LOG_FRAME_HDRSIZE;
1745 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1746 if( rc!=SQLITE_OK ){
1747 return rc;
1748 }
1749 nLast++;
1750 iOffset += nPgsz;
1751 }
dan7c246102010-04-12 19:00:29 +00001752
danc5118782010-04-17 17:34:41 +00001753 rc = sqlite3OsSync(pLog->pFd, sync_flags);
dan7c246102010-04-12 19:00:29 +00001754 if( rc!=SQLITE_OK ){
1755 return rc;
1756 }
1757 }
1758
1759 /* Append data to the log summary. It is not necessary to lock the
1760 ** log-summary to do this as the RESERVED lock held on the db file
1761 ** guarantees that there are no other writers, and no data that may
1762 ** be in use by existing readers is being overwritten.
1763 */
1764 iFrame = pLog->hdr.iLastPg;
1765 for(p=pList; p; p=p->pDirty){
1766 iFrame++;
1767 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1768 }
1769 while( nLast>0 ){
1770 iFrame++;
1771 nLast--;
1772 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1773 }
1774
1775 /* Update the private copy of the header. */
1776 pLog->hdr.pgsz = nPgsz;
1777 pLog->hdr.iLastPg = iFrame;
1778 if( isCommit ){
1779 pLog->hdr.iChange++;
1780 pLog->hdr.nPage = nTruncate;
1781 }
1782 pLog->hdr.iCheck1 = aCksum[0];
1783 pLog->hdr.iCheck2 = aCksum[1];
1784
1785 /* If this is a commit, update the log-summary header too. */
1786 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1787 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1788 logLeaveMutex(pLog);
dan8d22a172010-04-19 18:03:51 +00001789 pLog->iCallback = iFrame;
dan7c246102010-04-12 19:00:29 +00001790 }
1791
dan8d22a172010-04-19 18:03:51 +00001792 return rc;
dan7c246102010-04-12 19:00:29 +00001793}
1794
1795/*
danb9bf16b2010-04-14 11:23:30 +00001796** Checkpoint the database:
1797**
1798** 1. Wait for an EXCLUSIVE lock on regions B and C.
1799** 2. Wait for an EXCLUSIVE lock on region A.
1800** 3. Copy the contents of the log into the database file.
1801** 4. Zero the log-summary header (so new readers will ignore the log).
1802** 5. Drop the locks obtained in steps 1 and 2.
dan7c246102010-04-12 19:00:29 +00001803*/
drhc438efd2010-04-26 00:19:45 +00001804int sqlite3WalCheckpoint(
dan7c246102010-04-12 19:00:29 +00001805 Log *pLog, /* Log connection */
1806 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001807 int sync_flags, /* Flags to sync db file with (or 0) */
dan64d039e2010-04-13 19:27:31 +00001808 u8 *zBuf, /* Temporary buffer to use */
1809 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1810 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001811){
danb9bf16b2010-04-14 11:23:30 +00001812 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001813
dan39c79f52010-04-15 10:58:51 +00001814 assert( !pLog->isLocked );
1815
1816 /* Wait for an EXCLUSIVE lock on regions B and C. */
dan64d039e2010-04-13 19:27:31 +00001817 do {
1818 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1819 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1820 if( rc!=SQLITE_OK ) return rc;
1821
dan39c79f52010-04-15 10:58:51 +00001822 /* Wait for an EXCLUSIVE lock on region A. */
dan64d039e2010-04-13 19:27:31 +00001823 do {
1824 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1825 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001826 if( rc!=SQLITE_OK ){
1827 logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1828 return rc;
1829 }
dan64d039e2010-04-13 19:27:31 +00001830
danb9bf16b2010-04-14 11:23:30 +00001831 /* Copy data from the log to the database file. */
1832 rc = logSummaryReadHdr(pLog, 0);
1833 if( rc==SQLITE_OK ){
danc5118782010-04-17 17:34:41 +00001834 rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
danb9bf16b2010-04-14 11:23:30 +00001835 }
1836
1837 /* Release the locks. */
dan64d039e2010-04-13 19:27:31 +00001838 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1839 return rc;
dan7c246102010-04-12 19:00:29 +00001840}
1841
drhc438efd2010-04-26 00:19:45 +00001842int sqlite3WalCallback(Log *pLog){
dan8d22a172010-04-19 18:03:51 +00001843 u32 ret = 0;
1844 if( pLog ){
1845 ret = pLog->iCallback;
1846 pLog->iCallback = 0;
1847 }
1848 return (int)ret;
1849}