blob: 0797822f8a49f5d48f33b2b6f1d93c133ff844ad [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
/*
** This file contains the implementation of a log file used in
** "journal_mode=wal" mode.
*/
6
/*
** LOG FILE FORMAT
**
** A log file consists of a header followed by zero or more log frames.
** The log header is 12 bytes in size and consists of the following three
** big-endian 32-bit unsigned integer values:
**
**     0: Database page size,
**     4: Randomly selected salt value 1,
**     8: Randomly selected salt value 2.
**
** Immediately following the log header are zero or more log frames. Each
** frame itself consists of a 16-byte header followed by <page-size> bytes
** of page data. The header is broken into 4 big-endian 32-bit unsigned
** integer values, as follows:
**
**     0: Page number.
**     4: For commit records, the size of the database image in pages
**        after the commit. For all other records, zero.
**     8: Checksum value 1.
**    12: Checksum value 2.
*/
29
/*
** LOG SUMMARY FILE FORMAT
**
** The log-summary file consists of a header region, followed by a
** region that contains no useful data (used to apply byte-range locks
** to), followed by the data region.
**
** The contents of both the header and data region are specified in terms
** of 1, 2 and 4 byte unsigned integers. All integers are stored in
** machine-endian order.
**
** A log-summary file is essentially a shadow-pager map. It contains a
** mapping from database page number to the set of locations in the log
** file that contain versions of the database page. When a database
** client needs to read a page of data, it first queries the log-summary
** file to determine if the required version of the page is stored in
** the log. If so, it is read from the log file. If not, it is read from
** the database file.
**
** Whenever a transaction is appended to the log or a checkpoint transfers
** data from the log file into the database file, the log-summary is
** updated accordingly.
**
** The fields in the log-summary file header are described in the comment
** directly above the definition of struct LogSummaryHdr (see below).
** Immediately following the fields in the LogSummaryHdr structure is
** an 8 byte checksum based on the contents of the header. This field is
** not the same as the iCheck1 and iCheck2 fields of the LogSummaryHdr.
*/
59
#include "wal.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
65
/* Forward declarations of the object types defined in this file. */
typedef struct LogSummaryHdr LogSummaryHdr;
typedef struct LogSummary LogSummary;
typedef struct LogIterator LogIterator;
typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000070
71
72/*
73** The following structure may be used to store the same data that
74** is stored in the log-summary header.
75**
76** Member variables iCheck1 and iCheck2 contain the checksum for the
77** last frame written to the log, or 2 and 3 respectively if the log
78** is currently empty.
79*/
80struct LogSummaryHdr {
81 u32 iChange; /* Counter incremented each transaction */
82 u32 pgsz; /* Database page size in bytes */
83 u32 iLastPg; /* Address of last valid frame in log */
84 u32 nPage; /* Size of database in pages */
85 u32 iCheck1; /* Checkpoint value 1 */
86 u32 iCheck2; /* Checkpoint value 2 */
87};
88
/* Number of 32-bit fields in a serialized LogSummaryHdr object. */
#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

/* A block of 16 bytes beginning at LOGSUMMARY_LOCK_OFFSET is reserved
** for locks. Since some systems only feature mandatory file-locks, we
** do not read or write data from the region of the file on which locks
** are applied.
**
** The offset skips the LogSummaryHdr fields and the two 32-bit header
** checksum words stored immediately after them.
*/
#define LOGSUMMARY_LOCK_OFFSET   ((sizeof(LogSummaryHdr))+2*sizeof(u32))
#define LOGSUMMARY_LOCK_RESERVED 16

/* Size of header before each frame in log file */
#define LOG_FRAME_HDRSIZE 16

/* Size of log header */
#define LOG_HDRSIZE 12
dan97a31352010-04-16 13:59:31 +0000105
/*
** Return the offset of frame iFrame in the log file, assuming a database
** page size of pgsz bytes. The offset returned is to the start of the
** log frame-header. Frames are numbered starting from 1.
**
** The multiplication is carried out in 64-bit arithmetic so that the
** result does not wrap for log files larger than 4GB.
*/
#define logFrameOffset(iFrame, pgsz) (                                 \
  LOG_HDRSIZE + ((iFrame)-1)*(i64)((pgsz)+LOG_FRAME_HDRSIZE)           \
)
dan7c246102010-04-12 19:00:29 +0000114
/*
** If using mmap() to access a shared (or otherwise) log-summary file, then
** the mapping size is incremented in units of the following size.
**
** A 64 KB log-summary mapping corresponds to a log file containing over
** 13000 frames, so the mapping size does not need to be increased often.
**
** In SQLITE_TEST builds the increment is a global variable with a much
** smaller default (128 bytes).
*/
#ifdef SQLITE_TEST
int sqlite3_walsummary_mmap_incr = 128;
# define LOGSUMMARY_MMAP_INCREMENT sqlite3_walsummary_mmap_incr
#else
# define LOGSUMMARY_MMAP_INCREMENT (64*1024)
#endif
dance4f05f2010-04-22 19:14:13 +0000128
129/*
dan7c246102010-04-12 19:00:29 +0000130** There is one instance of this structure for each log-summary object
131** that this process has a connection to. They are stored in a linked
132** list starting at pLogSummary (global variable).
133**
134** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
135** directly in this implementation because the VFS does not support
136** the required blocking file-locks.
137*/
138struct LogSummary {
139 sqlite3_mutex *mutex; /* Mutex used to protect this object */
140 int nRef; /* Number of pointers to this structure */
141 int fd; /* File descriptor open on log-summary */
142 char *zPath; /* Path to associated WAL file */
dan64d039e2010-04-13 19:27:31 +0000143 LogLock *pLock; /* Linked list of locks on this object */
dan7c246102010-04-12 19:00:29 +0000144 LogSummary *pNext; /* Next in global list */
dance4f05f2010-04-22 19:14:13 +0000145
dan7c246102010-04-12 19:00:29 +0000146 int nData; /* Size of aData allocation/mapping */
147 u32 *aData; /* File body */
148};
149
/*
** This module uses three different types of file-locks. All are taken
** on the log-summary file. The three types of locks are as follows:
**
**   MUTEX:  The MUTEX lock is used as a robust inter-process mutex. It
**           is held while the log-summary header is modified, and
**           sometimes when it is read. It is also held while a new client
**           obtains the DMH lock (see below), and while log recovery is
**           being run.
**
**   DMH:    The DMH (Dead Man's Hand mechanism) lock is used to ensure
**           that log-recovery is always run following a system restart.
**           When it first opens a log-summary file, a process takes a
**           SHARED lock on the DMH region. This lock is not released until
**           the log-summary file is closed.
**
**           The process then attempts to upgrade to an EXCLUSIVE lock. If
**           successful, then the contents of the log-summary file are deemed
**           suspect and the log-summary header zeroed. This forces the
**           first process that reads the log-summary file to run log
**           recovery. After zeroing the log-summary header, the process
**           downgrades to a SHARED lock on the DMH region.
**
**           If the attempt to obtain the EXCLUSIVE lock fails, then the
**           process concludes that some other process is already using the
**           log-summary file, and it can therefore be trusted.
**
**           The procedure described in the previous three paragraphs (taking
**           a SHARED lock and then upgrading to an EXCLUSIVE lock to check
**           if the process is the only one to have an open connection to the
**           log file) is protected by holding the MUTEX lock. This avoids the
**           race condition wherein the first two clients connect almost
**           simultaneously following a system restart and each prevents
**           the other from obtaining the EXCLUSIVE lock.
**
**
**   REGION: There are 4 different region locks, regions A, B, C and D.
**           Various EXCLUSIVE and SHARED locks on these regions are obtained
**           when a client reads, writes or checkpoints the database.
**
**           To obtain a reader lock:
**
**             1. Attempt a SHARED lock on regions A and B.
**             2. If step 1 is successful, drop the lock on region B. Or, if
**                it is unsuccessful, attempt a SHARED lock on region D.
**             3. Repeat the above until the lock attempt in step 1 or 2 is
**                successful.
**
**           The reader lock is released when the read transaction is finished.
**
**           To obtain a writer lock:
**
**             1. Take (wait for) an EXCLUSIVE lock on regions C and D.
**
**           The locks are released after the write transaction is finished
**           and, if any frames were committed to the log, the log-summary
**           file updated.
**
**           To obtain a checkpointer lock:
**
**             1. Take (wait for) an EXCLUSIVE lock on regions B and C.
**             2. Take (wait for) an EXCLUSIVE lock on region A.
**
**           Step 1 waits until any existing writer has finished. And forces
**           all new readers to become "region D" readers.
**
**           Step 2 causes the checkpointer to wait until all existing region A
**           readers have finished their transactions. Once the exclusive lock
**           on region A has been obtained, only "region D" readers exist.
**           These readers are operating on the snapshot at the head of the
**           log. As such, the log can be safely copied into the database file
**           without interfering with the readers.
**
**           Once the checkpoint has finished and the log-summary header
**           updated (to indicate the log contents can now be ignored), all
**           locks are released.
**
**           However, there may still exist region D readers using data in
**           the body of the log file, so the log file itself cannot be
**           truncated or overwritten until all region D readers have finished.
**           That requirement is satisfied, because writers (the clients that
**           write to the log file) require an exclusive lock on region D.
**           Which they cannot get until all region D readers have finished.
*/
/* Byte offsets within the log-summary file of the MUTEX lock, the DMH
** lock and the start of the REGION locks. All lie inside the reserved
** locking range that begins at LOGSUMMARY_LOCK_OFFSET.
*/
#define LOG_LOCK_MUTEX  (LOGSUMMARY_LOCK_OFFSET)
#define LOG_LOCK_DMH    (LOG_LOCK_MUTEX+1)
#define LOG_LOCK_REGION (LOG_LOCK_DMH+1)
dan64d039e2010-04-13 19:27:31 +0000237
/*
** The four lockable regions associated with each log-summary. A connection
** may take either a SHARED or EXCLUSIVE lock on each. An ORed combination
** of the following bitmasks is passed as the second argument to the
** logLockRegion() function.
*/
#define LOG_REGION_A 0x01
#define LOG_REGION_B 0x02
#define LOG_REGION_C 0x04
#define LOG_REGION_D 0x08
248
/*
** Values for the third parameter to logLockRegion(). The same values
** are used to index the lookup tables in logLockFd(), so they must
** remain the consecutive integers 0..3.
*/
#define LOG_UNLOCK  0             /* Unlock a range of bytes */
#define LOG_RDLOCK  1             /* Put a SHARED lock on a range of bytes */
#define LOG_WRLOCK  2             /* Put an EXCLUSIVE lock on a byte-range */
#define LOG_WRLOCKW 3             /* Block on EXCLUSIVE lock on a byte-range */
256
257/*
dan64d039e2010-04-13 19:27:31 +0000258** A single instance of this structure is allocated as part of each
259** connection to a database log. All structures associated with the
260** same log file are linked together into a list using LogLock.pNext
261** starting at LogSummary.pLock.
262**
263** The mLock field of the structure describes the locks (if any)
264** currently held by the connection. If a SHARED lock is held on
265** any of the four locking regions, then the associated LOG_REGION_X
266** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
267** then the (LOG_REGION_X << 8) bit is set.
268*/
269struct LogLock {
270 LogLock *pNext; /* Next lock on the same log */
271 u32 mLock; /* Mask of locks */
272};
dan7c246102010-04-12 19:00:29 +0000273
274struct Log {
275 LogSummary *pSummary; /* Log file summary data */
276 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
277 sqlite3_file *pFd; /* File handle for log file */
dan64d039e2010-04-13 19:27:31 +0000278 int isLocked; /* Non-zero if a snapshot is held open */
dan7c246102010-04-12 19:00:29 +0000279 int isWriteLocked; /* True if this is the writer connection */
dan8d22a172010-04-19 18:03:51 +0000280 u32 iCallback; /* Value to pass to log callback (or 0) */
dan7c246102010-04-12 19:00:29 +0000281 LogSummaryHdr hdr; /* Log summary header for current snapshot */
dan64d039e2010-04-13 19:27:31 +0000282 LogLock lock; /* Lock held by this connection (if any) */
dan7c246102010-04-12 19:00:29 +0000283};
284
dan64d039e2010-04-13 19:27:31 +0000285
dan7c246102010-04-12 19:00:29 +0000286/*
287** This structure is used to implement an iterator that iterates through
288** all frames in the log in database page order. Where two or more frames
289** correspond to the same database page, the iterator visits only the
290** frame most recently written to the log.
291**
292** The internals of this structure are only accessed by:
293**
dan4a4b01d2010-04-16 11:30:18 +0000294** logIteratorInit() - Create a new iterator,
295** logIteratorNext() - Step an iterator,
296** logIteratorFree() - Free an iterator.
dan7c246102010-04-12 19:00:29 +0000297**
298** This functionality is used by the checkpoint code (see logCheckpoint()).
299*/
dan4a4b01d2010-04-16 11:30:18 +0000300struct LogIterator {
301 int nSegment; /* Size of LogIterator.aSegment[] array */
dan7c246102010-04-12 19:00:29 +0000302 int nFinal; /* Elements in segment nSegment-1 */
303 struct LogSegment {
304 int iNext; /* Next aIndex index */
305 u8 *aIndex; /* Pointer to index array */
306 u32 *aDbPage; /* Pointer to db page array */
307 } aSegment[1];
308};
309
dan64d039e2010-04-13 19:27:31 +0000310
dan97a31352010-04-16 13:59:31 +0000311
dan64d039e2010-04-13 19:27:31 +0000312/*
313** List of all LogSummary objects created by this process. Protected by
314** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
315** here instead of borrowing the LRU mutex.
316*/
317#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
318static LogSummary *pLogSummary = 0;
319
dan7c246102010-04-12 19:00:29 +0000320/*
321** Generate an 8 byte checksum based on the data in array aByte[] and the
322** initial values of aCksum[0] and aCksum[1]. The checksum is written into
323** aCksum[] before returning.
dan56d95912010-04-24 19:07:29 +0000324**
325** The range of bytes to checksum is treated as an array of 32-bit
326** little-endian unsigned integers. For each integer X in the array, from
327** start to finish, do the following:
328**
329** aCksum[0] += X;
330** aCksum[1] += aCksum[0];
331**
332** For the calculation above, use 64-bit unsigned accumulators. Before
333** returning, truncate the values to 32-bits as follows:
334**
335** aCksum[0] = (u32)(aCksum[0] + (aCksum[0]>>24));
336** aCksum[1] = (u32)(aCksum[1] + (aCksum[1]>>24));
dan7c246102010-04-12 19:00:29 +0000337*/
dan7c246102010-04-12 19:00:29 +0000338static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000339 u64 sum1 = aCksum[0];
340 u64 sum2 = aCksum[1];
341 u32 *a32 = (u32 *)aByte;
342 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000343
dan7c246102010-04-12 19:00:29 +0000344 assert( (nByte&0x00000003)==0 );
345
dance4f05f2010-04-22 19:14:13 +0000346 if( SQLITE_LITTLEENDIAN ){
347#ifdef SQLITE_DEBUG
348 u8 *a = (u8 *)a32;
349 assert( *a32==(a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24)) );
350#endif
351 do {
352 sum1 += *a32;
353 sum2 += sum1;
354 } while( ++a32<aEnd );
355 }else{
356 do {
357 u8 *a = (u8*)a32;
358 sum1 += a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24);
359 sum2 += sum1;
360 } while( ++a32<aEnd );
361 }
dan7c246102010-04-12 19:00:29 +0000362
dan39c79f52010-04-15 10:58:51 +0000363 aCksum[0] = sum1 + (sum1>>24);
364 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000365}
366
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing any "./" or "../"
** elements in the path. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** Runs of consecutive '/' characters are collapsed to a single '/' and
** trailing '/' characters are removed (a path consisting only of '/'
** characters is reduced to "/").
*/
static void logNormalizePath(char *zPath){
  int i, j;
  char *z = zPath;
  int n = (int)strlen(z);

  /* Ignore trailing '/' characters, but keep at least one byte of input. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* Collapse "//". Only look at bytes inside the trimmed range; the
      ** byte at z[n] may be one of the trailing '/' removed above, and
      ** treating it as a duplicate would drop the final '/' of "//". */
      if( i+1<n && z[i+1]=='/' ) continue;

      /* Skip over a "/./" element. */
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        i += 1;
        continue;
      }

      /* For a "/../" element, discard the previous output element. */
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
402
403/*
dan7c246102010-04-12 19:00:29 +0000404** Unmap the log-summary mapping and close the file-descriptor. If
405** the isTruncate argument is non-zero, truncate the log-summary file
406** region to zero bytes.
407**
408** Regardless of the value of isTruncate, close the file-descriptor
409** opened on the log-summary file.
410*/
dan3de777f2010-04-17 12:31:37 +0000411static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
dan7c246102010-04-12 19:00:29 +0000412 int rc = SQLITE_OK;
413 if( pSummary->aData ){
414 assert( pSummary->fd>0 );
415 munmap(pSummary->aData, pSummary->nData);
416 pSummary->aData = 0;
dan3de777f2010-04-17 12:31:37 +0000417 if( isUnlink ){
418 char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
419 if( !zFile ){
420 rc = SQLITE_NOMEM;
421 }
422 unlink(zFile);
423 sqlite3_free(zFile);
dan7c246102010-04-12 19:00:29 +0000424 }
425 }
426 if( pSummary->fd>0 ){
427 close(pSummary->fd);
428 pSummary->fd = -1;
429 }
430 return rc;
431}
432
dan7c246102010-04-12 19:00:29 +0000433static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
danff207012010-04-24 04:49:15 +0000434 u32 *aHdr = pSummary->aData; /* Write header here */
435 u32 *aCksum = &aHdr[LOGSUMMARY_HDR_NFIELD]; /* Write header cksum here */
436
437 assert( LOGSUMMARY_HDR_NFIELD==sizeof(LogSummaryHdr)/4 );
438 memcpy(aHdr, pHdr, sizeof(LogSummaryHdr));
439 aCksum[0] = aCksum[1] = 1;
440 logChecksumBytes((u8 *)aHdr, sizeof(LogSummaryHdr), aCksum);
dan7c246102010-04-12 19:00:29 +0000441}
442
443/*
444** This function encodes a single frame header and writes it to a buffer
445** supplied by the caller. A log frame-header is made up of a series of
446** 4-byte big-endian integers, as follows:
447**
448** 0: Database page size in bytes.
449** 4: Page number.
450** 8: New database size (for commit frames, otherwise zero).
451** 12: Frame checksum 1.
452** 16: Frame checksum 2.
453*/
454static void logEncodeFrame(
455 u32 *aCksum, /* IN/OUT: Checksum values */
456 u32 iPage, /* Database page number for frame */
457 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
458 int nData, /* Database page size (size of aData[]) */
459 u8 *aData, /* Pointer to page data (for checksum) */
460 u8 *aFrame /* OUT: Write encoded frame here */
461){
dan97a31352010-04-16 13:59:31 +0000462 assert( LOG_FRAME_HDRSIZE==16 );
dan7c246102010-04-12 19:00:29 +0000463
dan97a31352010-04-16 13:59:31 +0000464 sqlite3Put4byte(&aFrame[0], iPage);
465 sqlite3Put4byte(&aFrame[4], nTruncate);
dan7c246102010-04-12 19:00:29 +0000466
dan97a31352010-04-16 13:59:31 +0000467 logChecksumBytes(aFrame, 8, aCksum);
dan7c246102010-04-12 19:00:29 +0000468 logChecksumBytes(aData, nData, aCksum);
469
dan97a31352010-04-16 13:59:31 +0000470 sqlite3Put4byte(&aFrame[8], aCksum[0]);
471 sqlite3Put4byte(&aFrame[12], aCksum[1]);
dan7c246102010-04-12 19:00:29 +0000472}
473
474/*
475** Return 1 and populate *piPage, *pnTruncate and aCksum if the
476** frame checksum looks Ok. Otherwise return 0.
477*/
478static int logDecodeFrame(
479 u32 *aCksum, /* IN/OUT: Checksum values */
480 u32 *piPage, /* OUT: Database page number for frame */
481 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
482 int nData, /* Database page size (size of aData[]) */
483 u8 *aData, /* Pointer to page data (for checksum) */
484 u8 *aFrame /* Frame data */
485){
dan97a31352010-04-16 13:59:31 +0000486 assert( LOG_FRAME_HDRSIZE==16 );
dan4a4b01d2010-04-16 11:30:18 +0000487
dan97a31352010-04-16 13:59:31 +0000488 logChecksumBytes(aFrame, 8, aCksum);
dan7c246102010-04-12 19:00:29 +0000489 logChecksumBytes(aData, nData, aCksum);
490
dan97a31352010-04-16 13:59:31 +0000491 if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
492 || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
dan7c246102010-04-12 19:00:29 +0000493 ){
494 /* Checksum failed. */
495 return 0;
496 }
497
dan97a31352010-04-16 13:59:31 +0000498 *piPage = sqlite3Get4byte(&aFrame[0]);
499 *pnTruncate = sqlite3Get4byte(&aFrame[4]);
dan7c246102010-04-12 19:00:29 +0000500 return 1;
501}
502
503static void logMergesort8(
504 Pgno *aContent, /* Pages in log */
505 u8 *aBuffer, /* Buffer of at least *pnList items to use */
506 u8 *aList, /* IN/OUT: List to sort */
507 int *pnList /* IN/OUT: Number of elements in aList[] */
508){
509 int nList = *pnList;
510 if( nList>1 ){
511 int nLeft = nList / 2; /* Elements in left list */
512 int nRight = nList - nLeft; /* Elements in right list */
513 u8 *aLeft = aList; /* Left list */
514 u8 *aRight = &aList[nLeft]; /* Right list */
515 int iLeft = 0; /* Current index in aLeft */
516 int iRight = 0; /* Current index in aright */
517 int iOut = 0; /* Current index in output buffer */
518
519 /* TODO: Change to non-recursive version. */
520 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
521 logMergesort8(aContent, aBuffer, aRight, &nRight);
522
523 while( iRight<nRight || iLeft<nLeft ){
524 u8 logpage;
525 Pgno dbpage;
526
527 if( (iLeft<nLeft)
528 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
529 ){
530 logpage = aLeft[iLeft++];
531 }else{
532 logpage = aRight[iRight++];
533 }
534 dbpage = aContent[logpage];
535
536 aBuffer[iOut++] = logpage;
537 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
538
539 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
540 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
541 }
542 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
543 *pnList = iOut;
544 }
545
546#ifdef SQLITE_DEBUG
547 {
548 int i;
549 for(i=1; i<*pnList; i++){
550 assert( aContent[aList[i]] > aContent[aList[i-1]] );
551 }
552 }
553#endif
554}
555
556
557/*
dance4f05f2010-04-22 19:14:13 +0000558** Memory map the first nByte bytes of the summary file opened with
559** pSummary->fd at pSummary->aData. If the summary file is smaller than
560** nByte bytes in size when this function is called, ftruncate() is
561** used to expand it before it is mapped.
562**
563** It is assumed that an exclusive lock is held on the summary file
564** by the caller (to protect the ftruncate()).
565*/
566static int logSummaryMap(LogSummary *pSummary, int nByte){
567 struct stat sStat;
568 int rc;
569 int fd = pSummary->fd;
570 void *pMap;
571
572 assert( pSummary->aData==0 );
573
574 /* If the file is less than nByte bytes in size, cause it to grow. */
575 rc = fstat(fd, &sStat);
576 if( rc!=0 ) return SQLITE_IOERR;
577 if( sStat.st_size<nByte ){
578 rc = ftruncate(fd, nByte);
579 if( rc!=0 ) return SQLITE_IOERR;
580 }else{
581 nByte = sStat.st_size;
582 }
583
584 /* Map the file. */
585 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
586 if( pMap==MAP_FAILED ){
587 return SQLITE_IOERR;
588 }
589 pSummary->aData = (u32 *)pMap;
590 pSummary->nData = nByte/4;
591
592 return SQLITE_OK;
593}
594
595/*
dan7c246102010-04-12 19:00:29 +0000596** Return the index in the LogSummary.aData array that corresponds to
597** frame iFrame. The log-summary file consists of a header, followed by
598** alternating "map" and "index" blocks.
599*/
600static int logSummaryEntry(u32 iFrame){
danff207012010-04-24 04:49:15 +0000601 return (
602 (LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED)/sizeof(u32)
603 + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
604 + iFrame-1 /* Db page numbers that occur before iFrame */
605 );
dan7c246102010-04-12 19:00:29 +0000606}
607
608
609/*
610** Set an entry in the log-summary map to map log frame iFrame to db
611** page iPage. Values are always appended to the log-summary (i.e. the
612** value of iFrame is always exactly one more than the value passed to
613** the previous call), but that restriction is not enforced or asserted
614** here.
615*/
616static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
617 u32 iSlot = logSummaryEntry(iFrame);
618
dancd11fb22010-04-26 10:40:52 +0000619 while( (iSlot+128)>=pSummary->nData ){
dance4f05f2010-04-22 19:14:13 +0000620 int nByte = pSummary->nData*4 + LOGSUMMARY_MMAP_INCREMENT;
621
dancd11fb22010-04-26 10:40:52 +0000622 /* Unmap and remap the log-summary file. */
dance4f05f2010-04-22 19:14:13 +0000623 sqlite3_mutex_enter(pSummary->mutex);
624 munmap(pSummary->aData, pSummary->nData*4);
625 pSummary->aData = 0;
626 logSummaryMap(pSummary, nByte);
627 sqlite3_mutex_leave(pSummary->mutex);
628 }
629
dan7c246102010-04-12 19:00:29 +0000630 /* Set the log-summary entry itself */
631 pSummary->aData[iSlot] = iPage;
632
633 /* If the frame number is a multiple of 256 (frames are numbered starting
634 ** at 1), build an index of the most recently added 256 frames.
635 */
636 if( (iFrame&0x000000FF)==0 ){
637 int i; /* Iterator used while initializing aIndex */
638 u32 *aFrame; /* Pointer to array of 256 frames */
639 int nIndex; /* Number of entries in index */
640 u8 *aIndex; /* 256 bytes to build index in */
641 u8 *aTmp; /* Scratch space to use while sorting */
642
643 aFrame = &pSummary->aData[iSlot-255];
644 aIndex = (u8 *)&pSummary->aData[iSlot+1];
645 aTmp = &aIndex[256];
646
647 nIndex = 256;
648 for(i=0; i<256; i++) aIndex[i] = (u8)i;
649 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
650 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
651 }
652}
653
654
655/*
656** Recover the log-summary by reading the log file. The caller must hold
657** an exclusive lock on the log-summary file.
658*/
659static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
660 int rc; /* Return Code */
661 i64 nSize; /* Size of log file */
662 LogSummaryHdr hdr; /* Recovered log-summary header */
663
664 memset(&hdr, 0, sizeof(hdr));
665
666 rc = sqlite3OsFileSize(pFd, &nSize);
667 if( rc!=SQLITE_OK ){
668 return rc;
669 }
670
671 if( nSize>LOG_FRAME_HDRSIZE ){
672 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
673 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
674 int nFrame; /* Number of bytes at aFrame */
675 u8 *aData; /* Pointer to data part of aFrame buffer */
676 int iFrame; /* Index of last frame read */
677 i64 iOffset; /* Next offset to read from log file */
678 int nPgsz; /* Page size according to the log */
dan97a31352010-04-16 13:59:31 +0000679 u32 aCksum[2]; /* Running checksum */
dan7c246102010-04-12 19:00:29 +0000680
681 /* Read in the first frame header in the file (to determine the
682 ** database page size).
683 */
dan97a31352010-04-16 13:59:31 +0000684 rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000685 if( rc!=SQLITE_OK ){
686 return rc;
687 }
688
689 /* If the database page size is not a power of two, or is greater than
690 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
691 */
692 nPgsz = sqlite3Get4byte(&aBuf[0]);
dance4f05f2010-04-22 19:14:13 +0000693 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE || nPgsz<512 ){
dan7c246102010-04-12 19:00:29 +0000694 goto finished;
695 }
dan97a31352010-04-16 13:59:31 +0000696 aCksum[0] = sqlite3Get4byte(&aBuf[4]);
697 aCksum[1] = sqlite3Get4byte(&aBuf[8]);
dan7c246102010-04-12 19:00:29 +0000698
699 /* Malloc a buffer to read frames into. */
700 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
701 aFrame = (u8 *)sqlite3_malloc(nFrame);
702 if( !aFrame ){
703 return SQLITE_NOMEM;
704 }
705 aData = &aFrame[LOG_FRAME_HDRSIZE];
706
707 /* Read all frames from the log file. */
708 iFrame = 0;
dan97a31352010-04-16 13:59:31 +0000709 for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
dan7c246102010-04-12 19:00:29 +0000710 u32 pgno; /* Database page number for frame */
711 u32 nTruncate; /* dbsize field from frame header */
712 int isValid; /* True if this frame is valid */
713
714 /* Read and decode the next log frame. */
715 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
716 if( rc!=SQLITE_OK ) break;
717 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
718 if( !isValid ) break;
719 logSummaryAppend(pSummary, ++iFrame, pgno);
720
721 /* If nTruncate is non-zero, this is a commit record. */
722 if( nTruncate ){
723 hdr.iCheck1 = aCksum[0];
724 hdr.iCheck2 = aCksum[1];
725 hdr.iLastPg = iFrame;
726 hdr.nPage = nTruncate;
727 hdr.pgsz = nPgsz;
728 }
729 }
730
731 sqlite3_free(aFrame);
732 }else{
733 hdr.iCheck1 = 2;
734 hdr.iCheck2 = 3;
735 }
736
737finished:
738 logSummaryWriteHdr(pSummary, &hdr);
739 return rc;
740}
741
dan3de777f2010-04-17 12:31:37 +0000742/*
dan8d22a172010-04-19 18:03:51 +0000743** Place, modify or remove a lock on the log-summary file associated
744** with pSummary.
danff207012010-04-24 04:49:15 +0000745**
746** The locked byte-range should be inside the region dedicated to
747** locking. This region of the log-summary file is never read or written.
dan3de777f2010-04-17 12:31:37 +0000748*/
dan8d22a172010-04-19 18:03:51 +0000749static int logLockFd(
750 LogSummary *pSummary, /* The log-summary object to lock */
751 int iStart, /* First byte to lock */
752 int nByte, /* Number of bytes to lock */
753 int op /* LOG_UNLOCK, RDLOCK, WRLOCK or WRLOCKW */
754){
dan3de777f2010-04-17 12:31:37 +0000755 int aType[4] = {
dan8d22a172010-04-19 18:03:51 +0000756 F_UNLCK, /* LOG_UNLOCK */
757 F_RDLCK, /* LOG_RDLOCK */
758 F_WRLCK, /* LOG_WRLOCK */
759 F_WRLCK /* LOG_WRLOCKW */
dan3de777f2010-04-17 12:31:37 +0000760 };
761 int aOp[4] = {
dan8d22a172010-04-19 18:03:51 +0000762 F_SETLK, /* LOG_UNLOCK */
763 F_SETLK, /* LOG_RDLOCK */
764 F_SETLK, /* LOG_WRLOCK */
765 F_SETLKW /* LOG_WRLOCKW */
dan3de777f2010-04-17 12:31:37 +0000766 };
dan8d22a172010-04-19 18:03:51 +0000767 struct flock f; /* Locking operation */
768 int rc; /* Value returned by fcntl() */
dan3de777f2010-04-17 12:31:37 +0000769
770 assert( ArraySize(aType)==ArraySize(aOp) );
771 assert( op>=0 && op<ArraySize(aType) );
danff207012010-04-24 04:49:15 +0000772 assert( nByte>0 );
773 assert( iStart>=LOGSUMMARY_LOCK_OFFSET
774 && iStart+nByte<=LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED
775 );
776#if defined(SQLITE_DEBUG) && defined(SQLITE_OS_UNIX)
777 if( pSummary->aData ) memset(&((u8*)pSummary->aData)[iStart], op, nByte);
778#endif
dan3de777f2010-04-17 12:31:37 +0000779
780 memset(&f, 0, sizeof(f));
781 f.l_type = aType[op];
782 f.l_whence = SEEK_SET;
783 f.l_start = iStart;
784 f.l_len = nByte;
785 rc = fcntl(pSummary->fd, aOp[op], &f);
786 return (rc==0) ? SQLITE_OK : SQLITE_BUSY;
787}
788
789static int logLockRegion(Log *pLog, u32 mRegion, int op){
790 LogSummary *pSummary = pLog->pSummary;
791 LogLock *p; /* Used to iterate through in-process locks */
792 u32 mOther; /* Locks held by other connections */
793 u32 mNew; /* New mask for pLog */
794
795 assert(
796 /* Writer lock operations */
797 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
798 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
799
800 /* Normal reader lock operations */
801 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
802 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
803 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
804
805 /* Region D reader lock operations */
806 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
807 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
808 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
809
810 /* Checkpointer lock operations */
811 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
812 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
813 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
814 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
815 );
816
817 /* Assert that a connection never tries to go from an EXCLUSIVE to a
818 ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
819 ** happens though (when a region D reader upgrades to a writer).
820 */
821 assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
822
823 sqlite3_mutex_enter(pSummary->mutex);
824
825 /* Calculate a mask of logs held by all connections in this process apart
826 ** from this one. The least significant byte of the mask contains a mask
827 ** of the SHARED logs held. The next least significant byte of the mask
828 ** indicates the EXCLUSIVE locks held. For example, to test if some other
829 ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
830 ** on region C, do:
831 **
832 ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
833 ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
834 **
835 ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
836 ** corresponding bit in the SHARED mask.
837 */
838 mOther = 0;
839 for(p=pSummary->pLock; p; p=p->pNext){
840 assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
841 if( p!=&pLog->lock ){
842 mOther |= p->mLock;
843 }
844 }
845
846 /* If this call is to lock a region (not to unlock one), test if locks held
847 ** by any other connection in this process prevent the new locks from
848 ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
849 */
850 if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
851 sqlite3_mutex_leave(pSummary->mutex);
852 return SQLITE_BUSY;
853 }
854
855 /* Figure out the new log mask for this connection. */
856 switch( op ){
857 case LOG_UNLOCK:
858 mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
859 break;
860 case LOG_RDLOCK:
861 mNew = (pLog->lock.mLock | mRegion);
862 break;
863 default:
864 assert( op==LOG_WRLOCK );
865 mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
866 break;
867 }
868
869 /* Now modify the locks held on the log-summary file descriptor. This
870 ** file descriptor is shared by all log connections in this process.
871 ** Therefore:
872 **
873 ** + If one or more log connections in this process hold a SHARED lock
874 ** on a region, the file-descriptor should hold a SHARED lock on
875 ** the file region.
876 **
877 ** + If a log connection in this process holds an EXCLUSIVE lock on a
878 ** region, the file-descriptor should also hold an EXCLUSIVE lock on
879 ** the region in question.
880 **
881 ** If this is an LOG_UNLOCK operation, only regions for which no other
882 ** connection holds a lock should actually be unlocked. And if this
883 ** is a LOG_RDLOCK operation and other connections already hold all
884 ** the required SHARED locks, then no system call is required.
885 */
886 if( op==LOG_UNLOCK ){
887 mRegion = (mRegion & ~mOther);
888 }
889 if( (op==LOG_WRLOCK)
890 || (op==LOG_UNLOCK && mRegion)
891 || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
892 ){
893 struct LockMap {
894 int iStart; /* Byte offset to start locking operation */
895 int iLen; /* Length field for locking operation */
896 } aMap[] = {
danff207012010-04-24 04:49:15 +0000897 /* 0000 */ {0, 0}, /* 0001 */ {3+LOG_LOCK_REGION, 1},
898 /* 0010 */ {2+LOG_LOCK_REGION, 1}, /* 0011 */ {2+LOG_LOCK_REGION, 2},
899 /* 0100 */ {1+LOG_LOCK_REGION, 1}, /* 0101 */ {0, 0},
900 /* 0110 */ {1+LOG_LOCK_REGION, 2}, /* 0111 */ {1+LOG_LOCK_REGION, 3},
901 /* 1000 */ {0+LOG_LOCK_REGION, 1}, /* 1001 */ {0, 0},
dan3de777f2010-04-17 12:31:37 +0000902 /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
danff207012010-04-24 04:49:15 +0000903 /* 1100 */ {0+LOG_LOCK_REGION, 2}, /* 1101 */ {0, 0},
dan3de777f2010-04-17 12:31:37 +0000904 /* 1110 */ {0, 0}, /* 1111 */ {0, 0}
905 };
906 int rc; /* Return code of logLockFd() */
907
908 assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
909
910 rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
911 if( rc!=0 ){
912 sqlite3_mutex_leave(pSummary->mutex);
913 return rc;
914 }
915 }
916
917 pLog->lock.mLock = mNew;
918 sqlite3_mutex_leave(pSummary->mutex);
919 return SQLITE_OK;
920}
921
dan8d22a172010-04-19 18:03:51 +0000922/*
923** Lock the DMH region, either with an EXCLUSIVE or SHARED lock. This
924** function is never called with LOG_UNLOCK - the only way the DMH region
925** is every completely unlocked is by by closing the file descriptor.
926*/
dan3de777f2010-04-17 12:31:37 +0000927static int logLockDMH(LogSummary *pSummary, int eLock){
dan8d22a172010-04-19 18:03:51 +0000928 assert( sqlite3_mutex_held(pSummary->mutex) );
dan3de777f2010-04-17 12:31:37 +0000929 assert( eLock==LOG_RDLOCK || eLock==LOG_WRLOCK );
930 return logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
931}
932
dan8d22a172010-04-19 18:03:51 +0000933/*
934** Lock (or unlock) the MUTEX region. It is always locked using an
935** EXCLUSIVE, blocking lock.
936*/
dan3de777f2010-04-17 12:31:37 +0000937static int logLockMutex(LogSummary *pSummary, int eLock){
dan8d22a172010-04-19 18:03:51 +0000938 assert( sqlite3_mutex_held(pSummary->mutex) );
dan3de777f2010-04-17 12:31:37 +0000939 assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
940 logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
941 return SQLITE_OK;
942}
943
dan7c246102010-04-12 19:00:29 +0000944/*
945** This function intializes the connection to the log-summary identified
946** by struct pSummary.
947*/
dan3de777f2010-04-17 12:31:37 +0000948static int logSummaryInit(
949 LogSummary *pSummary, /* Log summary object to initialize */
950 sqlite3_file *pFd /* File descriptor open on log file */
951){
dan7c246102010-04-12 19:00:29 +0000952 int rc; /* Return Code */
953 char *zFile; /* File name for summary file */
954
955 assert( pSummary->fd<0 );
956 assert( pSummary->aData==0 );
957 assert( pSummary->nRef>0 );
958 assert( pSummary->zPath );
959
960 /* Open a file descriptor on the summary file. */
961 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
962 if( !zFile ){
963 return SQLITE_NOMEM;
964 }
965 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
966 sqlite3_free(zFile);
967 if( pSummary->fd<0 ){
968 return SQLITE_IOERR;
969 }
970
dan3de777f2010-04-17 12:31:37 +0000971 /* Grab an exclusive lock the summary file. Then mmap() it.
972 **
973 ** TODO: This code needs to be enhanced to support a growable mapping.
974 ** For now, just make the mapping very large to start with. The
975 ** pages should not be allocated until they are first accessed anyhow,
976 ** so using a large mapping consumes no more resources than a smaller
977 ** one would.
dan7c246102010-04-12 19:00:29 +0000978 */
dan3de777f2010-04-17 12:31:37 +0000979 assert( sqlite3_mutex_held(pSummary->mutex) );
980 rc = logLockMutex(pSummary, LOG_WRLOCKW);
dan7c246102010-04-12 19:00:29 +0000981 if( rc!=SQLITE_OK ) return rc;
dance4f05f2010-04-22 19:14:13 +0000982 rc = logSummaryMap(pSummary, LOGSUMMARY_MMAP_INCREMENT);
dan7c246102010-04-12 19:00:29 +0000983 if( rc!=SQLITE_OK ) goto out;
984
dan3de777f2010-04-17 12:31:37 +0000985 /* Try to obtain an EXCLUSIVE lock on the dead-mans-hand region. If this
986 ** is possible, the contents of the log-summary file (if any) may not
987 ** be trusted. Zero the log-summary header before continuing.
dan7c246102010-04-12 19:00:29 +0000988 */
dan3de777f2010-04-17 12:31:37 +0000989 rc = logLockDMH(pSummary, LOG_WRLOCK);
dan7c246102010-04-12 19:00:29 +0000990 if( rc==SQLITE_OK ){
dan7c246102010-04-12 19:00:29 +0000991 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
dan3de777f2010-04-17 12:31:37 +0000992 }
993 rc = logLockDMH(pSummary, LOG_RDLOCK);
994 if( rc!=SQLITE_OK ){
dan8d22a172010-04-19 18:03:51 +0000995 rc = SQLITE_IOERR;
dan7c246102010-04-12 19:00:29 +0000996 }
997
998 out:
dan3de777f2010-04-17 12:31:37 +0000999 logLockMutex(pSummary, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001000 return rc;
1001}
1002
1003/*
1004** Open a connection to the log file associated with database zDb. The
1005** database file does not actually have to exist. zDb is used only to
1006** figure out the name of the log file to open. If the log file does not
1007** exist it is created by this call.
dan3de777f2010-04-17 12:31:37 +00001008**
1009** A SHARED lock should be held on the database file when this function
1010** is called. The purpose of this SHARED lock is to prevent any other
1011** client from unlinking the log or log-summary file. If another process
1012** were to do this just after this client opened one of these files, the
1013** system would be badly broken.
dan7c246102010-04-12 19:00:29 +00001014*/
drhc438efd2010-04-26 00:19:45 +00001015int sqlite3WalOpen(
dan7c246102010-04-12 19:00:29 +00001016 sqlite3_vfs *pVfs, /* vfs module to open log file with */
1017 const char *zDb, /* Name of database file */
1018 Log **ppLog /* OUT: Allocated Log handle */
1019){
danb9bf16b2010-04-14 11:23:30 +00001020 int rc = SQLITE_OK; /* Return Code */
dan7c246102010-04-12 19:00:29 +00001021 Log *pRet; /* Object to allocate and return */
1022 LogSummary *pSummary = 0; /* Summary object */
1023 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
1024 int flags; /* Flags passed to OsOpen() */
1025 char *zWal = 0; /* Path to WAL file */
1026 int nWal; /* Length of zWal in bytes */
1027
dan7c246102010-04-12 19:00:29 +00001028 assert( zDb );
dan7c246102010-04-12 19:00:29 +00001029
1030 /* Allocate an instance of struct Log to return. */
dan3de777f2010-04-17 12:31:37 +00001031 *ppLog = 0;
dan7c246102010-04-12 19:00:29 +00001032 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
1033 if( !pRet ) goto out;
1034 pRet->pVfs = pVfs;
1035 pRet->pFd = (sqlite3_file *)&pRet[1];
dan7c246102010-04-12 19:00:29 +00001036
1037 /* Normalize the path name. */
1038 zWal = sqlite3_mprintf("%s-wal", zDb);
1039 if( !zWal ) goto out;
1040 logNormalizePath(zWal);
dan67032392010-04-17 15:42:43 +00001041 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
dan7c246102010-04-12 19:00:29 +00001042 nWal = sqlite3Strlen30(zWal);
1043
1044 /* Enter the mutex that protects the linked-list of LogSummary structures */
1045 if( sqlite3GlobalConfig.bCoreMutex ){
1046 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
1047 }
1048 sqlite3_mutex_enter(mutex);
1049
1050 /* Search for an existing log summary object in the linked list. If one
1051 ** cannot be found, allocate and initialize a new object.
1052 */
1053 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
1054 int nPath = sqlite3Strlen30(pSummary->zPath);
1055 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
1056 }
1057 if( !pSummary ){
1058 int nByte = sizeof(LogSummary) + nWal + 1;
1059 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
1060 if( !pSummary ){
1061 rc = SQLITE_NOMEM;
1062 goto out;
1063 }
1064 if( sqlite3GlobalConfig.bCoreMutex ){
1065 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
1066 }
1067 pSummary->zPath = (char *)&pSummary[1];
1068 pSummary->fd = -1;
1069 memcpy(pSummary->zPath, zWal, nWal);
1070 pSummary->pNext = pLogSummary;
1071 pLogSummary = pSummary;
1072 }
1073 pSummary->nRef++;
1074 pRet->pSummary = pSummary;
1075
1076 /* Exit the mutex protecting the linked-list of LogSummary objects. */
1077 sqlite3_mutex_leave(mutex);
1078 mutex = 0;
1079
1080 /* Open file handle on the log file. */
1081 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
1082 if( rc!=SQLITE_OK ) goto out;
1083
1084 /* Object pSummary is shared between all connections to the database made
1085 ** by this process. So at this point it may or may not be connected to
dan3de777f2010-04-17 12:31:37 +00001086 ** the log-summary. If it is not, connect it.
dan7c246102010-04-12 19:00:29 +00001087 */
1088 sqlite3_mutex_enter(pSummary->mutex);
1089 mutex = pSummary->mutex;
1090 if( pSummary->fd<0 ){
1091 rc = logSummaryInit(pSummary, pRet->pFd);
dan7c246102010-04-12 19:00:29 +00001092 }
1093
dan64d039e2010-04-13 19:27:31 +00001094 pRet->lock.pNext = pSummary->pLock;
1095 pSummary->pLock = &pRet->lock;
1096
dan7c246102010-04-12 19:00:29 +00001097 out:
1098 sqlite3_mutex_leave(mutex);
1099 sqlite3_free(zWal);
1100 if( rc!=SQLITE_OK ){
1101 assert(0);
1102 if( pRet ){
1103 sqlite3OsClose(pRet->pFd);
1104 sqlite3_free(pRet);
1105 }
1106 assert( !pSummary || pSummary->nRef==0 );
1107 sqlite3_free(pSummary);
1108 }
1109 *ppLog = pRet;
1110 return rc;
1111}
1112
dan4a4b01d2010-04-16 11:30:18 +00001113static int logIteratorNext(
1114 LogIterator *p, /* Iterator */
dan7c246102010-04-12 19:00:29 +00001115 u32 *piPage, /* OUT: Next db page to write */
1116 u32 *piFrame /* OUT: Log frame to read from */
1117){
1118 u32 iMin = *piPage;
1119 u32 iRet = 0xFFFFFFFF;
1120 int i;
1121 int nBlock = p->nFinal;
1122
1123 for(i=p->nSegment-1; i>=0; i--){
1124 struct LogSegment *pSegment = &p->aSegment[i];
1125 while( pSegment->iNext<nBlock ){
1126 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
1127 if( iPg>iMin ){
1128 if( iPg<iRet ){
1129 iRet = iPg;
1130 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
1131 }
1132 break;
1133 }
1134 pSegment->iNext++;
1135 }
1136
1137 nBlock = 256;
1138 }
1139
1140 *piPage = iRet;
1141 return (iRet==0xFFFFFFFF);
1142}
1143
dan4a4b01d2010-04-16 11:30:18 +00001144static LogIterator *logIteratorInit(Log *pLog){
dan7c246102010-04-12 19:00:29 +00001145 u32 *aData = pLog->pSummary->aData;
dan4a4b01d2010-04-16 11:30:18 +00001146 LogIterator *p; /* Return value */
dan7c246102010-04-12 19:00:29 +00001147 int nSegment; /* Number of segments to merge */
1148 u32 iLast; /* Last frame in log */
1149 int nByte; /* Number of bytes to allocate */
1150 int i; /* Iterator variable */
1151 int nFinal; /* Number of unindexed entries */
1152 struct LogSegment *pFinal; /* Final (unindexed) segment */
1153 u8 *aTmp; /* Temp space used by merge-sort */
1154
1155 iLast = pLog->hdr.iLastPg;
1156 nSegment = (iLast >> 8) + 1;
1157 nFinal = (iLast & 0x000000FF);
1158
dan4a4b01d2010-04-16 11:30:18 +00001159 nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
1160 p = (LogIterator *)sqlite3_malloc(nByte);
dan7c246102010-04-12 19:00:29 +00001161 if( p ){
1162 memset(p, 0, nByte);
1163 p->nSegment = nSegment;
1164 p->nFinal = nFinal;
1165 }
1166
1167 for(i=0; i<nSegment-1; i++){
1168 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
1169 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
1170 }
1171 pFinal = &p->aSegment[nSegment-1];
1172
1173 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
1174 pFinal->aIndex = (u8 *)&pFinal[1];
1175 aTmp = &pFinal->aIndex[256];
1176 for(i=0; i<nFinal; i++){
1177 pFinal->aIndex[i] = i;
1178 }
1179 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
1180 p->nFinal = nFinal;
1181
1182 return p;
1183}
1184
1185/*
dan4a4b01d2010-04-16 11:30:18 +00001186** Free a log iterator allocated by logIteratorInit().
dan7c246102010-04-12 19:00:29 +00001187*/
dan4a4b01d2010-04-16 11:30:18 +00001188static void logIteratorFree(LogIterator *p){
dan7c246102010-04-12 19:00:29 +00001189 sqlite3_free(p);
1190}
1191
1192/*
1193** Checkpoint the contents of the log file.
1194*/
1195static int logCheckpoint(
1196 Log *pLog, /* Log connection */
1197 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001198 int sync_flags, /* Flags for OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001199 u8 *zBuf /* Temporary buffer to use */
1200){
1201 int rc; /* Return code */
1202 int pgsz = pLog->hdr.pgsz; /* Database page-size */
dan4a4b01d2010-04-16 11:30:18 +00001203 LogIterator *pIter = 0; /* Log iterator context */
dan7c246102010-04-12 19:00:29 +00001204 u32 iDbpage = 0; /* Next database page to write */
danb9bf16b2010-04-14 11:23:30 +00001205 u32 iFrame = 0; /* Log frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +00001206
danbb2e9c92010-04-15 13:33:18 +00001207 if( pLog->hdr.iLastPg==0 ){
1208 return SQLITE_OK;
1209 }
1210
dan7c246102010-04-12 19:00:29 +00001211 /* Allocate the iterator */
dan4a4b01d2010-04-16 11:30:18 +00001212 pIter = logIteratorInit(pLog);
dan7c246102010-04-12 19:00:29 +00001213 if( !pIter ) return SQLITE_NOMEM;
1214
1215 /* Sync the log file to disk */
danc5118782010-04-17 17:34:41 +00001216 if( sync_flags ){
1217 rc = sqlite3OsSync(pLog->pFd, sync_flags);
1218 if( rc!=SQLITE_OK ) goto out;
1219 }
dan7c246102010-04-12 19:00:29 +00001220
1221 /* Iterate through the contents of the log, copying data to the db file. */
dan4a4b01d2010-04-16 11:30:18 +00001222 while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
dan7c246102010-04-12 19:00:29 +00001223 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
dan97a31352010-04-16 13:59:31 +00001224 logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE
dan7c246102010-04-12 19:00:29 +00001225 );
1226 if( rc!=SQLITE_OK ) goto out;
1227 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
1228 if( rc!=SQLITE_OK ) goto out;
1229 }
1230
1231 /* Truncate the database file */
1232 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
1233 if( rc!=SQLITE_OK ) goto out;
1234
1235 /* Sync the database file. If successful, update the log-summary. */
danc5118782010-04-17 17:34:41 +00001236 if( sync_flags ){
1237 rc = sqlite3OsSync(pFd, sync_flags);
1238 if( rc!=SQLITE_OK ) goto out;
1239 }
dan7c246102010-04-12 19:00:29 +00001240 pLog->hdr.iLastPg = 0;
1241 pLog->hdr.iCheck1 = 2;
1242 pLog->hdr.iCheck2 = 3;
1243 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1244
1245 /* TODO: If a crash occurs and the current log is copied into the
1246 ** database there is no problem. However, if a crash occurs while
1247 ** writing the next transaction into the start of the log, such that:
1248 **
1249 ** * The first transaction currently in the log is left intact, but
1250 ** * The second (or subsequent) transaction is damaged,
1251 **
1252 ** then the database could become corrupt.
1253 **
1254 ** The easiest thing to do would be to write and sync a dummy header
1255 ** into the log at this point. Unfortunately, that turns out to be
1256 ** an unwelcome performance hit. Alternatives are...
1257 */
1258#if 0
1259 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
1260 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
1261 if( rc!=SQLITE_OK ) goto out;
1262 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1263#endif
1264
1265 out:
dan4a4b01d2010-04-16 11:30:18 +00001266 logIteratorFree(pIter);
dan7c246102010-04-12 19:00:29 +00001267 return rc;
1268}
1269
1270/*
1271** Close a connection to a log file.
1272*/
drhc438efd2010-04-26 00:19:45 +00001273int sqlite3WalClose(
dan7c246102010-04-12 19:00:29 +00001274 Log *pLog, /* Log to close */
1275 sqlite3_file *pFd, /* Database file */
danc5118782010-04-17 17:34:41 +00001276 int sync_flags, /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001277 u8 *zBuf /* Buffer of at least page-size bytes */
1278){
1279 int rc = SQLITE_OK;
1280 if( pLog ){
dan64d039e2010-04-13 19:27:31 +00001281 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +00001282 LogSummary *pSummary = pLog->pSummary;
1283 sqlite3_mutex *mutex = 0;
1284
dan64d039e2010-04-13 19:27:31 +00001285 sqlite3_mutex_enter(pSummary->mutex);
1286 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
1287 *ppL = pLog->lock.pNext;
1288 sqlite3_mutex_leave(pSummary->mutex);
1289
dan7c246102010-04-12 19:00:29 +00001290 if( sqlite3GlobalConfig.bCoreMutex ){
1291 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
1292 }
1293 sqlite3_mutex_enter(mutex);
1294
1295 /* Decrement the reference count on the log summary. If this is the last
1296 ** reference to the log summary object in this process, the object will
1297 ** be freed. If this is also the last connection to the database, then
1298 ** checkpoint the database and truncate the log and log-summary files
1299 ** to zero bytes in size.
1300 **/
1301 pSummary->nRef--;
1302 if( pSummary->nRef==0 ){
dan3de777f2010-04-17 12:31:37 +00001303 int rc;
dan7c246102010-04-12 19:00:29 +00001304 LogSummary **pp;
dan7c246102010-04-12 19:00:29 +00001305 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
1306 *pp = (*pp)->pNext;
dan3de777f2010-04-17 12:31:37 +00001307
1308 sqlite3_mutex_leave(mutex);
1309
1310 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
1311 if( rc==SQLITE_OK ){
1312
1313 /* This is the last connection to the database (including other
1314 ** processes). Do three things:
1315 **
1316 ** 1. Checkpoint the db.
1317 ** 2. Truncate the log file.
1318 ** 3. Unlink the log-summary file.
1319 */
danc5118782010-04-17 17:34:41 +00001320 rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
dan3de777f2010-04-17 12:31:37 +00001321 if( rc==SQLITE_OK ){
1322 rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0);
1323 }
1324
1325 logSummaryUnmap(pSummary, 1);
1326 }else{
1327 if( rc==SQLITE_BUSY ){
1328 rc = SQLITE_OK;
1329 }
1330 logSummaryUnmap(pSummary, 0);
1331 }
dan3de777f2010-04-17 12:31:37 +00001332
dan7c246102010-04-12 19:00:29 +00001333 sqlite3_mutex_free(pSummary->mutex);
1334 sqlite3_free(pSummary);
dan3de777f2010-04-17 12:31:37 +00001335 }else{
1336 sqlite3_mutex_leave(mutex);
dan7c246102010-04-12 19:00:29 +00001337 }
1338
dan7c246102010-04-12 19:00:29 +00001339 /* Close the connection to the log file and free the Log handle. */
1340 sqlite3OsClose(pLog->pFd);
1341 sqlite3_free(pLog);
1342 }
1343 return rc;
1344}
1345
1346/*
dan7c246102010-04-12 19:00:29 +00001347** Enter and leave the log-summary mutex. In this context, entering the
1348** log-summary mutex means:
1349**
1350** 1. Obtaining mutex pLog->pSummary->mutex, and
1351** 2. Taking an exclusive lock on the log-summary file.
1352**
1353** i.e. this mutex locks out other processes as well as other threads
1354** hosted in this address space.
1355*/
1356static int logEnterMutex(Log *pLog){
1357 LogSummary *pSummary = pLog->pSummary;
1358 int rc;
1359
1360 sqlite3_mutex_enter(pSummary->mutex);
dan3de777f2010-04-17 12:31:37 +00001361 rc = logLockMutex(pSummary, LOG_WRLOCKW);
dan7c246102010-04-12 19:00:29 +00001362 if( rc!=SQLITE_OK ){
1363 sqlite3_mutex_leave(pSummary->mutex);
1364 }
1365 return rc;
1366}
1367static void logLeaveMutex(Log *pLog){
1368 LogSummary *pSummary = pLog->pSummary;
dan3de777f2010-04-17 12:31:37 +00001369 logLockMutex(pSummary, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001370 sqlite3_mutex_leave(pSummary->mutex);
1371}
1372
1373/*
danb9bf16b2010-04-14 11:23:30 +00001374** Try to read the log-summary header. Attempt to verify the header
1375** checksum. If the checksum can be verified, copy the log-summary
1376** header into structure pLog->hdr. If the contents of pLog->hdr are
1377** modified by this and pChanged is not NULL, set *pChanged to 1.
1378** Otherwise leave *pChanged unmodified.
1379**
1380** If the checksum cannot be verified return SQLITE_ERROR.
1381*/
1382int logSummaryTryHdr(Log *pLog, int *pChanged){
1383 u32 aCksum[2] = {1, 1};
1384 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1385
dancd11fb22010-04-26 10:40:52 +00001386 /* Read the header. The caller may or may not have locked the log-summary
1387 ** file, meaning it is possible that an inconsistent snapshot is read
1388 ** from the file. If this happens, return SQLITE_ERROR. The caller will
1389 ** retry. Or, if the caller has already locked the file and the header
1390 ** still looks inconsistent, it will run recovery.
danb9bf16b2010-04-14 11:23:30 +00001391 */
1392 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1393 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1394 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1395 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1396 ){
1397 return SQLITE_ERROR;
1398 }
1399
1400 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1401 if( pChanged ){
1402 *pChanged = 1;
1403 }
1404 memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
1405 }
1406 return SQLITE_OK;
1407}
1408
1409/*
1410** Read the log-summary header from the log-summary file into structure
1411** pLog->hdr. If attempting to verify the header checksum fails, try
1412** to recover the log before returning.
1413**
1414** If the log-summary header is successfully read, return SQLITE_OK.
1415** Otherwise an SQLite error code.
1416*/
1417int logSummaryReadHdr(Log *pLog, int *pChanged){
1418 int rc;
1419
1420 /* First try to read the header without a lock. Verify the checksum
1421 ** before returning. This will almost always work.
dancd11fb22010-04-26 10:40:52 +00001422 **
1423 ** TODO: Doing this causes a race-condition with the code that resizes
1424 ** the mapping. Unless Log.pSummary->mutex is held, it is possible that
1425 ** LogSummary.aData is invalid.
danb9bf16b2010-04-14 11:23:30 +00001426 */
dancd11fb22010-04-26 10:40:52 +00001427#if 0
danb9bf16b2010-04-14 11:23:30 +00001428 if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
1429 return SQLITE_OK;
1430 }
dancd11fb22010-04-26 10:40:52 +00001431#endif
danb9bf16b2010-04-14 11:23:30 +00001432
1433 /* If the first attempt to read the header failed, lock the log-summary
1434 ** file and try again. If the header checksum verification fails this
1435 ** time as well, run log recovery.
1436 */
1437 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1438 if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
1439 if( pChanged ){
1440 *pChanged = 1;
1441 }
1442 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1443 if( rc==SQLITE_OK ){
1444 rc = logSummaryTryHdr(pLog, 0);
1445 }
1446 }
1447 logLeaveMutex(pLog);
1448 }
1449
1450 return rc;
1451}
1452
1453/*
dan64d039e2010-04-13 19:27:31 +00001454** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001455**
1456** If this call obtains a new read-lock and the database contents have been
1457** modified since the most recent call to LogCloseSnapshot() on this Log
1458** connection, then *pChanged is set to 1 before returning. Otherwise, it
1459** is left unmodified. This is used by the pager layer to determine whether
1460** or not any cached pages may be safely reused.
1461*/
drhc438efd2010-04-26 00:19:45 +00001462int sqlite3WalOpenSnapshot(Log *pLog, int *pChanged){
dan7c246102010-04-12 19:00:29 +00001463 int rc = SQLITE_OK;
1464 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001465 int nAttempt;
1466
1467 /* Obtain a snapshot-lock on the log-summary file. The procedure
1468 ** for obtaining the snapshot log is:
1469 **
1470 ** 1. Attempt a SHARED lock on regions A and B.
1471 ** 2a. If step 1 is successful, drop the lock on region B.
1472 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1473 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1474 ** successful.
1475 **
1476 ** If neither of the locks can be obtained after 5 tries, presumably
1477 ** something is wrong (i.e. a process not following the locking protocol).
1478 ** Return an error code in this case.
1479 */
1480 rc = SQLITE_BUSY;
1481 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1482 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1483 if( rc==SQLITE_BUSY ){
1484 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1485 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1486 }else{
1487 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1488 pLog->isLocked = LOG_REGION_A;
1489 }
1490 }
1491 if( rc!=SQLITE_OK ){
1492 return rc;
1493 }
1494
danb9bf16b2010-04-14 11:23:30 +00001495 rc = logSummaryReadHdr(pLog, pChanged);
dan64d039e2010-04-13 19:27:31 +00001496 if( rc!=SQLITE_OK ){
1497 /* An error occured while attempting log recovery. */
drhc438efd2010-04-26 00:19:45 +00001498 sqlite3WalCloseSnapshot(pLog);
dan64d039e2010-04-13 19:27:31 +00001499 }
dan7c246102010-04-12 19:00:29 +00001500 }
1501 return rc;
1502}
1503
1504/*
1505** Unlock the current snapshot.
1506*/
drhc438efd2010-04-26 00:19:45 +00001507void sqlite3WalCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001508 if( pLog->isLocked ){
1509 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1510 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1511 }
dan7c246102010-04-12 19:00:29 +00001512 pLog->isLocked = 0;
1513}
1514
dan7c246102010-04-12 19:00:29 +00001515/*
1516** Read a page from the log, if it is present.
1517*/
drhc438efd2010-04-26 00:19:45 +00001518int sqlite3WalRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
dan7c246102010-04-12 19:00:29 +00001519 u32 iRead = 0;
dancd11fb22010-04-26 10:40:52 +00001520 u32 *aData;
dan7c246102010-04-12 19:00:29 +00001521 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1522
dan39c79f52010-04-15 10:58:51 +00001523 assert( pLog->isLocked );
1524
dancd11fb22010-04-26 10:40:52 +00001525 sqlite3_mutex_enter(pLog->pSummary->mutex);
1526 aData = pLog->pSummary->aData;
1527
dan7c246102010-04-12 19:00:29 +00001528 /* Do a linear search of the unindexed block of page-numbers (if any)
1529 ** at the end of the log-summary. An alternative to this would be to
1530 ** build an index in private memory each time a read transaction is
1531 ** opened on a new snapshot.
1532 */
1533 if( pLog->hdr.iLastPg ){
1534 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1535 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1536 while( *pi!=pgno && pi!=piStop ) pi--;
1537 if( pi!=piStop ){
1538 iRead = (pi-piStop) + iFrame;
1539 }
1540 }
1541 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1542
1543 while( iRead==0 && iFrame>0 ){
1544 int iLow = 0;
1545 int iHigh = 255;
1546 u32 *aFrame;
1547 u8 *aIndex;
1548
1549 iFrame -= 256;
1550 aFrame = &aData[logSummaryEntry(iFrame+1)];
1551 aIndex = (u8 *)&aFrame[256];
1552
1553 while( iLow<=iHigh ){
1554 int iTest = (iLow+iHigh)>>1;
1555 u32 iPg = aFrame[aIndex[iTest]];
1556
1557 if( iPg==pgno ){
1558 iRead = iFrame + 1 + aIndex[iTest];
1559 break;
1560 }
1561 else if( iPg<pgno ){
1562 iLow = iTest+1;
1563 }else{
1564 iHigh = iTest-1;
1565 }
1566 }
1567 }
1568 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1569
dancd11fb22010-04-26 10:40:52 +00001570 sqlite3_mutex_leave(pLog->pSummary->mutex);
1571
dan7c246102010-04-12 19:00:29 +00001572 /* If iRead is non-zero, then it is the log frame number that contains the
1573 ** required page. Read and return data from the log file.
1574 */
1575 if( iRead ){
dan97a31352010-04-16 13:59:31 +00001576 i64 iOffset = logFrameOffset(iRead, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
dan7c246102010-04-12 19:00:29 +00001577 *pInLog = 1;
1578 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1579 }
1580
1581 *pInLog = 0;
1582 return SQLITE_OK;
1583}
1584
1585
1586/*
1587** Set *pPgno to the size of the database file (or zero, if unknown).
1588*/
drhc438efd2010-04-26 00:19:45 +00001589void sqlite3WalDbsize(Log *pLog, Pgno *pPgno){
dan7c246102010-04-12 19:00:29 +00001590 assert( pLog->isLocked );
1591 *pPgno = pLog->hdr.nPage;
1592}
1593
1594/*
dan7c246102010-04-12 19:00:29 +00001595** This function returns SQLITE_OK if the caller may write to the database.
1596** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001597** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001598*/
drhc438efd2010-04-26 00:19:45 +00001599int sqlite3WalWriteLock(Log *pLog, int op){
dan7c246102010-04-12 19:00:29 +00001600 assert( pLog->isLocked );
1601 if( op ){
dan64d039e2010-04-13 19:27:31 +00001602
1603 /* Obtain the writer lock */
1604 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1605 if( rc!=SQLITE_OK ){
1606 return rc;
1607 }
1608
dan39c79f52010-04-15 10:58:51 +00001609 /* If this is connection is a region D reader, then the SHARED lock on
1610 ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
1611 ** held on region A. This means that if the write-transaction is committed
dan49320f82010-04-14 18:50:08 +00001612 ** and this connection downgrades to a reader, it will be left with no
dan39c79f52010-04-15 10:58:51 +00001613 ** lock at all. And so its snapshot could get clobbered by a checkpoint
dan49320f82010-04-14 18:50:08 +00001614 ** operation.
1615 **
1616 ** To stop this from happening, grab a SHARED lock on region A now.
1617 ** This should always be successful, as the only time a client holds
1618 ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
1619 ** lock on region C (a checkpointer does this). This is not possible,
1620 ** as this connection currently has the EXCLUSIVE lock on region C.
dan02bb5962010-04-14 15:49:40 +00001621 */
dan49320f82010-04-14 18:50:08 +00001622 if( pLog->isLocked==LOG_REGION_D ){
1623 logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
1624 pLog->isLocked = LOG_REGION_A;
1625 }
dan02bb5962010-04-14 15:49:40 +00001626
dan39c79f52010-04-15 10:58:51 +00001627 /* If this connection is not reading the most recent database snapshot,
1628 ** it is not possible to write to the database. In this case release
1629 ** the write locks and return SQLITE_BUSY.
1630 */
dan7c246102010-04-12 19:00:29 +00001631 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
dan49320f82010-04-14 18:50:08 +00001632 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001633 return SQLITE_BUSY;
1634 }
1635 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001636
dan7c246102010-04-12 19:00:29 +00001637 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001638 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001639 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1640 pLog->isWriteLocked = 0;
1641 }
1642 return SQLITE_OK;
1643}
1644
dan74d6cd82010-04-24 18:44:05 +00001645/*
1646** The log handle passed to this function must be holding the write-lock.
1647**
1648** If any data has been written (but not committed) to the log file, this
1649** function moves the write-pointer back to the start of the transaction.
1650**
1651** Additionally, the callback function is invoked for each frame written
1652** to the log since the start of the transaction. If the callback returns
1653** other than SQLITE_OK, it is not invoked again and the error code is
1654** returned to the caller.
1655**
1656** Otherwise, if the callback function does not return an error, this
1657** function returns SQLITE_OK.
1658*/
drhc438efd2010-04-26 00:19:45 +00001659int sqlite3WalUndo(Log *pLog, int (*xUndo)(void *, Pgno), void *pUndoCtx){
dan74d6cd82010-04-24 18:44:05 +00001660 int rc = SQLITE_OK;
1661 Pgno iMax = pLog->hdr.iLastPg;
1662 Pgno iFrame;
1663
1664 assert( pLog->isWriteLocked );
1665 logSummaryReadHdr(pLog, 0);
1666 for(iFrame=pLog->hdr.iLastPg+1; iFrame<=iMax && rc==SQLITE_OK; iFrame++){
1667 rc = xUndo(pUndoCtx, pLog->pSummary->aData[logSummaryEntry(iFrame)]);
1668 }
1669 return rc;
1670}
1671
dan7c246102010-04-12 19:00:29 +00001672/*
dan3306c4a2010-04-23 19:15:00 +00001673** Return true if data has been written but not committed to the log file.
1674*/
drhc438efd2010-04-26 00:19:45 +00001675int sqlite3WalDirty(Log *pLog){
dan3306c4a2010-04-23 19:15:00 +00001676 assert( pLog->isWriteLocked );
1677 return( pLog->hdr.iLastPg!=((LogSummaryHdr*)pLog->pSummary->aData)->iLastPg );
1678}
1679
1680/*
dan7c246102010-04-12 19:00:29 +00001681** Write a set of frames to the log. The caller must hold at least a
1682** RESERVED lock on the database file.
1683*/
drhc438efd2010-04-26 00:19:45 +00001684int sqlite3WalFrames(
dan7c246102010-04-12 19:00:29 +00001685 Log *pLog, /* Log handle to write to */
1686 int nPgsz, /* Database page-size in bytes */
1687 PgHdr *pList, /* List of dirty pages to write */
1688 Pgno nTruncate, /* Database size after this commit */
1689 int isCommit, /* True if this is a commit */
danc5118782010-04-17 17:34:41 +00001690 int sync_flags /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001691){
dan7c246102010-04-12 19:00:29 +00001692 int rc; /* Used to catch return codes */
1693 u32 iFrame; /* Next frame address */
dan97a31352010-04-16 13:59:31 +00001694 u8 aFrame[LOG_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
dan7c246102010-04-12 19:00:29 +00001695 PgHdr *p; /* Iterator to run through pList with. */
dan97a31352010-04-16 13:59:31 +00001696 u32 aCksum[2]; /* Checksums */
dan7c246102010-04-12 19:00:29 +00001697 PgHdr *pLast; /* Last frame in list */
1698 int nLast = 0; /* Number of extra copies of last page */
1699
dan56d95912010-04-24 19:07:29 +00001700 assert( LOG_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
dan7c246102010-04-12 19:00:29 +00001701 assert( pList );
1702
dan97a31352010-04-16 13:59:31 +00001703 /* If this is the first frame written into the log, write the log
1704 ** header to the start of the log file. See comments at the top of
1705 ** this file for a description of the log-header format.
1706 */
1707 assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
1708 iFrame = pLog->hdr.iLastPg;
1709 if( iFrame==0 ){
1710 sqlite3Put4byte(aFrame, nPgsz);
1711 sqlite3_randomness(8, &aFrame[4]);
1712 pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
1713 pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
1714 rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
1715 if( rc!=SQLITE_OK ){
1716 return rc;
1717 }
1718 }
1719
dan7c246102010-04-12 19:00:29 +00001720 aCksum[0] = pLog->hdr.iCheck1;
1721 aCksum[1] = pLog->hdr.iCheck2;
1722
1723 /* Write the log file. */
dan7c246102010-04-12 19:00:29 +00001724 for(p=pList; p; p=p->pDirty){
1725 u32 nDbsize; /* Db-size field for frame header */
1726 i64 iOffset; /* Write offset in log file */
1727
dan97a31352010-04-16 13:59:31 +00001728 iOffset = logFrameOffset(++iFrame, nPgsz);
dan7c246102010-04-12 19:00:29 +00001729
1730 /* Populate and write the frame header */
1731 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1732 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1733 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1734 if( rc!=SQLITE_OK ){
1735 return rc;
1736 }
1737
1738 /* Write the page data */
1739 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1740 if( rc!=SQLITE_OK ){
1741 return rc;
1742 }
1743 pLast = p;
1744 }
1745
1746 /* Sync the log file if the 'isSync' flag was specified. */
danc5118782010-04-17 17:34:41 +00001747 if( sync_flags ){
dan7c246102010-04-12 19:00:29 +00001748 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
dan67032392010-04-17 15:42:43 +00001749 i64 iOffset = logFrameOffset(iFrame+1, nPgsz);
1750
1751 assert( isCommit );
dan7c246102010-04-12 19:00:29 +00001752
1753 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1754 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1755 }
1756 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1757 while( iOffset<iSegment ){
1758 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1759 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1760 if( rc!=SQLITE_OK ){
1761 return rc;
1762 }
1763
1764 iOffset += LOG_FRAME_HDRSIZE;
1765 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1766 if( rc!=SQLITE_OK ){
1767 return rc;
1768 }
1769 nLast++;
1770 iOffset += nPgsz;
1771 }
dan7c246102010-04-12 19:00:29 +00001772
danc5118782010-04-17 17:34:41 +00001773 rc = sqlite3OsSync(pLog->pFd, sync_flags);
dan7c246102010-04-12 19:00:29 +00001774 if( rc!=SQLITE_OK ){
1775 return rc;
1776 }
1777 }
1778
1779 /* Append data to the log summary. It is not necessary to lock the
1780 ** log-summary to do this as the RESERVED lock held on the db file
1781 ** guarantees that there are no other writers, and no data that may
1782 ** be in use by existing readers is being overwritten.
1783 */
1784 iFrame = pLog->hdr.iLastPg;
1785 for(p=pList; p; p=p->pDirty){
1786 iFrame++;
1787 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1788 }
1789 while( nLast>0 ){
1790 iFrame++;
1791 nLast--;
1792 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1793 }
1794
1795 /* Update the private copy of the header. */
1796 pLog->hdr.pgsz = nPgsz;
1797 pLog->hdr.iLastPg = iFrame;
1798 if( isCommit ){
1799 pLog->hdr.iChange++;
1800 pLog->hdr.nPage = nTruncate;
1801 }
1802 pLog->hdr.iCheck1 = aCksum[0];
1803 pLog->hdr.iCheck2 = aCksum[1];
1804
1805 /* If this is a commit, update the log-summary header too. */
1806 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1807 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1808 logLeaveMutex(pLog);
dan8d22a172010-04-19 18:03:51 +00001809 pLog->iCallback = iFrame;
dan7c246102010-04-12 19:00:29 +00001810 }
1811
dan8d22a172010-04-19 18:03:51 +00001812 return rc;
dan7c246102010-04-12 19:00:29 +00001813}
1814
1815/*
danb9bf16b2010-04-14 11:23:30 +00001816** Checkpoint the database:
1817**
1818** 1. Wait for an EXCLUSIVE lock on regions B and C.
1819** 2. Wait for an EXCLUSIVE lock on region A.
1820** 3. Copy the contents of the log into the database file.
1821** 4. Zero the log-summary header (so new readers will ignore the log).
1822** 5. Drop the locks obtained in steps 1 and 2.
dan7c246102010-04-12 19:00:29 +00001823*/
drhc438efd2010-04-26 00:19:45 +00001824int sqlite3WalCheckpoint(
dan7c246102010-04-12 19:00:29 +00001825 Log *pLog, /* Log connection */
1826 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001827 int sync_flags, /* Flags to sync db file with (or 0) */
dan64d039e2010-04-13 19:27:31 +00001828 u8 *zBuf, /* Temporary buffer to use */
1829 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1830 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001831){
danb9bf16b2010-04-14 11:23:30 +00001832 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001833
dan39c79f52010-04-15 10:58:51 +00001834 assert( !pLog->isLocked );
1835
1836 /* Wait for an EXCLUSIVE lock on regions B and C. */
dan64d039e2010-04-13 19:27:31 +00001837 do {
1838 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1839 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1840 if( rc!=SQLITE_OK ) return rc;
1841
dan39c79f52010-04-15 10:58:51 +00001842 /* Wait for an EXCLUSIVE lock on region A. */
dan64d039e2010-04-13 19:27:31 +00001843 do {
1844 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1845 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001846 if( rc!=SQLITE_OK ){
1847 logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1848 return rc;
1849 }
dan64d039e2010-04-13 19:27:31 +00001850
danb9bf16b2010-04-14 11:23:30 +00001851 /* Copy data from the log to the database file. */
1852 rc = logSummaryReadHdr(pLog, 0);
1853 if( rc==SQLITE_OK ){
danc5118782010-04-17 17:34:41 +00001854 rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
danb9bf16b2010-04-14 11:23:30 +00001855 }
1856
1857 /* Release the locks. */
dan64d039e2010-04-13 19:27:31 +00001858 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1859 return rc;
dan7c246102010-04-12 19:00:29 +00001860}
1861
drhc438efd2010-04-26 00:19:45 +00001862int sqlite3WalCallback(Log *pLog){
dan8d22a172010-04-19 18:03:51 +00001863 u32 ret = 0;
1864 if( pLog ){
1865 ret = pLog->iCallback;
1866 pLog->iCallback = 0;
1867 }
1868 return (int)ret;
1869}