blob: 67bf9c0841b60d99aff1f59ce6c86bc1a7b97c31 [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
7#include "log.h"
8
9#include <unistd.h>
10#include <fcntl.h>
11#include <sys/mman.h>
12
/* Forward declarations for the structure types defined later in this file. */
typedef struct LogSummaryHdr LogSummaryHdr;
typedef struct LogSummary LogSummary;
typedef struct LogIterator LogIterator;
typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000017
18
/*
** The following structure may be used to store the same data that
** is stored in the log-summary header.
**
** Member variables iCheck1 and iCheck2 contain the checksum for the
** last frame written to the log, or 2 and 3 respectively if the log
** is currently empty.
**
** The structure is copied byte-for-byte into the start of the mapped
** log-summary file (see logSummaryWriteHdr()), so every member must be
** a u32 and the member order is part of the file layout.
*/
struct LogSummaryHdr {
  u32 iChange;                    /* Counter incremented each transaction */
  u32 pgsz;                       /* Database page size in bytes */
  u32 iLastPg;                    /* Address of last valid frame in log */
  u32 nPage;                      /* Size of database in pages */
  u32 iCheck1;                    /* Checksum value 1 (see above) */
  u32 iCheck2;                    /* Checksum value 2 (see above) */
};
35
/* Number of 32-bit fields in a serialized LogSummaryHdr object. */
#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

/*
** Offset, in u32 units from the start of LogSummary.aData[], of the first
** word following the header and its LOG_CKSM_BYTES byte checksum.
*/
#define LOGSUMMARY_FRAME_OFFSET \
  (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))

/* Size in bytes of a frame header (see logEncodeFrame() for the layout) */
#define LOG_FRAME_HDRSIZE 20
44
/*
** There is one instance of this structure for each log-summary object
** that this process has a connection to. They are stored in a linked
** list starting at pLogSummary (global variable).
**
** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
**       directly in this implementation because the VFS does not support
**       the required blocking file-locks.
**
** The fd member is set to -1 while no log-summary file is open. The
** aData/nData members describe the mmap()ed region created by
** logSummaryMap(); aData is 0 while nothing is mapped.
*/
struct LogSummary {
  sqlite3_mutex *mutex;           /* Mutex used to protect this object */
  int nRef;                       /* Number of pointers to this structure */
  int fd;                         /* File descriptor open on log-summary */
  char *zPath;                    /* Path to associated WAL file */
  LogLock *pLock;                 /* Linked list of locks on this object */
  LogSummary *pNext;              /* Next in global list */
  int nData;                      /* Size of aData allocation/mapping */
  u32 *aData;                     /* File body */
};
64
dan64d039e2010-04-13 19:27:31 +000065
/*
** The four lockable regions associated with each log-summary. A connection
** may take either a SHARED or EXCLUSIVE lock on each. Each region is a
** distinct bit so that several regions can be combined into one mask.
*/
#define LOG_REGION_A 0x01
#define LOG_REGION_B 0x02
#define LOG_REGION_C 0x04
#define LOG_REGION_D 0x08
74
/*
** A single instance of this structure is allocated as part of each
** connection to a database log (it is embedded in struct Log). All
** structures associated with the same log file are linked together into
** a list using LogLock.pNext starting at LogSummary.pLock.
**
** The mLock field of the structure describes the locks (if any)
** currently held by the connection. If a SHARED lock is held on
** any of the four locking regions, then the associated LOG_REGION_X
** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
** then the (LOG_REGION_X << 8) bit is set.
*/
struct LogLock {
  LogLock *pNext;                 /* Next lock on the same log */
  u32 mLock;                      /* Mask of locks */
};
dan7c246102010-04-12 19:00:29 +000091
/*
** An open connection to a log file. Allocated by sqlite3LogOpen() with
** space for the sqlite3_file object (pFd) appended to the same allocation,
** and released by sqlite3LogClose().
*/
struct Log {
  LogSummary *pSummary;           /* Log file summary data */
  sqlite3_vfs *pVfs;              /* The VFS used to create pFd */
  sqlite3_file *pFd;              /* File handle for log file */
  int sync_flags;                 /* Flags to use with OsSync() */
  int isLocked;                   /* Non-zero if a snapshot is held open */
  int isWriteLocked;              /* True if this is the writer connection */
  LogSummaryHdr hdr;              /* Log summary header for current snapshot */
  LogLock lock;                   /* Lock held by this connection (if any) */
};
102
dan64d039e2010-04-13 19:27:31 +0000103
/*
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the
** frame most recently written to the log.
**
** The internals of this structure are only accessed by:
**
**   logIteratorInit() - Create a new iterator,
**   logIteratorNext() - Step an iterator,
**   logIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see logCheckpoint()).
**
** aSegment[] is declared with one element but is allocated with nSegment
** elements by logIteratorInit(). Each segment covers up to 256 log frames.
*/
struct LogIterator {
  int nSegment;                   /* Size of LogIterator.aSegment[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */
  struct LogSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];
};
127
dan64d039e2010-04-13 19:27:31 +0000128
/*
** List of all LogSummary objects created by this process. Protected by
** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
** here instead of borrowing the LRU mutex.
**
** Objects are linked through LogSummary.pNext; new objects are pushed on
** the head of the list by sqlite3LogOpen().
*/
#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
static LogSummary *pLogSummary = 0;
136
dan7c246102010-04-12 19:00:29 +0000137/*
138** Generate an 8 byte checksum based on the data in array aByte[] and the
139** initial values of aCksum[0] and aCksum[1]. The checksum is written into
140** aCksum[] before returning.
141*/
142#define LOG_CKSM_BYTES 8
143static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000144 u64 sum1 = aCksum[0];
145 u64 sum2 = aCksum[1];
146 u32 *a32 = (u32 *)aByte;
147 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000148
149 assert( LOG_CKSM_BYTES==2*sizeof(u32) );
150 assert( (nByte&0x00000003)==0 );
151
dan39c79f52010-04-15 10:58:51 +0000152 do {
153 sum1 += (*a32++);
154 sum2 += sum1;
155 } while( a32<aEnd );
dan7c246102010-04-12 19:00:29 +0000156
dan39c79f52010-04-15 10:58:51 +0000157 aCksum[0] = sum1 + (sum1>>24);
158 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000159}
160
/*
** Normalize the nul-terminated path-name zPath in place. Trailing '/'
** characters are dropped, runs of '/' are collapsed and any "./" or "../"
** elements are resolved textually. For example, the input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** The output is never longer than the input, so no allocation is needed.
*/
static void logNormalizePath(char *zPath){
  int nIn = strlen(zPath);        /* Number of input bytes to consider */
  int iIn;                        /* Read offset into zPath */
  int iOut = 0;                   /* Write offset into zPath (iOut<=iIn) */

  /* Ignore trailing slashes, but always keep at least one character. */
  while( nIn>1 && zPath[nIn-1]=='/' ){
    nIn--;
  }

  for(iIn=0; iIn<nIn; iIn++){
    if( zPath[iIn]=='/' ){
      char cNext = zPath[iIn+1];

      /* "//": drop this slash, the following one will be considered next. */
      if( cNext=='/' ){
        continue;
      }

      if( cNext=='.' ){
        /* "/./": skip the slash and the dot. */
        if( iIn+2<nIn && zPath[iIn+2]=='/' ){
          iIn++;
          continue;
        }

        /* "/../": remove the previous element from the output and skip
        ** the slash and both dots. */
        if( iIn+3<nIn && zPath[iIn+2]=='.' && zPath[iIn+3]=='/' ){
          while( iOut>0 && zPath[iOut-1]!='/' ){
            iOut--;
          }
          if( iOut>0 ){
            iOut--;
          }
          iIn += 2;
          continue;
        }
      }
    }
    zPath[iOut++] = zPath[iIn];
  }
  zPath[iOut] = 0;
}
196
197/*
198** Lock the summary file pSummary->fd.
199*/
200static int logSummaryLock(LogSummary *pSummary){
201 int rc;
202 struct flock f;
203 memset(&f, 0, sizeof(f));
204 f.l_type = F_WRLCK;
205 f.l_whence = SEEK_SET;
206 f.l_start = 0;
207 f.l_len = 1;
208 rc = fcntl(pSummary->fd, F_SETLKW, &f);
209 if( rc!=0 ){
210 return SQLITE_IOERR;
211 }
212 return SQLITE_OK;
213}
214
215/*
216** Unlock the summary file pSummary->fd.
217*/
218static int logSummaryUnlock(LogSummary *pSummary){
219 int rc;
220 struct flock f;
221 memset(&f, 0, sizeof(f));
222 f.l_type = F_UNLCK;
223 f.l_whence = SEEK_SET;
224 f.l_start = 0;
225 f.l_len = 1;
226 rc = fcntl(pSummary->fd, F_SETLK, &f);
227 if( rc!=0 ){
228 return SQLITE_IOERR;
229 }
230 return SQLITE_OK;
231}
232
233/*
234** Memory map the first nByte bytes of the summary file opened with
235** pSummary->fd at pSummary->aData. If the summary file is smaller than
236** nByte bytes in size when this function is called, ftruncate() is
237** used to expand it before it is mapped.
238**
239** It is assumed that an exclusive lock is held on the summary file
240** by the caller (to protect the ftruncate()).
241*/
242static int logSummaryMap(LogSummary *pSummary, int nByte){
243 struct stat sStat;
244 int rc;
245 int fd = pSummary->fd;
246 void *pMap;
247
248 assert( pSummary->aData==0 );
249
250 /* If the file is less than nByte bytes in size, cause it to grow. */
251 rc = fstat(fd, &sStat);
252 if( rc!=0 ) return SQLITE_IOERR;
253 if( sStat.st_size<nByte ){
254 rc = ftruncate(fd, nByte);
255 if( rc!=0 ) return SQLITE_IOERR;
256 }
257
258 /* Map the file. */
259 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
260 if( pMap==MAP_FAILED ){
261 return SQLITE_IOERR;
262 }
263 pSummary->aData = (u32 *)pMap;
264 pSummary->nData = nByte;
265
266 return SQLITE_OK;
267}
268
269/*
270** Unmap the log-summary mapping and close the file-descriptor. If
271** the isTruncate argument is non-zero, truncate the log-summary file
272** region to zero bytes.
273**
274** Regardless of the value of isTruncate, close the file-descriptor
275** opened on the log-summary file.
276*/
277static int logSummaryUnmap(LogSummary *pSummary, int isTruncate){
278 int rc = SQLITE_OK;
279 if( pSummary->aData ){
280 assert( pSummary->fd>0 );
281 munmap(pSummary->aData, pSummary->nData);
282 pSummary->aData = 0;
283 if( isTruncate ){
284 rc = (ftruncate(pSummary->fd, 0) ? SQLITE_IOERR : SQLITE_OK);
285 }
286 }
287 if( pSummary->fd>0 ){
288 close(pSummary->fd);
289 pSummary->fd = -1;
290 }
291 return rc;
292}
293
294
295static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
296 u32 *aData = pSummary->aData;
297 memcpy(aData, pHdr, sizeof(LogSummaryHdr));
298 aData[LOGSUMMARY_HDR_NFIELD] = 1;
299 aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
300 logChecksumBytes(
301 (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
302 );
303}
304
305/*
306** This function encodes a single frame header and writes it to a buffer
307** supplied by the caller. A log frame-header is made up of a series of
308** 4-byte big-endian integers, as follows:
309**
310** 0: Database page size in bytes.
311** 4: Page number.
312** 8: New database size (for commit frames, otherwise zero).
313** 12: Frame checksum 1.
314** 16: Frame checksum 2.
315*/
316static void logEncodeFrame(
317 u32 *aCksum, /* IN/OUT: Checksum values */
318 u32 iPage, /* Database page number for frame */
319 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
320 int nData, /* Database page size (size of aData[]) */
321 u8 *aData, /* Pointer to page data (for checksum) */
322 u8 *aFrame /* OUT: Write encoded frame here */
323){
324 assert( LOG_FRAME_HDRSIZE==20 );
325
326 sqlite3Put4byte(&aFrame[0], nData);
327 sqlite3Put4byte(&aFrame[4], iPage);
328 sqlite3Put4byte(&aFrame[8], nTruncate);
329
330 logChecksumBytes(aFrame, 12, aCksum);
331 logChecksumBytes(aData, nData, aCksum);
332
333 sqlite3Put4byte(&aFrame[12], aCksum[0]);
334 sqlite3Put4byte(&aFrame[16], aCksum[1]);
335}
336
337/*
338** Return 1 and populate *piPage, *pnTruncate and aCksum if the
339** frame checksum looks Ok. Otherwise return 0.
340*/
341static int logDecodeFrame(
342 u32 *aCksum, /* IN/OUT: Checksum values */
343 u32 *piPage, /* OUT: Database page number for frame */
344 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
345 int nData, /* Database page size (size of aData[]) */
346 u8 *aData, /* Pointer to page data (for checksum) */
347 u8 *aFrame /* Frame data */
348){
dan4a4b01d2010-04-16 11:30:18 +0000349 assert( LOG_FRAME_HDRSIZE==20 );
350
dan7c246102010-04-12 19:00:29 +0000351 logChecksumBytes(aFrame, 12, aCksum);
352 logChecksumBytes(aData, nData, aCksum);
353
354 if( aCksum[0]!=sqlite3Get4byte(&aFrame[12])
355 || aCksum[1]!=sqlite3Get4byte(&aFrame[16])
356 ){
357 /* Checksum failed. */
358 return 0;
359 }
360
361 *piPage = sqlite3Get4byte(&aFrame[4]);
362 *pnTruncate = sqlite3Get4byte(&aFrame[8]);
363 return 1;
364}
365
366static void logMergesort8(
367 Pgno *aContent, /* Pages in log */
368 u8 *aBuffer, /* Buffer of at least *pnList items to use */
369 u8 *aList, /* IN/OUT: List to sort */
370 int *pnList /* IN/OUT: Number of elements in aList[] */
371){
372 int nList = *pnList;
373 if( nList>1 ){
374 int nLeft = nList / 2; /* Elements in left list */
375 int nRight = nList - nLeft; /* Elements in right list */
376 u8 *aLeft = aList; /* Left list */
377 u8 *aRight = &aList[nLeft]; /* Right list */
378 int iLeft = 0; /* Current index in aLeft */
379 int iRight = 0; /* Current index in aright */
380 int iOut = 0; /* Current index in output buffer */
381
382 /* TODO: Change to non-recursive version. */
383 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
384 logMergesort8(aContent, aBuffer, aRight, &nRight);
385
386 while( iRight<nRight || iLeft<nLeft ){
387 u8 logpage;
388 Pgno dbpage;
389
390 if( (iLeft<nLeft)
391 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
392 ){
393 logpage = aLeft[iLeft++];
394 }else{
395 logpage = aRight[iRight++];
396 }
397 dbpage = aContent[logpage];
398
399 aBuffer[iOut++] = logpage;
400 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
401
402 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
403 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
404 }
405 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
406 *pnList = iOut;
407 }
408
409#ifdef SQLITE_DEBUG
410 {
411 int i;
412 for(i=1; i<*pnList; i++){
413 assert( aContent[aList[i]] > aContent[aList[i-1]] );
414 }
415 }
416#endif
417}
418
419
420/*
421** Return the index in the LogSummary.aData array that corresponds to
422** frame iFrame. The log-summary file consists of a header, followed by
423** alternating "map" and "index" blocks.
424*/
425static int logSummaryEntry(u32 iFrame){
426 return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
427}
428
429
430/*
431** Set an entry in the log-summary map to map log frame iFrame to db
432** page iPage. Values are always appended to the log-summary (i.e. the
433** value of iFrame is always exactly one more than the value passed to
434** the previous call), but that restriction is not enforced or asserted
435** here.
436*/
437static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
438 u32 iSlot = logSummaryEntry(iFrame);
439
440 /* Set the log-summary entry itself */
441 pSummary->aData[iSlot] = iPage;
442
443 /* If the frame number is a multiple of 256 (frames are numbered starting
444 ** at 1), build an index of the most recently added 256 frames.
445 */
446 if( (iFrame&0x000000FF)==0 ){
447 int i; /* Iterator used while initializing aIndex */
448 u32 *aFrame; /* Pointer to array of 256 frames */
449 int nIndex; /* Number of entries in index */
450 u8 *aIndex; /* 256 bytes to build index in */
451 u8 *aTmp; /* Scratch space to use while sorting */
452
453 aFrame = &pSummary->aData[iSlot-255];
454 aIndex = (u8 *)&pSummary->aData[iSlot+1];
455 aTmp = &aIndex[256];
456
457 nIndex = 256;
458 for(i=0; i<256; i++) aIndex[i] = (u8)i;
459 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
460 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
461 }
462}
463
464
465/*
466** Recover the log-summary by reading the log file. The caller must hold
467** an exclusive lock on the log-summary file.
468*/
469static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
470 int rc; /* Return Code */
471 i64 nSize; /* Size of log file */
472 LogSummaryHdr hdr; /* Recovered log-summary header */
473
474 memset(&hdr, 0, sizeof(hdr));
475
476 rc = sqlite3OsFileSize(pFd, &nSize);
477 if( rc!=SQLITE_OK ){
478 return rc;
479 }
480
481 if( nSize>LOG_FRAME_HDRSIZE ){
482 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
483 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
484 int nFrame; /* Number of bytes at aFrame */
485 u8 *aData; /* Pointer to data part of aFrame buffer */
486 int iFrame; /* Index of last frame read */
487 i64 iOffset; /* Next offset to read from log file */
488 int nPgsz; /* Page size according to the log */
489 u32 aCksum[2] = {2, 3}; /* Running checksum */
490
491 /* Read in the first frame header in the file (to determine the
492 ** database page size).
493 */
494 rc = sqlite3OsRead(pFd, aBuf, LOG_FRAME_HDRSIZE, 0);
495 if( rc!=SQLITE_OK ){
496 return rc;
497 }
498
499 /* If the database page size is not a power of two, or is greater than
500 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
501 */
502 nPgsz = sqlite3Get4byte(&aBuf[0]);
503 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
504 goto finished;
505 }
506
507 /* Malloc a buffer to read frames into. */
508 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
509 aFrame = (u8 *)sqlite3_malloc(nFrame);
510 if( !aFrame ){
511 return SQLITE_NOMEM;
512 }
513 aData = &aFrame[LOG_FRAME_HDRSIZE];
514
515 /* Read all frames from the log file. */
516 iFrame = 0;
517 iOffset = 0;
dan4a4b01d2010-04-16 11:30:18 +0000518 for(iOffset=0; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
dan7c246102010-04-12 19:00:29 +0000519 u32 pgno; /* Database page number for frame */
520 u32 nTruncate; /* dbsize field from frame header */
521 int isValid; /* True if this frame is valid */
522
523 /* Read and decode the next log frame. */
524 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
525 if( rc!=SQLITE_OK ) break;
526 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
527 if( !isValid ) break;
528 logSummaryAppend(pSummary, ++iFrame, pgno);
529
530 /* If nTruncate is non-zero, this is a commit record. */
531 if( nTruncate ){
532 hdr.iCheck1 = aCksum[0];
533 hdr.iCheck2 = aCksum[1];
534 hdr.iLastPg = iFrame;
535 hdr.nPage = nTruncate;
536 hdr.pgsz = nPgsz;
537 }
538 }
539
540 sqlite3_free(aFrame);
541 }else{
542 hdr.iCheck1 = 2;
543 hdr.iCheck2 = 3;
544 }
545
546finished:
547 logSummaryWriteHdr(pSummary, &hdr);
548 return rc;
549}
550
551
552/*
553** This function intializes the connection to the log-summary identified
554** by struct pSummary.
555*/
556static int logSummaryInit(LogSummary *pSummary, sqlite3_file *pFd){
557 int rc; /* Return Code */
558 char *zFile; /* File name for summary file */
559
560 assert( pSummary->fd<0 );
561 assert( pSummary->aData==0 );
562 assert( pSummary->nRef>0 );
563 assert( pSummary->zPath );
564
565 /* Open a file descriptor on the summary file. */
566 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
567 if( !zFile ){
568 return SQLITE_NOMEM;
569 }
570 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
571 sqlite3_free(zFile);
572 if( pSummary->fd<0 ){
573 return SQLITE_IOERR;
574 }
575
576 /* Grab an exclusive lock the summary file. Then mmap() it. TODO: This
577 ** code needs to be enhanced to support a growable mapping. For now, just
578 ** make the mapping very large to start with.
579 */
580 rc = logSummaryLock(pSummary);
581 if( rc!=SQLITE_OK ) return rc;
582 rc = logSummaryMap(pSummary, 512*1024);
583 if( rc!=SQLITE_OK ) goto out;
584
585 /* Grab a SHARED lock on the log file. Then try to upgrade to an EXCLUSIVE
586 ** lock. If successful, then this is the first (and only) connection to
587 ** the database. In this case assume the contents of the log-summary
588 ** cannot be trusted. Zero the log-summary header to make sure.
589 **
590 ** The SHARED lock on the log file is not released until the connection
591 ** to the database is closed.
592 */
593 rc = sqlite3OsLock(pFd, SQLITE_LOCK_SHARED);
594 if( rc!=SQLITE_OK ) goto out;
595 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
596 if( rc==SQLITE_OK ){
597 /* This is the first and only connection. */
598 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
599 rc = sqlite3OsUnlock(pFd, SQLITE_LOCK_SHARED);
600 }else if( rc==SQLITE_BUSY ){
601 rc = SQLITE_OK;
602 }
603
604 out:
605 logSummaryUnlock(pSummary);
606 return rc;
607}
608
609/*
610** Open a connection to the log file associated with database zDb. The
611** database file does not actually have to exist. zDb is used only to
612** figure out the name of the log file to open. If the log file does not
613** exist it is created by this call.
614*/
615int sqlite3LogOpen(
616 sqlite3_vfs *pVfs, /* vfs module to open log file with */
617 const char *zDb, /* Name of database file */
618 Log **ppLog /* OUT: Allocated Log handle */
619){
danb9bf16b2010-04-14 11:23:30 +0000620 int rc = SQLITE_OK; /* Return Code */
dan7c246102010-04-12 19:00:29 +0000621 Log *pRet; /* Object to allocate and return */
622 LogSummary *pSummary = 0; /* Summary object */
623 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
624 int flags; /* Flags passed to OsOpen() */
625 char *zWal = 0; /* Path to WAL file */
626 int nWal; /* Length of zWal in bytes */
627
628 /* Zero output variables */
629 assert( zDb );
630 *ppLog = 0;
631
632 /* Allocate an instance of struct Log to return. */
633 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
634 if( !pRet ) goto out;
635 pRet->pVfs = pVfs;
636 pRet->pFd = (sqlite3_file *)&pRet[1];
637 pRet->sync_flags = SQLITE_SYNC_NORMAL;
638
639 /* Normalize the path name. */
640 zWal = sqlite3_mprintf("%s-wal", zDb);
641 if( !zWal ) goto out;
642 logNormalizePath(zWal);
643 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_DB);
644 nWal = sqlite3Strlen30(zWal);
645
646 /* Enter the mutex that protects the linked-list of LogSummary structures */
647 if( sqlite3GlobalConfig.bCoreMutex ){
648 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
649 }
650 sqlite3_mutex_enter(mutex);
651
652 /* Search for an existing log summary object in the linked list. If one
653 ** cannot be found, allocate and initialize a new object.
654 */
655 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
656 int nPath = sqlite3Strlen30(pSummary->zPath);
657 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
658 }
659 if( !pSummary ){
660 int nByte = sizeof(LogSummary) + nWal + 1;
661 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
662 if( !pSummary ){
663 rc = SQLITE_NOMEM;
664 goto out;
665 }
666 if( sqlite3GlobalConfig.bCoreMutex ){
667 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
668 }
669 pSummary->zPath = (char *)&pSummary[1];
670 pSummary->fd = -1;
671 memcpy(pSummary->zPath, zWal, nWal);
672 pSummary->pNext = pLogSummary;
673 pLogSummary = pSummary;
674 }
675 pSummary->nRef++;
676 pRet->pSummary = pSummary;
677
678 /* Exit the mutex protecting the linked-list of LogSummary objects. */
679 sqlite3_mutex_leave(mutex);
680 mutex = 0;
681
682 /* Open file handle on the log file. */
683 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
684 if( rc!=SQLITE_OK ) goto out;
685
686 /* Object pSummary is shared between all connections to the database made
687 ** by this process. So at this point it may or may not be connected to
688 ** the log-summary. If it is not, connect it. Otherwise, just take the
689 ** SHARED lock on the log file.
690 */
691 sqlite3_mutex_enter(pSummary->mutex);
692 mutex = pSummary->mutex;
693 if( pSummary->fd<0 ){
694 rc = logSummaryInit(pSummary, pRet->pFd);
695 }else{
696 rc = sqlite3OsLock(pRet->pFd, SQLITE_LOCK_SHARED);
697 }
698
dan64d039e2010-04-13 19:27:31 +0000699 pRet->lock.pNext = pSummary->pLock;
700 pSummary->pLock = &pRet->lock;
701
dan7c246102010-04-12 19:00:29 +0000702 out:
703 sqlite3_mutex_leave(mutex);
704 sqlite3_free(zWal);
705 if( rc!=SQLITE_OK ){
706 assert(0);
707 if( pRet ){
708 sqlite3OsClose(pRet->pFd);
709 sqlite3_free(pRet);
710 }
711 assert( !pSummary || pSummary->nRef==0 );
712 sqlite3_free(pSummary);
713 }
714 *ppLog = pRet;
715 return rc;
716}
717
dan4a4b01d2010-04-16 11:30:18 +0000718static int logIteratorNext(
719 LogIterator *p, /* Iterator */
dan7c246102010-04-12 19:00:29 +0000720 u32 *piPage, /* OUT: Next db page to write */
721 u32 *piFrame /* OUT: Log frame to read from */
722){
723 u32 iMin = *piPage;
724 u32 iRet = 0xFFFFFFFF;
725 int i;
726 int nBlock = p->nFinal;
727
728 for(i=p->nSegment-1; i>=0; i--){
729 struct LogSegment *pSegment = &p->aSegment[i];
730 while( pSegment->iNext<nBlock ){
731 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
732 if( iPg>iMin ){
733 if( iPg<iRet ){
734 iRet = iPg;
735 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
736 }
737 break;
738 }
739 pSegment->iNext++;
740 }
741
742 nBlock = 256;
743 }
744
745 *piPage = iRet;
746 return (iRet==0xFFFFFFFF);
747}
748
dan4a4b01d2010-04-16 11:30:18 +0000749static LogIterator *logIteratorInit(Log *pLog){
dan7c246102010-04-12 19:00:29 +0000750 u32 *aData = pLog->pSummary->aData;
dan4a4b01d2010-04-16 11:30:18 +0000751 LogIterator *p; /* Return value */
dan7c246102010-04-12 19:00:29 +0000752 int nSegment; /* Number of segments to merge */
753 u32 iLast; /* Last frame in log */
754 int nByte; /* Number of bytes to allocate */
755 int i; /* Iterator variable */
756 int nFinal; /* Number of unindexed entries */
757 struct LogSegment *pFinal; /* Final (unindexed) segment */
758 u8 *aTmp; /* Temp space used by merge-sort */
759
760 iLast = pLog->hdr.iLastPg;
761 nSegment = (iLast >> 8) + 1;
762 nFinal = (iLast & 0x000000FF);
763
dan4a4b01d2010-04-16 11:30:18 +0000764 nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
765 p = (LogIterator *)sqlite3_malloc(nByte);
dan7c246102010-04-12 19:00:29 +0000766 if( p ){
767 memset(p, 0, nByte);
768 p->nSegment = nSegment;
769 p->nFinal = nFinal;
770 }
771
772 for(i=0; i<nSegment-1; i++){
773 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
774 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
775 }
776 pFinal = &p->aSegment[nSegment-1];
777
778 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
779 pFinal->aIndex = (u8 *)&pFinal[1];
780 aTmp = &pFinal->aIndex[256];
781 for(i=0; i<nFinal; i++){
782 pFinal->aIndex[i] = i;
783 }
784 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
785 p->nFinal = nFinal;
786
787 return p;
788}
789
/*
** Free a log iterator allocated by logIteratorInit(). The iterator and
** the index built for its final segment share one allocation, so a single
** sqlite3_free() releases everything.
*/
static void logIteratorFree(LogIterator *p){
  sqlite3_free(p);
}
796
797/*
798** Checkpoint the contents of the log file.
799*/
800static int logCheckpoint(
801 Log *pLog, /* Log connection */
802 sqlite3_file *pFd, /* File descriptor open on db file */
803 u8 *zBuf /* Temporary buffer to use */
804){
805 int rc; /* Return code */
806 int pgsz = pLog->hdr.pgsz; /* Database page-size */
dan4a4b01d2010-04-16 11:30:18 +0000807 LogIterator *pIter = 0; /* Log iterator context */
dan7c246102010-04-12 19:00:29 +0000808 u32 iDbpage = 0; /* Next database page to write */
danb9bf16b2010-04-14 11:23:30 +0000809 u32 iFrame = 0; /* Log frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +0000810
danbb2e9c92010-04-15 13:33:18 +0000811 if( pLog->hdr.iLastPg==0 ){
812 return SQLITE_OK;
813 }
814
dan7c246102010-04-12 19:00:29 +0000815 /* Allocate the iterator */
dan4a4b01d2010-04-16 11:30:18 +0000816 pIter = logIteratorInit(pLog);
dan7c246102010-04-12 19:00:29 +0000817 if( !pIter ) return SQLITE_NOMEM;
818
819 /* Sync the log file to disk */
820 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
821 if( rc!=SQLITE_OK ) goto out;
822
823 /* Iterate through the contents of the log, copying data to the db file. */
dan4a4b01d2010-04-16 11:30:18 +0000824 while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
dan7c246102010-04-12 19:00:29 +0000825 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
826 (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
827 );
828 if( rc!=SQLITE_OK ) goto out;
829 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
830 if( rc!=SQLITE_OK ) goto out;
831 }
832
833 /* Truncate the database file */
834 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
835 if( rc!=SQLITE_OK ) goto out;
836
837 /* Sync the database file. If successful, update the log-summary. */
838 rc = sqlite3OsSync(pFd, pLog->sync_flags);
839 if( rc!=SQLITE_OK ) goto out;
840 pLog->hdr.iLastPg = 0;
841 pLog->hdr.iCheck1 = 2;
842 pLog->hdr.iCheck2 = 3;
843 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
844
845 /* TODO: If a crash occurs and the current log is copied into the
846 ** database there is no problem. However, if a crash occurs while
847 ** writing the next transaction into the start of the log, such that:
848 **
849 ** * The first transaction currently in the log is left intact, but
850 ** * The second (or subsequent) transaction is damaged,
851 **
852 ** then the database could become corrupt.
853 **
854 ** The easiest thing to do would be to write and sync a dummy header
855 ** into the log at this point. Unfortunately, that turns out to be
856 ** an unwelcome performance hit. Alternatives are...
857 */
858#if 0
859 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
860 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
861 if( rc!=SQLITE_OK ) goto out;
862 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
863#endif
864
865 out:
dan4a4b01d2010-04-16 11:30:18 +0000866 logIteratorFree(pIter);
dan7c246102010-04-12 19:00:29 +0000867 return rc;
868}
869
870/*
871** Close a connection to a log file.
872*/
873int sqlite3LogClose(
874 Log *pLog, /* Log to close */
875 sqlite3_file *pFd, /* Database file */
876 u8 *zBuf /* Buffer of at least page-size bytes */
877){
878 int rc = SQLITE_OK;
879 if( pLog ){
dan64d039e2010-04-13 19:27:31 +0000880 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +0000881 LogSummary *pSummary = pLog->pSummary;
882 sqlite3_mutex *mutex = 0;
883
dan64d039e2010-04-13 19:27:31 +0000884 sqlite3_mutex_enter(pSummary->mutex);
885 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
886 *ppL = pLog->lock.pNext;
887 sqlite3_mutex_leave(pSummary->mutex);
888
dan7c246102010-04-12 19:00:29 +0000889 if( sqlite3GlobalConfig.bCoreMutex ){
890 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
891 }
892 sqlite3_mutex_enter(mutex);
893
894 /* Decrement the reference count on the log summary. If this is the last
895 ** reference to the log summary object in this process, the object will
896 ** be freed. If this is also the last connection to the database, then
897 ** checkpoint the database and truncate the log and log-summary files
898 ** to zero bytes in size.
899 **/
900 pSummary->nRef--;
901 if( pSummary->nRef==0 ){
902 LogSummary **pp;
903
904 rc = logSummaryLock(pSummary);
905 if( rc==SQLITE_OK ){
906 int isTruncate = 0;
907 int rc2 = sqlite3OsLock(pLog->pFd, SQLITE_LOCK_EXCLUSIVE);
908 if( rc2==SQLITE_OK ){
909 /* This is the last connection to the database (including other
910 ** processes). Do three things:
911 **
912 ** 1. Checkpoint the db.
913 ** 2. Truncate the log file to zero bytes.
914 ** 3. Truncate the log-summary file to zero bytes.
915 */
916 rc2 = logCheckpoint(pLog, pFd, zBuf);
917 if( rc2==SQLITE_OK ){
918 rc2 = sqlite3OsTruncate(pLog->pFd, 0);
919 }
920 isTruncate = 1;
921 }else if( rc2==SQLITE_BUSY ){
922 rc2 = SQLITE_OK;
923 }
924 logSummaryUnmap(pSummary, isTruncate);
925 sqlite3OsUnlock(pLog->pFd, SQLITE_LOCK_NONE);
926 rc = logSummaryUnlock(pSummary);
927 if( rc2!=SQLITE_OK ) rc = rc2;
928 }
929
930 /* Remove the LogSummary object from the global list. Then free the
931 ** mutex and the object itself.
932 */
933 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
934 *pp = (*pp)->pNext;
935 sqlite3_mutex_free(pSummary->mutex);
936 sqlite3_free(pSummary);
937 }
938
939 sqlite3_mutex_leave(mutex);
940
941 /* Close the connection to the log file and free the Log handle. */
942 sqlite3OsClose(pLog->pFd);
943 sqlite3_free(pLog);
944 }
945 return rc;
946}
947
/*
** Record the flags that are handed to sqlite3OsSync() whenever the log
** file is synced. Only SQLITE_SYNC_NORMAL and SQLITE_SYNC_FULL are
** accepted (enforced by assert()).
**
** This function is currently compiled out.
*/
#if 0
void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
  assert( sync_flags==SQLITE_SYNC_FULL || sync_flags==SQLITE_SYNC_NORMAL );
  pLog->sync_flags = sync_flags;
}
#endif
958
959/*
960** Enter and leave the log-summary mutex. In this context, entering the
961** log-summary mutex means:
962**
963** 1. Obtaining mutex pLog->pSummary->mutex, and
964** 2. Taking an exclusive lock on the log-summary file.
965**
966** i.e. this mutex locks out other processes as well as other threads
967** hosted in this address space.
968*/
969static int logEnterMutex(Log *pLog){
970 LogSummary *pSummary = pLog->pSummary;
971 int rc;
972
973 sqlite3_mutex_enter(pSummary->mutex);
974 rc = logSummaryLock(pSummary);
975 if( rc!=SQLITE_OK ){
976 sqlite3_mutex_leave(pSummary->mutex);
977 }
978 return rc;
979}
980static void logLeaveMutex(Log *pLog){
981 LogSummary *pSummary = pLog->pSummary;
982 logSummaryUnlock(pSummary);
983 sqlite3_mutex_leave(pSummary->mutex);
984}
985
986/*
dan64d039e2010-04-13 19:27:31 +0000987** Values for the second parameter to logLockRegion().
988*/
989#define LOG_UNLOCK 0
990#define LOG_RDLOCK 1
991#define LOG_WRLOCK 2
992
993static int logLockRegion(Log *pLog, u32 mRegion, int op){
994 LogSummary *pSummary = pLog->pSummary;
995 LogLock *p; /* Used to iterate through in-process locks */
dan02bb5962010-04-14 15:49:40 +0000996 u32 mOther; /* Locks held by other connections */
997 u32 mNew; /* New mask for pLog */
dan64d039e2010-04-13 19:27:31 +0000998
999 assert(
1000 /* Writer lock operations */
1001 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
1002 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
1003
dan02bb5962010-04-14 15:49:40 +00001004 /* Normal reader lock operations */
dan64d039e2010-04-13 19:27:31 +00001005 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
dan64d039e2010-04-13 19:27:31 +00001006 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
1007 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
dan02bb5962010-04-14 15:49:40 +00001008
1009 /* Region D reader lock operations */
1010 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
dan49320f82010-04-14 18:50:08 +00001011 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
dan64d039e2010-04-13 19:27:31 +00001012 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
1013
1014 /* Checkpointer lock operations */
1015 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
1016 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
dan64d039e2010-04-13 19:27:31 +00001017 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
dan02bb5962010-04-14 15:49:40 +00001018 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
dan64d039e2010-04-13 19:27:31 +00001019 );
1020
dan02bb5962010-04-14 15:49:40 +00001021 /* Assert that a connection never tries to go from an EXCLUSIVE to a
1022 ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
1023 ** happens though (when a region D reader upgrades to a writer).
1024 */
1025 assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
1026
dan64d039e2010-04-13 19:27:31 +00001027 sqlite3_mutex_enter(pSummary->mutex);
1028
dan02bb5962010-04-14 15:49:40 +00001029 /* Calculate a mask of logs held by all connections in this process apart
1030 ** from this one. The least significant byte of the mask contains a mask
1031 ** of the SHARED logs held. The next least significant byte of the mask
1032 ** indicates the EXCLUSIVE locks held. For example, to test if some other
1033 ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
1034 ** on region C, do:
1035 **
1036 ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
1037 ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
1038 **
1039 ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
1040 ** corresponding bit in the SHARED mask.
dan64d039e2010-04-13 19:27:31 +00001041 */
dan02bb5962010-04-14 15:49:40 +00001042 mOther = 0;
1043 for(p=pSummary->pLock; p; p=p->pNext){
1044 assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
1045 if( p!=&pLog->lock ){
1046 mOther |= p->mLock;
dan64d039e2010-04-13 19:27:31 +00001047 }
1048 }
1049
dan02bb5962010-04-14 15:49:40 +00001050 /* If this call is to lock a region (not to unlock one), test if locks held
1051 ** by any other connection in this process prevent the new locks from
1052 ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
1053 */
1054 if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
1055 sqlite3_mutex_leave(pSummary->mutex);
1056 return SQLITE_BUSY;
1057 }
1058
1059 /* Figure out the new log mask for this connection. */
dan64d039e2010-04-13 19:27:31 +00001060 switch( op ){
1061 case LOG_UNLOCK:
dan02bb5962010-04-14 15:49:40 +00001062 mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
dan64d039e2010-04-13 19:27:31 +00001063 break;
1064 case LOG_RDLOCK:
dan02bb5962010-04-14 15:49:40 +00001065 mNew = (pLog->lock.mLock | mRegion);
dan64d039e2010-04-13 19:27:31 +00001066 break;
1067 default:
1068 assert( op==LOG_WRLOCK );
dan02bb5962010-04-14 15:49:40 +00001069 mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
dan64d039e2010-04-13 19:27:31 +00001070 break;
1071 }
1072
dan02bb5962010-04-14 15:49:40 +00001073 /* Now modify the locks held on the log-summary file descriptor. This
1074 ** file descriptor is shared by all log connections in this process.
1075 ** Therefore:
1076 **
1077 ** + If one or more log connections in this process hold a SHARED lock
1078 ** on a region, the file-descriptor should hold a SHARED lock on
1079 ** the file region.
1080 **
1081 ** + If a log connection in this process holds an EXCLUSIVE lock on a
1082 ** region, the file-descriptor should also hold an EXCLUSIVE lock on
1083 ** the region in question.
1084 **
1085 ** If this is an LOG_UNLOCK operation, only regions for which no other
1086 ** connection holds a lock should actually be unlocked. And if this
1087 ** is a LOG_RDLOCK operation and other connections already hold all
1088 ** the required SHARED locks, then no system call is required.
1089 */
1090 if( op==LOG_UNLOCK ){
1091 mRegion = (mRegion & ~mOther);
dan64d039e2010-04-13 19:27:31 +00001092 }
dan02bb5962010-04-14 15:49:40 +00001093 if( (op==LOG_WRLOCK)
1094 || (op==LOG_UNLOCK && mRegion)
1095 || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
1096 ){
1097 struct LockMap {
1098 int iStart; /* Byte offset to start locking operation */
1099 int iLen; /* Length field for locking operation */
1100 } aMap[] = {
1101 /* 0000 */ {0, 0}, /* 0001 */ {4, 1},
1102 /* 0010 */ {3, 1}, /* 0011 */ {3, 2},
1103 /* 0100 */ {2, 1}, /* 0101 */ {0, 0},
1104 /* 0110 */ {2, 2}, /* 0111 */ {2, 3},
1105 /* 1000 */ {1, 1}, /* 1001 */ {0, 0},
1106 /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
1107 /* 1100 */ {1, 2}, /* 1101 */ {0, 0},
dane264d982010-04-14 18:06:50 +00001108 /* 1110 */ {0, 0}, /* 1111 */ {0, 0}
dan02bb5962010-04-14 15:49:40 +00001109 };
1110 int rc; /* Return code of fcntl() */
1111 struct flock f; /* Locking operation */
dan64d039e2010-04-13 19:27:31 +00001112
dan02bb5962010-04-14 15:49:40 +00001113 assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
1114
dan64d039e2010-04-13 19:27:31 +00001115 memset(&f, 0, sizeof(f));
1116 f.l_type = (op==LOG_WRLOCK?F_WRLCK:(op==LOG_RDLOCK?F_RDLCK:F_UNLCK));
1117 f.l_whence = SEEK_SET;
dan02bb5962010-04-14 15:49:40 +00001118 f.l_start = 32 + aMap[mRegion].iStart;
1119 f.l_len = aMap[mRegion].iLen;
dan64d039e2010-04-13 19:27:31 +00001120
1121 rc = fcntl(pSummary->fd, F_SETLK, &f);
1122 if( rc!=0 ){
1123 sqlite3_mutex_leave(pSummary->mutex);
1124 return SQLITE_BUSY;
1125 }
1126 }
1127
dan02bb5962010-04-14 15:49:40 +00001128 pLog->lock.mLock = mNew;
dan64d039e2010-04-13 19:27:31 +00001129 sqlite3_mutex_leave(pSummary->mutex);
1130 return SQLITE_OK;
1131}
1132
1133/*
danb9bf16b2010-04-14 11:23:30 +00001134** Try to read the log-summary header. Attempt to verify the header
1135** checksum. If the checksum can be verified, copy the log-summary
1136** header into structure pLog->hdr. If the contents of pLog->hdr are
1137** modified by this and pChanged is not NULL, set *pChanged to 1.
1138** Otherwise leave *pChanged unmodified.
1139**
1140** If the checksum cannot be verified return SQLITE_ERROR.
1141*/
1142int logSummaryTryHdr(Log *pLog, int *pChanged){
1143 u32 aCksum[2] = {1, 1};
1144 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1145
1146 /* First try to read the header without a lock. Verify the checksum
1147 ** before returning. This will almost always work.
1148 */
1149 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1150 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1151 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1152 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1153 ){
1154 return SQLITE_ERROR;
1155 }
1156
1157 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1158 if( pChanged ){
1159 *pChanged = 1;
1160 }
1161 memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
1162 }
1163 return SQLITE_OK;
1164}
1165
1166/*
1167** Read the log-summary header from the log-summary file into structure
1168** pLog->hdr. If attempting to verify the header checksum fails, try
1169** to recover the log before returning.
1170**
1171** If the log-summary header is successfully read, return SQLITE_OK.
1172** Otherwise an SQLite error code.
1173*/
1174int logSummaryReadHdr(Log *pLog, int *pChanged){
1175 int rc;
1176
1177 /* First try to read the header without a lock. Verify the checksum
1178 ** before returning. This will almost always work.
1179 */
1180 if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
1181 return SQLITE_OK;
1182 }
1183
1184 /* If the first attempt to read the header failed, lock the log-summary
1185 ** file and try again. If the header checksum verification fails this
1186 ** time as well, run log recovery.
1187 */
1188 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1189 if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
1190 if( pChanged ){
1191 *pChanged = 1;
1192 }
1193 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1194 if( rc==SQLITE_OK ){
1195 rc = logSummaryTryHdr(pLog, 0);
1196 }
1197 }
1198 logLeaveMutex(pLog);
1199 }
1200
1201 return rc;
1202}
1203
1204/*
dan64d039e2010-04-13 19:27:31 +00001205** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001206**
1207** If this call obtains a new read-lock and the database contents have been
1208** modified since the most recent call to LogCloseSnapshot() on this Log
1209** connection, then *pChanged is set to 1 before returning. Otherwise, it
1210** is left unmodified. This is used by the pager layer to determine whether
1211** or not any cached pages may be safely reused.
1212*/
1213int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
1214 int rc = SQLITE_OK;
1215 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001216 int nAttempt;
1217
1218 /* Obtain a snapshot-lock on the log-summary file. The procedure
1219 ** for obtaining the snapshot log is:
1220 **
1221 ** 1. Attempt a SHARED lock on regions A and B.
1222 ** 2a. If step 1 is successful, drop the lock on region B.
1223 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1224 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1225 ** successful.
1226 **
1227 ** If neither of the locks can be obtained after 5 tries, presumably
1228 ** something is wrong (i.e. a process not following the locking protocol).
1229 ** Return an error code in this case.
1230 */
1231 rc = SQLITE_BUSY;
1232 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1233 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1234 if( rc==SQLITE_BUSY ){
1235 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1236 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1237 }else{
1238 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1239 pLog->isLocked = LOG_REGION_A;
1240 }
1241 }
1242 if( rc!=SQLITE_OK ){
1243 return rc;
1244 }
1245
danb9bf16b2010-04-14 11:23:30 +00001246 rc = logSummaryReadHdr(pLog, pChanged);
dan64d039e2010-04-13 19:27:31 +00001247 if( rc!=SQLITE_OK ){
1248 /* An error occured while attempting log recovery. */
1249 sqlite3LogCloseSnapshot(pLog);
1250 }
dan7c246102010-04-12 19:00:29 +00001251 }
1252 return rc;
1253}
1254
1255/*
1256** Unlock the current snapshot.
1257*/
1258void sqlite3LogCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001259 if( pLog->isLocked ){
1260 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1261 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1262 }
dan7c246102010-04-12 19:00:29 +00001263 pLog->isLocked = 0;
1264}
1265
dan7c246102010-04-12 19:00:29 +00001266/*
1267** Read a page from the log, if it is present.
1268*/
1269int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
1270 u32 iRead = 0;
1271 u32 *aData = pLog->pSummary->aData;
1272 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1273
dan39c79f52010-04-15 10:58:51 +00001274 assert( pLog->isLocked );
1275
dan7c246102010-04-12 19:00:29 +00001276 /* Do a linear search of the unindexed block of page-numbers (if any)
1277 ** at the end of the log-summary. An alternative to this would be to
1278 ** build an index in private memory each time a read transaction is
1279 ** opened on a new snapshot.
1280 */
1281 if( pLog->hdr.iLastPg ){
1282 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1283 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1284 while( *pi!=pgno && pi!=piStop ) pi--;
1285 if( pi!=piStop ){
1286 iRead = (pi-piStop) + iFrame;
1287 }
1288 }
1289 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1290
1291 while( iRead==0 && iFrame>0 ){
1292 int iLow = 0;
1293 int iHigh = 255;
1294 u32 *aFrame;
1295 u8 *aIndex;
1296
1297 iFrame -= 256;
1298 aFrame = &aData[logSummaryEntry(iFrame+1)];
1299 aIndex = (u8 *)&aFrame[256];
1300
1301 while( iLow<=iHigh ){
1302 int iTest = (iLow+iHigh)>>1;
1303 u32 iPg = aFrame[aIndex[iTest]];
1304
1305 if( iPg==pgno ){
1306 iRead = iFrame + 1 + aIndex[iTest];
1307 break;
1308 }
1309 else if( iPg<pgno ){
1310 iLow = iTest+1;
1311 }else{
1312 iHigh = iTest-1;
1313 }
1314 }
1315 }
1316 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1317
1318 /* If iRead is non-zero, then it is the log frame number that contains the
1319 ** required page. Read and return data from the log file.
1320 */
1321 if( iRead ){
1322 i64 iOffset = (iRead-1) * (pLog->hdr.pgsz+LOG_FRAME_HDRSIZE);
1323 iOffset += LOG_FRAME_HDRSIZE;
1324 *pInLog = 1;
1325 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1326 }
1327
1328 *pInLog = 0;
1329 return SQLITE_OK;
1330}
1331
1332
1333/*
1334** Set *pPgno to the size of the database file (or zero, if unknown).
1335*/
1336void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
1337 assert( pLog->isLocked );
1338 *pPgno = pLog->hdr.nPage;
1339}
1340
1341/*
dan7c246102010-04-12 19:00:29 +00001342** This function returns SQLITE_OK if the caller may write to the database.
1343** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001344** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001345*/
1346int sqlite3LogWriteLock(Log *pLog, int op){
1347 assert( pLog->isLocked );
1348 if( op ){
dan64d039e2010-04-13 19:27:31 +00001349
1350 /* Obtain the writer lock */
1351 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1352 if( rc!=SQLITE_OK ){
1353 return rc;
1354 }
1355
dan39c79f52010-04-15 10:58:51 +00001356 /* If this is connection is a region D reader, then the SHARED lock on
1357 ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
1358 ** held on region A. This means that if the write-transaction is committed
dan49320f82010-04-14 18:50:08 +00001359 ** and this connection downgrades to a reader, it will be left with no
dan39c79f52010-04-15 10:58:51 +00001360 ** lock at all. And so its snapshot could get clobbered by a checkpoint
dan49320f82010-04-14 18:50:08 +00001361 ** operation.
1362 **
1363 ** To stop this from happening, grab a SHARED lock on region A now.
1364 ** This should always be successful, as the only time a client holds
1365 ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
1366 ** lock on region C (a checkpointer does this). This is not possible,
1367 ** as this connection currently has the EXCLUSIVE lock on region C.
dan02bb5962010-04-14 15:49:40 +00001368 */
dan49320f82010-04-14 18:50:08 +00001369 if( pLog->isLocked==LOG_REGION_D ){
1370 logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
1371 pLog->isLocked = LOG_REGION_A;
1372 }
dan02bb5962010-04-14 15:49:40 +00001373
dan39c79f52010-04-15 10:58:51 +00001374 /* If this connection is not reading the most recent database snapshot,
1375 ** it is not possible to write to the database. In this case release
1376 ** the write locks and return SQLITE_BUSY.
1377 */
dan7c246102010-04-12 19:00:29 +00001378 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
dan49320f82010-04-14 18:50:08 +00001379 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001380 return SQLITE_BUSY;
1381 }
1382 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001383
dan7c246102010-04-12 19:00:29 +00001384 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001385 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001386 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1387 pLog->isWriteLocked = 0;
1388 }
1389 return SQLITE_OK;
1390}
1391
1392/*
1393** Write a set of frames to the log. The caller must hold at least a
1394** RESERVED lock on the database file.
1395*/
1396int sqlite3LogFrames(
1397 Log *pLog, /* Log handle to write to */
1398 int nPgsz, /* Database page-size in bytes */
1399 PgHdr *pList, /* List of dirty pages to write */
1400 Pgno nTruncate, /* Database size after this commit */
1401 int isCommit, /* True if this is a commit */
1402 int isSync /* True to sync the log file */
1403){
1404 /* Each frame has a 20 byte header, as follows:
1405 **
1406 ** + Pseudo-random salt (4 bytes)
1407 ** + Page number (4 bytes)
1408 ** + New database size, or 0 if not a commit frame (4 bytes)
1409 ** + Checksum (CHECKSUM_BYTES bytes);
1410 **
1411 ** The checksum is computed based on the following:
1412 **
1413 ** + The previous checksum, or {2, 3} for the first frame in the log.
1414 ** + The non-checksum fields of the frame header, and
1415 ** + The frame contents (page data).
1416 **
1417 ** This format must also be understood by the code in logSummaryRecover().
1418 ** The size of the frame header is used by LogRead() and LogCheckpoint().
1419 */
1420 int rc; /* Used to catch return codes */
1421 u32 iFrame; /* Next frame address */
1422 u8 aFrame[LOG_FRAME_HDRSIZE];
1423 PgHdr *p; /* Iterator to run through pList with. */
1424 u32 aCksum[2];
1425
1426 PgHdr *pLast; /* Last frame in list */
1427 int nLast = 0; /* Number of extra copies of last page */
1428
1429 assert( LOG_FRAME_HDRSIZE==(4 * 3 + LOG_CKSM_BYTES) );
1430 assert( pList );
1431
1432 aCksum[0] = pLog->hdr.iCheck1;
1433 aCksum[1] = pLog->hdr.iCheck2;
1434
1435 /* Write the log file. */
1436 iFrame = pLog->hdr.iLastPg;
1437 for(p=pList; p; p=p->pDirty){
1438 u32 nDbsize; /* Db-size field for frame header */
1439 i64 iOffset; /* Write offset in log file */
1440
1441 iFrame++;
1442 iOffset = (iFrame-1) * (nPgsz+sizeof(aFrame));
1443
1444 /* Populate and write the frame header */
1445 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1446 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1447 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1448 if( rc!=SQLITE_OK ){
1449 return rc;
1450 }
1451
1452 /* Write the page data */
1453 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1454 if( rc!=SQLITE_OK ){
1455 return rc;
1456 }
1457 pLast = p;
1458 }
1459
1460 /* Sync the log file if the 'isSync' flag was specified. */
1461 if( isSync ){
1462#if 0
1463 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
1464 i64 iOffset = iFrame * (nPgsz+sizeof(aFrame));
1465
1466 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1467 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1468 }
1469 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1470 while( iOffset<iSegment ){
1471 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1472 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1473 if( rc!=SQLITE_OK ){
1474 return rc;
1475 }
1476
1477 iOffset += LOG_FRAME_HDRSIZE;
1478 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1479 if( rc!=SQLITE_OK ){
1480 return rc;
1481 }
1482 nLast++;
1483 iOffset += nPgsz;
1484 }
1485#endif
1486
1487 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1488 if( rc!=SQLITE_OK ){
1489 return rc;
1490 }
1491 }
1492
1493 /* Append data to the log summary. It is not necessary to lock the
1494 ** log-summary to do this as the RESERVED lock held on the db file
1495 ** guarantees that there are no other writers, and no data that may
1496 ** be in use by existing readers is being overwritten.
1497 */
1498 iFrame = pLog->hdr.iLastPg;
1499 for(p=pList; p; p=p->pDirty){
1500 iFrame++;
1501 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1502 }
1503 while( nLast>0 ){
1504 iFrame++;
1505 nLast--;
1506 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1507 }
1508
1509 /* Update the private copy of the header. */
1510 pLog->hdr.pgsz = nPgsz;
1511 pLog->hdr.iLastPg = iFrame;
1512 if( isCommit ){
1513 pLog->hdr.iChange++;
1514 pLog->hdr.nPage = nTruncate;
1515 }
1516 pLog->hdr.iCheck1 = aCksum[0];
1517 pLog->hdr.iCheck2 = aCksum[1];
1518
1519 /* If this is a commit, update the log-summary header too. */
1520 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1521 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1522 logLeaveMutex(pLog);
1523 }
1524
1525 return SQLITE_OK;
1526}
1527
1528/*
danb9bf16b2010-04-14 11:23:30 +00001529** Checkpoint the database:
1530**
1531** 1. Wait for an EXCLUSIVE lock on regions B and C.
1532** 2. Wait for an EXCLUSIVE lock on region A.
1533** 3. Copy the contents of the log into the database file.
1534** 4. Zero the log-summary header (so new readers will ignore the log).
1535** 5. Drop the locks obtained in steps 1 and 2.
dan7c246102010-04-12 19:00:29 +00001536*/
1537int sqlite3LogCheckpoint(
1538 Log *pLog, /* Log connection */
1539 sqlite3_file *pFd, /* File descriptor open on db file */
dan64d039e2010-04-13 19:27:31 +00001540 u8 *zBuf, /* Temporary buffer to use */
1541 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1542 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001543){
danb9bf16b2010-04-14 11:23:30 +00001544 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001545
dan39c79f52010-04-15 10:58:51 +00001546 assert( !pLog->isLocked );
1547
1548 /* Wait for an EXCLUSIVE lock on regions B and C. */
dan64d039e2010-04-13 19:27:31 +00001549 do {
1550 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1551 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1552 if( rc!=SQLITE_OK ) return rc;
1553
dan39c79f52010-04-15 10:58:51 +00001554 /* Wait for an EXCLUSIVE lock on region A. */
dan64d039e2010-04-13 19:27:31 +00001555 do {
1556 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1557 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001558 if( rc!=SQLITE_OK ){
1559 logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1560 return rc;
1561 }
dan64d039e2010-04-13 19:27:31 +00001562
danb9bf16b2010-04-14 11:23:30 +00001563 /* Copy data from the log to the database file. */
1564 rc = logSummaryReadHdr(pLog, 0);
1565 if( rc==SQLITE_OK ){
1566 rc = logCheckpoint(pLog, pFd, zBuf);
1567 }
1568
1569 /* Release the locks. */
dan64d039e2010-04-13 19:27:31 +00001570 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1571 return rc;
dan7c246102010-04-12 19:00:29 +00001572}
1573