blob: 7d37dfa1deab767b333ac139c25d07ed9f8fce06 [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
7#include "log.h"
8
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
12
13typedef struct LogSummaryHdr LogSummaryHdr;
14typedef struct LogSummary LogSummary;
15typedef struct LogCheckpoint LogCheckpoint;
16
17
18/*
19** The following structure may be used to store the same data that
20** is stored in the log-summary header.
21**
22** Member variables iCheck1 and iCheck2 contain the checksum for the
23** last frame written to the log, or 2 and 3 respectively if the log
24** is currently empty.
25*/
26struct LogSummaryHdr {
27 u32 iChange; /* Counter incremented each transaction */
28 u32 pgsz; /* Database page size in bytes */
29 u32 iLastPg; /* Address of last valid frame in log */
30 u32 nPage; /* Size of database in pages */
31 u32 iCheck1; /* Checkpoint value 1 */
32 u32 iCheck2; /* Checkpoint value 2 */
33};
34
/* Number of 32-bit fields in a serialized LogSummaryHdr object. */
#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

/*
** Number of 32-bit words occupied by the header fields plus the header
** checksum (LOG_CKSM_BYTES bytes) that follows them.
*/
#define LOGSUMMARY_FRAME_OFFSET \
  (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))

/* Size in bytes of the header that precedes each frame in the log file */
#define LOG_FRAME_HDRSIZE 20
43
44/*
45** There is one instance of this structure for each log-summary object
46** that this process has a connection to. They are stored in a linked
47** list starting at pLogSummary (global variable).
48**
49** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
50** directly in this implementation because the VFS does not support
51** the required blocking file-locks.
52*/
53struct LogSummary {
54 sqlite3_mutex *mutex; /* Mutex used to protect this object */
55 int nRef; /* Number of pointers to this structure */
56 int fd; /* File descriptor open on log-summary */
57 char *zPath; /* Path to associated WAL file */
58 LogSummary *pNext; /* Next in global list */
59 int nData; /* Size of aData allocation/mapping */
60 u32 *aData; /* File body */
61};
62
63/*
64** List of all LogSummary objects created by this process. Protected by
65** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
66** here instead of borrowing the LRU mutex.
67*/
68#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
69static LogSummary *pLogSummary = 0;
70
71struct Log {
72 LogSummary *pSummary; /* Log file summary data */
73 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
74 sqlite3_file *pFd; /* File handle for log file */
75 int sync_flags; /* Flags to use with OsSync() */
76 int isLocked; /* True if a snapshot is held open */
77 int isWriteLocked; /* True if this is the writer connection */
78 LogSummaryHdr hdr; /* Log summary header for current snapshot */
79};
80
81/*
82** This structure is used to implement an iterator that iterates through
83** all frames in the log in database page order. Where two or more frames
84** correspond to the same database page, the iterator visits only the
85** frame most recently written to the log.
86**
87** The internals of this structure are only accessed by:
88**
89** logCheckpointInit() - Create a new iterator,
90** logCheckpointNext() - Step an iterator,
91** logCheckpointFree() - Free an iterator.
92**
93** This functionality is used by the checkpoint code (see logCheckpoint()).
94*/
95struct LogCheckpoint {
96 int nSegment; /* Size of LogCheckpoint.aSummary[] array */
97 int nFinal; /* Elements in segment nSegment-1 */
98 struct LogSegment {
99 int iNext; /* Next aIndex index */
100 u8 *aIndex; /* Pointer to index array */
101 u32 *aDbPage; /* Pointer to db page array */
102 } aSegment[1];
103};
104
105/*
106** Generate an 8 byte checksum based on the data in array aByte[] and the
107** initial values of aCksum[0] and aCksum[1]. The checksum is written into
108** aCksum[] before returning.
109*/
110#define LOG_CKSM_BYTES 8
111static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
112 u32 *z32 = (u32 *)aByte;
113 int n32 = nByte / sizeof(u32);
114 int i;
115
116 assert( LOG_CKSM_BYTES==2*sizeof(u32) );
117 assert( (nByte&0x00000003)==0 );
118
119 u32 cksum0 = aCksum[0];
120 u32 cksum1 = aCksum[1];
121
122 for(i=0; i<n32; i++){
123 cksum0 = (cksum0 >> 8) + (cksum0 ^ z32[i]);
124 cksum1 = (cksum1 >> 8) + (cksum1 ^ z32[i]);
125 }
126
127 aCksum[0] = cksum0;
128 aCksum[1] = cksum1;
129}
130
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing any "./" or "../"
** elements in the path. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** Additionally:
**
**   * Trailing '/' characters are removed (a path consisting only of
**     '/' characters becomes "/").
**   * Runs of consecutive '/' characters are collapsed to one.
**   * A "." or ".." element at the very end of the path (not followed by
**     another element) is left as it is.
**   * A relative path stays relative: if "../" elements cancel every
**     element written so far, no leading '/' is emitted.
*/
static void logNormalizePath(char *zPath){
  char *z = zPath;
  int n = (int)strlen(z);
  int isAbs = (z[0]=='/');        /* True if the input is an absolute path */
  int i, j;                       /* Read (i) and write (j) offsets into z[] */

  while( n>1 && z[n-1]=='/' ){ n--; }
  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* Only look ahead within the first n bytes. Bytes at offset n and
      ** beyond are trimmed trailing slashes and must not be interpreted. */
      if( i+1<n && z[i+1]=='/' ) continue;
      if( i+2<n && z[i+1]=='.' && z[i+2]=='/' ){
        i += 1;
        continue;
      }
      if( i+3<n && z[i+1]=='.' && z[i+2]=='.' && z[i+3]=='/' ){
        /* Drop the most recently written element and its leading '/' */
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
      /* Never turn a relative path into an absolute one */
      if( j==0 && !isAbs ) continue;
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
166
167/*
168** Lock the summary file pSummary->fd.
169*/
170static int logSummaryLock(LogSummary *pSummary){
171 int rc;
172 struct flock f;
173 memset(&f, 0, sizeof(f));
174 f.l_type = F_WRLCK;
175 f.l_whence = SEEK_SET;
176 f.l_start = 0;
177 f.l_len = 1;
178 rc = fcntl(pSummary->fd, F_SETLKW, &f);
179 if( rc!=0 ){
180 return SQLITE_IOERR;
181 }
182 return SQLITE_OK;
183}
184
185/*
186** Unlock the summary file pSummary->fd.
187*/
188static int logSummaryUnlock(LogSummary *pSummary){
189 int rc;
190 struct flock f;
191 memset(&f, 0, sizeof(f));
192 f.l_type = F_UNLCK;
193 f.l_whence = SEEK_SET;
194 f.l_start = 0;
195 f.l_len = 1;
196 rc = fcntl(pSummary->fd, F_SETLK, &f);
197 if( rc!=0 ){
198 return SQLITE_IOERR;
199 }
200 return SQLITE_OK;
201}
202
203/*
204** Memory map the first nByte bytes of the summary file opened with
205** pSummary->fd at pSummary->aData. If the summary file is smaller than
206** nByte bytes in size when this function is called, ftruncate() is
207** used to expand it before it is mapped.
208**
209** It is assumed that an exclusive lock is held on the summary file
210** by the caller (to protect the ftruncate()).
211*/
212static int logSummaryMap(LogSummary *pSummary, int nByte){
213 struct stat sStat;
214 int rc;
215 int fd = pSummary->fd;
216 void *pMap;
217
218 assert( pSummary->aData==0 );
219
220 /* If the file is less than nByte bytes in size, cause it to grow. */
221 rc = fstat(fd, &sStat);
222 if( rc!=0 ) return SQLITE_IOERR;
223 if( sStat.st_size<nByte ){
224 rc = ftruncate(fd, nByte);
225 if( rc!=0 ) return SQLITE_IOERR;
226 }
227
228 /* Map the file. */
229 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
230 if( pMap==MAP_FAILED ){
231 return SQLITE_IOERR;
232 }
233 pSummary->aData = (u32 *)pMap;
234 pSummary->nData = nByte;
235
236 return SQLITE_OK;
237}
238
239/*
240** Unmap the log-summary mapping and close the file-descriptor. If
241** the isTruncate argument is non-zero, truncate the log-summary file
242** region to zero bytes.
243**
244** Regardless of the value of isTruncate, close the file-descriptor
245** opened on the log-summary file.
246*/
247static int logSummaryUnmap(LogSummary *pSummary, int isTruncate){
248 int rc = SQLITE_OK;
249 if( pSummary->aData ){
250 assert( pSummary->fd>0 );
251 munmap(pSummary->aData, pSummary->nData);
252 pSummary->aData = 0;
253 if( isTruncate ){
254 rc = (ftruncate(pSummary->fd, 0) ? SQLITE_IOERR : SQLITE_OK);
255 }
256 }
257 if( pSummary->fd>0 ){
258 close(pSummary->fd);
259 pSummary->fd = -1;
260 }
261 return rc;
262}
263
264
265static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
266 u32 *aData = pSummary->aData;
267 memcpy(aData, pHdr, sizeof(LogSummaryHdr));
268 aData[LOGSUMMARY_HDR_NFIELD] = 1;
269 aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
270 logChecksumBytes(
271 (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
272 );
273}
274
275/*
276** This function encodes a single frame header and writes it to a buffer
277** supplied by the caller. A log frame-header is made up of a series of
278** 4-byte big-endian integers, as follows:
279**
280** 0: Database page size in bytes.
281** 4: Page number.
282** 8: New database size (for commit frames, otherwise zero).
283** 12: Frame checksum 1.
284** 16: Frame checksum 2.
285*/
286static void logEncodeFrame(
287 u32 *aCksum, /* IN/OUT: Checksum values */
288 u32 iPage, /* Database page number for frame */
289 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
290 int nData, /* Database page size (size of aData[]) */
291 u8 *aData, /* Pointer to page data (for checksum) */
292 u8 *aFrame /* OUT: Write encoded frame here */
293){
294 assert( LOG_FRAME_HDRSIZE==20 );
295
296 sqlite3Put4byte(&aFrame[0], nData);
297 sqlite3Put4byte(&aFrame[4], iPage);
298 sqlite3Put4byte(&aFrame[8], nTruncate);
299
300 logChecksumBytes(aFrame, 12, aCksum);
301 logChecksumBytes(aData, nData, aCksum);
302
303 sqlite3Put4byte(&aFrame[12], aCksum[0]);
304 sqlite3Put4byte(&aFrame[16], aCksum[1]);
305}
306
307/*
308** Return 1 and populate *piPage, *pnTruncate and aCksum if the
309** frame checksum looks Ok. Otherwise return 0.
310*/
311static int logDecodeFrame(
312 u32 *aCksum, /* IN/OUT: Checksum values */
313 u32 *piPage, /* OUT: Database page number for frame */
314 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
315 int nData, /* Database page size (size of aData[]) */
316 u8 *aData, /* Pointer to page data (for checksum) */
317 u8 *aFrame /* Frame data */
318){
319 logChecksumBytes(aFrame, 12, aCksum);
320 logChecksumBytes(aData, nData, aCksum);
321
322 if( aCksum[0]!=sqlite3Get4byte(&aFrame[12])
323 || aCksum[1]!=sqlite3Get4byte(&aFrame[16])
324 ){
325 /* Checksum failed. */
326 return 0;
327 }
328
329 *piPage = sqlite3Get4byte(&aFrame[4]);
330 *pnTruncate = sqlite3Get4byte(&aFrame[8]);
331 return 1;
332}
333
334static void logMergesort8(
335 Pgno *aContent, /* Pages in log */
336 u8 *aBuffer, /* Buffer of at least *pnList items to use */
337 u8 *aList, /* IN/OUT: List to sort */
338 int *pnList /* IN/OUT: Number of elements in aList[] */
339){
340 int nList = *pnList;
341 if( nList>1 ){
342 int nLeft = nList / 2; /* Elements in left list */
343 int nRight = nList - nLeft; /* Elements in right list */
344 u8 *aLeft = aList; /* Left list */
345 u8 *aRight = &aList[nLeft]; /* Right list */
346 int iLeft = 0; /* Current index in aLeft */
347 int iRight = 0; /* Current index in aright */
348 int iOut = 0; /* Current index in output buffer */
349
350 /* TODO: Change to non-recursive version. */
351 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
352 logMergesort8(aContent, aBuffer, aRight, &nRight);
353
354 while( iRight<nRight || iLeft<nLeft ){
355 u8 logpage;
356 Pgno dbpage;
357
358 if( (iLeft<nLeft)
359 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
360 ){
361 logpage = aLeft[iLeft++];
362 }else{
363 logpage = aRight[iRight++];
364 }
365 dbpage = aContent[logpage];
366
367 aBuffer[iOut++] = logpage;
368 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
369
370 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
371 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
372 }
373 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
374 *pnList = iOut;
375 }
376
377#ifdef SQLITE_DEBUG
378 {
379 int i;
380 for(i=1; i<*pnList; i++){
381 assert( aContent[aList[i]] > aContent[aList[i-1]] );
382 }
383 }
384#endif
385}
386
387
388/*
389** Return the index in the LogSummary.aData array that corresponds to
390** frame iFrame. The log-summary file consists of a header, followed by
391** alternating "map" and "index" blocks.
392*/
393static int logSummaryEntry(u32 iFrame){
394 return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
395}
396
397
398/*
399** Set an entry in the log-summary map to map log frame iFrame to db
400** page iPage. Values are always appended to the log-summary (i.e. the
401** value of iFrame is always exactly one more than the value passed to
402** the previous call), but that restriction is not enforced or asserted
403** here.
404*/
405static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
406 u32 iSlot = logSummaryEntry(iFrame);
407
408 /* Set the log-summary entry itself */
409 pSummary->aData[iSlot] = iPage;
410
411 /* If the frame number is a multiple of 256 (frames are numbered starting
412 ** at 1), build an index of the most recently added 256 frames.
413 */
414 if( (iFrame&0x000000FF)==0 ){
415 int i; /* Iterator used while initializing aIndex */
416 u32 *aFrame; /* Pointer to array of 256 frames */
417 int nIndex; /* Number of entries in index */
418 u8 *aIndex; /* 256 bytes to build index in */
419 u8 *aTmp; /* Scratch space to use while sorting */
420
421 aFrame = &pSummary->aData[iSlot-255];
422 aIndex = (u8 *)&pSummary->aData[iSlot+1];
423 aTmp = &aIndex[256];
424
425 nIndex = 256;
426 for(i=0; i<256; i++) aIndex[i] = (u8)i;
427 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
428 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
429 }
430}
431
432
433/*
434** Recover the log-summary by reading the log file. The caller must hold
435** an exclusive lock on the log-summary file.
436*/
437static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
438 int rc; /* Return Code */
439 i64 nSize; /* Size of log file */
440 LogSummaryHdr hdr; /* Recovered log-summary header */
441
442 memset(&hdr, 0, sizeof(hdr));
443
444 rc = sqlite3OsFileSize(pFd, &nSize);
445 if( rc!=SQLITE_OK ){
446 return rc;
447 }
448
449 if( nSize>LOG_FRAME_HDRSIZE ){
450 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
451 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
452 int nFrame; /* Number of bytes at aFrame */
453 u8 *aData; /* Pointer to data part of aFrame buffer */
454 int iFrame; /* Index of last frame read */
455 i64 iOffset; /* Next offset to read from log file */
456 int nPgsz; /* Page size according to the log */
457 u32 aCksum[2] = {2, 3}; /* Running checksum */
458
459 /* Read in the first frame header in the file (to determine the
460 ** database page size).
461 */
462 rc = sqlite3OsRead(pFd, aBuf, LOG_FRAME_HDRSIZE, 0);
463 if( rc!=SQLITE_OK ){
464 return rc;
465 }
466
467 /* If the database page size is not a power of two, or is greater than
468 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
469 */
470 nPgsz = sqlite3Get4byte(&aBuf[0]);
471 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
472 goto finished;
473 }
474
475 /* Malloc a buffer to read frames into. */
476 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
477 aFrame = (u8 *)sqlite3_malloc(nFrame);
478 if( !aFrame ){
479 return SQLITE_NOMEM;
480 }
481 aData = &aFrame[LOG_FRAME_HDRSIZE];
482
483 /* Read all frames from the log file. */
484 iFrame = 0;
485 iOffset = 0;
486 for(iOffset=0; (iOffset+nFrame)<nSize; iOffset+=nFrame){
487 u32 pgno; /* Database page number for frame */
488 u32 nTruncate; /* dbsize field from frame header */
489 int isValid; /* True if this frame is valid */
490
491 /* Read and decode the next log frame. */
492 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
493 if( rc!=SQLITE_OK ) break;
494 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
495 if( !isValid ) break;
496 logSummaryAppend(pSummary, ++iFrame, pgno);
497
498 /* If nTruncate is non-zero, this is a commit record. */
499 if( nTruncate ){
500 hdr.iCheck1 = aCksum[0];
501 hdr.iCheck2 = aCksum[1];
502 hdr.iLastPg = iFrame;
503 hdr.nPage = nTruncate;
504 hdr.pgsz = nPgsz;
505 }
506 }
507
508 sqlite3_free(aFrame);
509 }else{
510 hdr.iCheck1 = 2;
511 hdr.iCheck2 = 3;
512 }
513
514finished:
515 logSummaryWriteHdr(pSummary, &hdr);
516 return rc;
517}
518
519
520/*
521** This function intializes the connection to the log-summary identified
522** by struct pSummary.
523*/
524static int logSummaryInit(LogSummary *pSummary, sqlite3_file *pFd){
525 int rc; /* Return Code */
526 char *zFile; /* File name for summary file */
527
528 assert( pSummary->fd<0 );
529 assert( pSummary->aData==0 );
530 assert( pSummary->nRef>0 );
531 assert( pSummary->zPath );
532
533 /* Open a file descriptor on the summary file. */
534 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
535 if( !zFile ){
536 return SQLITE_NOMEM;
537 }
538 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
539 sqlite3_free(zFile);
540 if( pSummary->fd<0 ){
541 return SQLITE_IOERR;
542 }
543
544 /* Grab an exclusive lock the summary file. Then mmap() it. TODO: This
545 ** code needs to be enhanced to support a growable mapping. For now, just
546 ** make the mapping very large to start with.
547 */
548 rc = logSummaryLock(pSummary);
549 if( rc!=SQLITE_OK ) return rc;
550 rc = logSummaryMap(pSummary, 512*1024);
551 if( rc!=SQLITE_OK ) goto out;
552
553 /* Grab a SHARED lock on the log file. Then try to upgrade to an EXCLUSIVE
554 ** lock. If successful, then this is the first (and only) connection to
555 ** the database. In this case assume the contents of the log-summary
556 ** cannot be trusted. Zero the log-summary header to make sure.
557 **
558 ** The SHARED lock on the log file is not released until the connection
559 ** to the database is closed.
560 */
561 rc = sqlite3OsLock(pFd, SQLITE_LOCK_SHARED);
562 if( rc!=SQLITE_OK ) goto out;
563 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
564 if( rc==SQLITE_OK ){
565 /* This is the first and only connection. */
566 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
567 rc = sqlite3OsUnlock(pFd, SQLITE_LOCK_SHARED);
568 }else if( rc==SQLITE_BUSY ){
569 rc = SQLITE_OK;
570 }
571
572 out:
573 logSummaryUnlock(pSummary);
574 return rc;
575}
576
577/*
578** Open a connection to the log file associated with database zDb. The
579** database file does not actually have to exist. zDb is used only to
580** figure out the name of the log file to open. If the log file does not
581** exist it is created by this call.
582*/
583int sqlite3LogOpen(
584 sqlite3_vfs *pVfs, /* vfs module to open log file with */
585 const char *zDb, /* Name of database file */
586 Log **ppLog /* OUT: Allocated Log handle */
587){
588 int rc; /* Return Code */
589 Log *pRet; /* Object to allocate and return */
590 LogSummary *pSummary = 0; /* Summary object */
591 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
592 int flags; /* Flags passed to OsOpen() */
593 char *zWal = 0; /* Path to WAL file */
594 int nWal; /* Length of zWal in bytes */
595
596 /* Zero output variables */
597 assert( zDb );
598 *ppLog = 0;
599
600 /* Allocate an instance of struct Log to return. */
601 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
602 if( !pRet ) goto out;
603 pRet->pVfs = pVfs;
604 pRet->pFd = (sqlite3_file *)&pRet[1];
605 pRet->sync_flags = SQLITE_SYNC_NORMAL;
606
607 /* Normalize the path name. */
608 zWal = sqlite3_mprintf("%s-wal", zDb);
609 if( !zWal ) goto out;
610 logNormalizePath(zWal);
611 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_DB);
612 nWal = sqlite3Strlen30(zWal);
613
614 /* Enter the mutex that protects the linked-list of LogSummary structures */
615 if( sqlite3GlobalConfig.bCoreMutex ){
616 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
617 }
618 sqlite3_mutex_enter(mutex);
619
620 /* Search for an existing log summary object in the linked list. If one
621 ** cannot be found, allocate and initialize a new object.
622 */
623 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
624 int nPath = sqlite3Strlen30(pSummary->zPath);
625 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
626 }
627 if( !pSummary ){
628 int nByte = sizeof(LogSummary) + nWal + 1;
629 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
630 if( !pSummary ){
631 rc = SQLITE_NOMEM;
632 goto out;
633 }
634 if( sqlite3GlobalConfig.bCoreMutex ){
635 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
636 }
637 pSummary->zPath = (char *)&pSummary[1];
638 pSummary->fd = -1;
639 memcpy(pSummary->zPath, zWal, nWal);
640 pSummary->pNext = pLogSummary;
641 pLogSummary = pSummary;
642 }
643 pSummary->nRef++;
644 pRet->pSummary = pSummary;
645
646 /* Exit the mutex protecting the linked-list of LogSummary objects. */
647 sqlite3_mutex_leave(mutex);
648 mutex = 0;
649
650 /* Open file handle on the log file. */
651 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
652 if( rc!=SQLITE_OK ) goto out;
653
654 /* Object pSummary is shared between all connections to the database made
655 ** by this process. So at this point it may or may not be connected to
656 ** the log-summary. If it is not, connect it. Otherwise, just take the
657 ** SHARED lock on the log file.
658 */
659 sqlite3_mutex_enter(pSummary->mutex);
660 mutex = pSummary->mutex;
661 if( pSummary->fd<0 ){
662 rc = logSummaryInit(pSummary, pRet->pFd);
663 }else{
664 rc = sqlite3OsLock(pRet->pFd, SQLITE_LOCK_SHARED);
665 }
666
667 out:
668 sqlite3_mutex_leave(mutex);
669 sqlite3_free(zWal);
670 if( rc!=SQLITE_OK ){
671 assert(0);
672 if( pRet ){
673 sqlite3OsClose(pRet->pFd);
674 sqlite3_free(pRet);
675 }
676 assert( !pSummary || pSummary->nRef==0 );
677 sqlite3_free(pSummary);
678 }
679 *ppLog = pRet;
680 return rc;
681}
682
683static int logCheckpointNext(
684 LogCheckpoint *p, /* Iterator */
685 u32 *piPage, /* OUT: Next db page to write */
686 u32 *piFrame /* OUT: Log frame to read from */
687){
688 u32 iMin = *piPage;
689 u32 iRet = 0xFFFFFFFF;
690 int i;
691 int nBlock = p->nFinal;
692
693 for(i=p->nSegment-1; i>=0; i--){
694 struct LogSegment *pSegment = &p->aSegment[i];
695 while( pSegment->iNext<nBlock ){
696 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
697 if( iPg>iMin ){
698 if( iPg<iRet ){
699 iRet = iPg;
700 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
701 }
702 break;
703 }
704 pSegment->iNext++;
705 }
706
707 nBlock = 256;
708 }
709
710 *piPage = iRet;
711 return (iRet==0xFFFFFFFF);
712}
713
714static LogCheckpoint *logCheckpointInit(Log *pLog){
715 u32 *aData = pLog->pSummary->aData;
716 LogCheckpoint *p; /* Return value */
717 int nSegment; /* Number of segments to merge */
718 u32 iLast; /* Last frame in log */
719 int nByte; /* Number of bytes to allocate */
720 int i; /* Iterator variable */
721 int nFinal; /* Number of unindexed entries */
722 struct LogSegment *pFinal; /* Final (unindexed) segment */
723 u8 *aTmp; /* Temp space used by merge-sort */
724
725 iLast = pLog->hdr.iLastPg;
726 nSegment = (iLast >> 8) + 1;
727 nFinal = (iLast & 0x000000FF);
728
729 nByte = sizeof(LogCheckpoint) + (nSegment-1)*sizeof(struct LogSegment) + 512;
730 p = (LogCheckpoint *)sqlite3_malloc(nByte);
731 if( p ){
732 memset(p, 0, nByte);
733 p->nSegment = nSegment;
734 p->nFinal = nFinal;
735 }
736
737 for(i=0; i<nSegment-1; i++){
738 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
739 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
740 }
741 pFinal = &p->aSegment[nSegment-1];
742
743 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
744 pFinal->aIndex = (u8 *)&pFinal[1];
745 aTmp = &pFinal->aIndex[256];
746 for(i=0; i<nFinal; i++){
747 pFinal->aIndex[i] = i;
748 }
749 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
750 p->nFinal = nFinal;
751
752 return p;
753}
754
755/*
756** Free a log iterator allocated by logCheckpointInit().
757*/
758static void logCheckpointFree(LogCheckpoint *p){
759 sqlite3_free(p);
760}
761
762/*
763** Checkpoint the contents of the log file.
764*/
765static int logCheckpoint(
766 Log *pLog, /* Log connection */
767 sqlite3_file *pFd, /* File descriptor open on db file */
768 u8 *zBuf /* Temporary buffer to use */
769){
770 int rc; /* Return code */
771 int pgsz = pLog->hdr.pgsz; /* Database page-size */
772 LogCheckpoint *pIter = 0; /* Log iterator context */
773 u32 iDbpage = 0; /* Next database page to write */
774 u32 iFrame; /* Log frame containing data for iDbpage */
775
776 /* Allocate the iterator */
777 pIter = logCheckpointInit(pLog);
778 if( !pIter ) return SQLITE_NOMEM;
779
780 /* Sync the log file to disk */
781 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
782 if( rc!=SQLITE_OK ) goto out;
783
784 /* Iterate through the contents of the log, copying data to the db file. */
785 while( 0==logCheckpointNext(pIter, &iDbpage, &iFrame) ){
786 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
787 (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
788 );
789 if( rc!=SQLITE_OK ) goto out;
790 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
791 if( rc!=SQLITE_OK ) goto out;
792 }
793
794 /* Truncate the database file */
795 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
796 if( rc!=SQLITE_OK ) goto out;
797
798 /* Sync the database file. If successful, update the log-summary. */
799 rc = sqlite3OsSync(pFd, pLog->sync_flags);
800 if( rc!=SQLITE_OK ) goto out;
801 pLog->hdr.iLastPg = 0;
802 pLog->hdr.iCheck1 = 2;
803 pLog->hdr.iCheck2 = 3;
804 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
805
806 /* TODO: If a crash occurs and the current log is copied into the
807 ** database there is no problem. However, if a crash occurs while
808 ** writing the next transaction into the start of the log, such that:
809 **
810 ** * The first transaction currently in the log is left intact, but
811 ** * The second (or subsequent) transaction is damaged,
812 **
813 ** then the database could become corrupt.
814 **
815 ** The easiest thing to do would be to write and sync a dummy header
816 ** into the log at this point. Unfortunately, that turns out to be
817 ** an unwelcome performance hit. Alternatives are...
818 */
819#if 0
820 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
821 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
822 if( rc!=SQLITE_OK ) goto out;
823 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
824#endif
825
826 out:
827 logCheckpointFree(pIter);
828 return rc;
829}
830
831/*
832** Close a connection to a log file.
833*/
834int sqlite3LogClose(
835 Log *pLog, /* Log to close */
836 sqlite3_file *pFd, /* Database file */
837 u8 *zBuf /* Buffer of at least page-size bytes */
838){
839 int rc = SQLITE_OK;
840 if( pLog ){
841 LogSummary *pSummary = pLog->pSummary;
842 sqlite3_mutex *mutex = 0;
843
844 if( sqlite3GlobalConfig.bCoreMutex ){
845 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
846 }
847 sqlite3_mutex_enter(mutex);
848
849 /* Decrement the reference count on the log summary. If this is the last
850 ** reference to the log summary object in this process, the object will
851 ** be freed. If this is also the last connection to the database, then
852 ** checkpoint the database and truncate the log and log-summary files
853 ** to zero bytes in size.
854 **/
855 pSummary->nRef--;
856 if( pSummary->nRef==0 ){
857 LogSummary **pp;
858
859 rc = logSummaryLock(pSummary);
860 if( rc==SQLITE_OK ){
861 int isTruncate = 0;
862 int rc2 = sqlite3OsLock(pLog->pFd, SQLITE_LOCK_EXCLUSIVE);
863 if( rc2==SQLITE_OK ){
864 /* This is the last connection to the database (including other
865 ** processes). Do three things:
866 **
867 ** 1. Checkpoint the db.
868 ** 2. Truncate the log file to zero bytes.
869 ** 3. Truncate the log-summary file to zero bytes.
870 */
871 rc2 = logCheckpoint(pLog, pFd, zBuf);
872 if( rc2==SQLITE_OK ){
873 rc2 = sqlite3OsTruncate(pLog->pFd, 0);
874 }
875 isTruncate = 1;
876 }else if( rc2==SQLITE_BUSY ){
877 rc2 = SQLITE_OK;
878 }
879 logSummaryUnmap(pSummary, isTruncate);
880 sqlite3OsUnlock(pLog->pFd, SQLITE_LOCK_NONE);
881 rc = logSummaryUnlock(pSummary);
882 if( rc2!=SQLITE_OK ) rc = rc2;
883 }
884
885 /* Remove the LogSummary object from the global list. Then free the
886 ** mutex and the object itself.
887 */
888 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
889 *pp = (*pp)->pNext;
890 sqlite3_mutex_free(pSummary->mutex);
891 sqlite3_free(pSummary);
892 }
893
894 sqlite3_mutex_leave(mutex);
895
896 /* Close the connection to the log file and free the Log handle. */
897 sqlite3OsClose(pLog->pFd);
898 sqlite3_free(pLog);
899 }
900 return rc;
901}
902
/*
** Configure the flags passed to sqlite3OsSync() when the log file is
** synced. Only SQLITE_SYNC_NORMAL and SQLITE_SYNC_FULL are accepted.
**
** This function is currently compiled out.
*/
#if 0
void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
  assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL );
  pLog->sync_flags = sync_flags;
}
#endif
913
914/*
915** Enter and leave the log-summary mutex. In this context, entering the
916** log-summary mutex means:
917**
918** 1. Obtaining mutex pLog->pSummary->mutex, and
919** 2. Taking an exclusive lock on the log-summary file.
920**
921** i.e. this mutex locks out other processes as well as other threads
922** hosted in this address space.
923*/
924static int logEnterMutex(Log *pLog){
925 LogSummary *pSummary = pLog->pSummary;
926 int rc;
927
928 sqlite3_mutex_enter(pSummary->mutex);
929 rc = logSummaryLock(pSummary);
930 if( rc!=SQLITE_OK ){
931 sqlite3_mutex_leave(pSummary->mutex);
932 }
933 return rc;
934}
935static void logLeaveMutex(Log *pLog){
936 LogSummary *pSummary = pLog->pSummary;
937 logSummaryUnlock(pSummary);
938 sqlite3_mutex_leave(pSummary->mutex);
939}
940
941/*
942** The caller must hold a SHARED lock on the database file.
943**
944** If this call obtains a new read-lock and the database contents have been
945** modified since the most recent call to LogCloseSnapshot() on this Log
946** connection, then *pChanged is set to 1 before returning. Otherwise, it
947** is left unmodified. This is used by the pager layer to determine whether
948** or not any cached pages may be safely reused.
949*/
950int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
951 int rc = SQLITE_OK;
952 if( pLog->isLocked==0 ){
953 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
954 u32 aCksum[2] = {1, 1};
955 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
956 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
957
958 /* Verify the checksum on the log-summary header. If it fails,
959 ** recover the log-summary from the log file.
960 */
961 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
962 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
963 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
964 ){
965 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
966 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
967 *pChanged = 1;
968 }
969 if( rc==SQLITE_OK ){
970 pLog->isLocked = 1;
971 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
972 *pChanged = 1;
973 memcpy(&pLog->hdr, aHdr, LOGSUMMARY_HDR_NFIELD*sizeof(u32));
974 }
975 }
976 logLeaveMutex(pLog);
977 }
978 }
979 return rc;
980}
981
982/*
983** Unlock the current snapshot.
984*/
985void sqlite3LogCloseSnapshot(Log *pLog){
986 pLog->isLocked = 0;
987}
988
989
990
991/*
992** Read a page from the log, if it is present.
993*/
994int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
995 u32 iRead = 0;
996 u32 *aData = pLog->pSummary->aData;
997 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
998
999 /* Do a linear search of the unindexed block of page-numbers (if any)
1000 ** at the end of the log-summary. An alternative to this would be to
1001 ** build an index in private memory each time a read transaction is
1002 ** opened on a new snapshot.
1003 */
1004 if( pLog->hdr.iLastPg ){
1005 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1006 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1007 while( *pi!=pgno && pi!=piStop ) pi--;
1008 if( pi!=piStop ){
1009 iRead = (pi-piStop) + iFrame;
1010 }
1011 }
1012 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1013
1014 while( iRead==0 && iFrame>0 ){
1015 int iLow = 0;
1016 int iHigh = 255;
1017 u32 *aFrame;
1018 u8 *aIndex;
1019
1020 iFrame -= 256;
1021 aFrame = &aData[logSummaryEntry(iFrame+1)];
1022 aIndex = (u8 *)&aFrame[256];
1023
1024 while( iLow<=iHigh ){
1025 int iTest = (iLow+iHigh)>>1;
1026 u32 iPg = aFrame[aIndex[iTest]];
1027
1028 if( iPg==pgno ){
1029 iRead = iFrame + 1 + aIndex[iTest];
1030 break;
1031 }
1032 else if( iPg<pgno ){
1033 iLow = iTest+1;
1034 }else{
1035 iHigh = iTest-1;
1036 }
1037 }
1038 }
1039 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1040
1041 /* If iRead is non-zero, then it is the log frame number that contains the
1042 ** required page. Read and return data from the log file.
1043 */
1044 if( iRead ){
1045 i64 iOffset = (iRead-1) * (pLog->hdr.pgsz+LOG_FRAME_HDRSIZE);
1046 iOffset += LOG_FRAME_HDRSIZE;
1047 *pInLog = 1;
1048 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1049 }
1050
1051 *pInLog = 0;
1052 return SQLITE_OK;
1053}
1054
1055
1056/*
1057** Set *pPgno to the size of the database file (or zero, if unknown).
1058*/
1059void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
1060 assert( pLog->isLocked );
1061 *pPgno = pLog->hdr.nPage;
1062}
1063
1064/*
1065** The caller must hold at least a RESERVED lock on the database file
1066** when invoking this function.
1067**
1068** This function returns SQLITE_OK if the caller may write to the database.
1069** Otherwise, if the caller is operating on a snapshot that has already
1070** been overwritten by another writer, SQLITE_OBE is returned.
1071*/
1072int sqlite3LogWriteLock(Log *pLog, int op){
1073 assert( pLog->isLocked );
1074 if( op ){
1075 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
1076 return SQLITE_BUSY;
1077 }
1078 pLog->isWriteLocked = 1;
1079 }else if( pLog->isWriteLocked ){
1080 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1081 pLog->isWriteLocked = 0;
1082 }
1083 return SQLITE_OK;
1084}
1085
1086/*
1087** Write a set of frames to the log. The caller must hold at least a
1088** RESERVED lock on the database file.
1089*/
1090int sqlite3LogFrames(
1091 Log *pLog, /* Log handle to write to */
1092 int nPgsz, /* Database page-size in bytes */
1093 PgHdr *pList, /* List of dirty pages to write */
1094 Pgno nTruncate, /* Database size after this commit */
1095 int isCommit, /* True if this is a commit */
1096 int isSync /* True to sync the log file */
1097){
1098 /* Each frame has a 20 byte header, as follows:
1099 **
1100 ** + Pseudo-random salt (4 bytes)
1101 ** + Page number (4 bytes)
1102 ** + New database size, or 0 if not a commit frame (4 bytes)
1103 ** + Checksum (CHECKSUM_BYTES bytes);
1104 **
1105 ** The checksum is computed based on the following:
1106 **
1107 ** + The previous checksum, or {2, 3} for the first frame in the log.
1108 ** + The non-checksum fields of the frame header, and
1109 ** + The frame contents (page data).
1110 **
1111 ** This format must also be understood by the code in logSummaryRecover().
1112 ** The size of the frame header is used by LogRead() and LogCheckpoint().
1113 */
1114 int rc; /* Used to catch return codes */
1115 u32 iFrame; /* Next frame address */
1116 u8 aFrame[LOG_FRAME_HDRSIZE];
1117 PgHdr *p; /* Iterator to run through pList with. */
1118 u32 aCksum[2];
1119
1120 PgHdr *pLast; /* Last frame in list */
1121 int nLast = 0; /* Number of extra copies of last page */
1122
1123 assert( LOG_FRAME_HDRSIZE==(4 * 3 + LOG_CKSM_BYTES) );
1124 assert( pList );
1125
1126 aCksum[0] = pLog->hdr.iCheck1;
1127 aCksum[1] = pLog->hdr.iCheck2;
1128
1129 /* Write the log file. */
1130 iFrame = pLog->hdr.iLastPg;
1131 for(p=pList; p; p=p->pDirty){
1132 u32 nDbsize; /* Db-size field for frame header */
1133 i64 iOffset; /* Write offset in log file */
1134
1135 iFrame++;
1136 iOffset = (iFrame-1) * (nPgsz+sizeof(aFrame));
1137
1138 /* Populate and write the frame header */
1139 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1140 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1141 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1142 if( rc!=SQLITE_OK ){
1143 return rc;
1144 }
1145
1146 /* Write the page data */
1147 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1148 if( rc!=SQLITE_OK ){
1149 return rc;
1150 }
1151 pLast = p;
1152 }
1153
1154 /* Sync the log file if the 'isSync' flag was specified. */
1155 if( isSync ){
1156#if 0
1157 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
1158 i64 iOffset = iFrame * (nPgsz+sizeof(aFrame));
1159
1160 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1161 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1162 }
1163 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1164 while( iOffset<iSegment ){
1165 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1166 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1167 if( rc!=SQLITE_OK ){
1168 return rc;
1169 }
1170
1171 iOffset += LOG_FRAME_HDRSIZE;
1172 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1173 if( rc!=SQLITE_OK ){
1174 return rc;
1175 }
1176 nLast++;
1177 iOffset += nPgsz;
1178 }
1179#endif
1180
1181 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1182 if( rc!=SQLITE_OK ){
1183 return rc;
1184 }
1185 }
1186
1187 /* Append data to the log summary. It is not necessary to lock the
1188 ** log-summary to do this as the RESERVED lock held on the db file
1189 ** guarantees that there are no other writers, and no data that may
1190 ** be in use by existing readers is being overwritten.
1191 */
1192 iFrame = pLog->hdr.iLastPg;
1193 for(p=pList; p; p=p->pDirty){
1194 iFrame++;
1195 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1196 }
1197 while( nLast>0 ){
1198 iFrame++;
1199 nLast--;
1200 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1201 }
1202
1203 /* Update the private copy of the header. */
1204 pLog->hdr.pgsz = nPgsz;
1205 pLog->hdr.iLastPg = iFrame;
1206 if( isCommit ){
1207 pLog->hdr.iChange++;
1208 pLog->hdr.nPage = nTruncate;
1209 }
1210 pLog->hdr.iCheck1 = aCksum[0];
1211 pLog->hdr.iCheck2 = aCksum[1];
1212
1213 /* If this is a commit, update the log-summary header too. */
1214 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1215 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1216 logLeaveMutex(pLog);
1217 }
1218
1219 return SQLITE_OK;
1220}
1221
1222/*
1223** Checkpoint the database. When this function is called the caller
1224** must hold an exclusive lock on the database file.
1225*/
1226int sqlite3LogCheckpoint(
1227 Log *pLog, /* Log connection */
1228 sqlite3_file *pFd, /* File descriptor open on db file */
1229 u8 *zBuf /* Temporary buffer to use */
1230){
1231
1232 /* Assert() that the caller is holding an EXCLUSIVE lock on the
1233 ** database file.
1234 */
1235#ifdef SQLITE_DEBUG
1236 int lock;
1237 sqlite3OsFileControl(pFd, SQLITE_FCNTL_LOCKSTATE, &lock);
1238 assert( lock>=4 );
1239#endif
1240
1241 return logCheckpoint(pLog, pFd, zBuf);
1242}
1243