blob: 112b907f4c6ff593f2ac09d71ba518a3085e0f50 [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
7#include "log.h"
8
9#include <unistd.h>
10#include <fcntl.h>
11#include <sys/mman.h>
12
13typedef struct LogSummaryHdr LogSummaryHdr;
14typedef struct LogSummary LogSummary;
15typedef struct LogCheckpoint LogCheckpoint;
dan64d039e2010-04-13 19:27:31 +000016typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000017
18
/*
** The following structure may be used to store the same data that
** is stored in the log-summary header.
**
** Member variables iCheck1 and iCheck2 contain the checksum for the
** last frame written to the log, or 2 and 3 respectively if the log
** is currently empty.
**
** The structure is copied byte-for-byte to the start of the mapped
** log-summary by logSummaryWriteHdr(), and LOGSUMMARY_HDR_NFIELD is
** derived from sizeof(LogSummaryHdr)/sizeof(u32), so every member is a
** u32 and the member order is part of the log-summary layout.
*/
struct LogSummaryHdr {
  u32 iChange;                    /* Counter incremented each transaction */
  u32 pgsz;                       /* Database page size in bytes */
  u32 iLastPg;                    /* Address of last valid frame in log */
  u32 nPage;                      /* Size of database in pages */
  u32 iCheck1;                    /* Checkpoint value 1 */
  u32 iCheck2;                    /* Checkpoint value 2 */
};
35
/* Number of u32 fields in a serialized LogSummaryHdr object. */
37#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))
38
39#define LOGSUMMARY_FRAME_OFFSET \
40 (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))
41
42/* Size of frame header */
43#define LOG_FRAME_HDRSIZE 20
44
/*
** There is one instance of this structure for each log-summary object
** that this process has a connection to. They are stored in a linked
** list starting at pLogSummary (global variable).
**
** The object is reference counted (nRef) and shared by every Log handle
** in this process opened on the same WAL path. sqlite3LogOpen() allocates
** it with zPath pointing at storage placed directly after the structure
** and with fd set to -1; fd stays negative until logSummaryInit() has
** opened the log-summary file.
**
** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
** directly in this implementation because the VFS does not support
** the required blocking file-locks.
*/
struct LogSummary {
  sqlite3_mutex *mutex;           /* Mutex used to protect this object */
  int nRef;                       /* Number of pointers to this structure */
  int fd;                         /* File descriptor open on log-summary */
  char *zPath;                    /* Path to associated WAL file */
  LogLock *pLock;                 /* Linked list of locks on this object */
  LogSummary *pNext;              /* Next in global list */
  int nData;                      /* Size of aData allocation/mapping */
  u32 *aData;                     /* File body (0 while not mapped) */
};
64
dan64d039e2010-04-13 19:27:31 +000065
dan7c246102010-04-12 19:00:29 +000066/*
dan64d039e2010-04-13 19:27:31 +000067** The four lockable regions associated with each log-summary. A connection
68** may take either a SHARED or EXCLUSIVE lock on each.
dan7c246102010-04-12 19:00:29 +000069*/
dan64d039e2010-04-13 19:27:31 +000070#define LOG_REGION_A 0x01
71#define LOG_REGION_B 0x02
72#define LOG_REGION_C 0x04
73#define LOG_REGION_D 0x08
74
/*
** A single instance of this structure is allocated as part of each
** connection to a database log (it is the Log.lock member). All
** structures associated with the same log file are linked together into
** a list using LogLock.pNext starting at LogSummary.pLock. The entry is
** linked in by sqlite3LogOpen() and unlinked by sqlite3LogClose().
**
** The mLock field of the structure describes the locks (if any)
** currently held by the connection. If a SHARED lock is held on
** any of the four locking regions, then the associated LOG_REGION_X
** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
** then the (LOG_REGION_X << 8) bit is set.
*/
struct LogLock {
  LogLock *pNext;                 /* Next lock on the same log */
  u32 mLock;                      /* Mask of locks */
};
dan7c246102010-04-12 19:00:29 +000091
/*
** A handle on a log file, as allocated by sqlite3LogOpen() and released
** by sqlite3LogClose(). The sqlite3_file object that pFd points to lives
** in the same allocation, immediately after this structure.
*/
struct Log {
  LogSummary *pSummary;           /* Log file summary data */
  sqlite3_vfs *pVfs;              /* The VFS used to create pFd */
  sqlite3_file *pFd;              /* File handle for log file */
  int sync_flags;                 /* Flags to use with OsSync() */
  int isLocked;                   /* Non-zero if a snapshot is held open */
  int isWriteLocked;              /* True if this is the writer connection */
  LogSummaryHdr hdr;              /* Log summary header for current snapshot */
  LogLock lock;                   /* Lock held by this connection (if any) */
};
102
dan64d039e2010-04-13 19:27:31 +0000103
/*
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the
** frame most recently written to the log.
**
** The internals of this structure are only accessed by:
**
**   logCheckpointInit() - Create a new iterator,
**   logCheckpointNext() - Step an iterator,
**   logCheckpointFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see logCheckpoint()).
**
** aSegment[] is declared with one element but logCheckpointInit()
** allocates room for nSegment of them, plus 512 further bytes that hold
** the index and merge-sort scratch space of the final segment.
*/
struct LogCheckpoint {
  int nSegment;                   /* Size of LogCheckpoint.aSegment[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */
  struct LogSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];
};
127
dan64d039e2010-04-13 19:27:31 +0000128
129/*
130** List of all LogSummary objects created by this process. Protected by
131** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
132** here instead of borrowing the LRU mutex.
133*/
134#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
135static LogSummary *pLogSummary = 0;
136
dan7c246102010-04-12 19:00:29 +0000137/*
138** Generate an 8 byte checksum based on the data in array aByte[] and the
139** initial values of aCksum[0] and aCksum[1]. The checksum is written into
140** aCksum[] before returning.
141*/
142#define LOG_CKSM_BYTES 8
143static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000144 u64 sum1 = aCksum[0];
145 u64 sum2 = aCksum[1];
146 u32 *a32 = (u32 *)aByte;
147 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000148
149 assert( LOG_CKSM_BYTES==2*sizeof(u32) );
150 assert( (nByte&0x00000003)==0 );
151
dan39c79f52010-04-15 10:58:51 +0000152 do {
153 sum1 += (*a32++);
154 sum2 += sum1;
155 } while( a32<aEnd );
dan7c246102010-04-12 19:00:29 +0000156
dan39c79f52010-04-15 10:58:51 +0000157 aCksum[0] = sum1 + (sum1>>24);
158 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000159}
160
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing any "./" or "../"
** elements in the path. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** Repeated slashes are collapsed and trailing slashes are removed, except
** that a path consisting only of slashes normalizes to "/". A trailing
** "." or ".." component (one not followed by a slash) is left untouched.
** The output is never longer than the input.
*/
static void logNormalizePath(char *zPath){
  int i, j;
  char *z = zPath;
  int n = (int)strlen(z);

  /* Ignore trailing slashes, but always keep at least one character. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* Collapse "//". The i+1<n bound stops a trimmed trailing slash from
      ** being mistaken for a doubled one, which used to turn "//" into "". */
      if( i+1<n && z[i+1]=='/' ) continue;

      /* Skip over "/." when it is followed by another slash. */
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        i += 1;
        continue;
      }

      /* For "/../", discard the previous output component (if any). */
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
196
197/*
198** Lock the summary file pSummary->fd.
199*/
200static int logSummaryLock(LogSummary *pSummary){
201 int rc;
202 struct flock f;
203 memset(&f, 0, sizeof(f));
204 f.l_type = F_WRLCK;
205 f.l_whence = SEEK_SET;
206 f.l_start = 0;
207 f.l_len = 1;
208 rc = fcntl(pSummary->fd, F_SETLKW, &f);
209 if( rc!=0 ){
210 return SQLITE_IOERR;
211 }
212 return SQLITE_OK;
213}
214
215/*
216** Unlock the summary file pSummary->fd.
217*/
218static int logSummaryUnlock(LogSummary *pSummary){
219 int rc;
220 struct flock f;
221 memset(&f, 0, sizeof(f));
222 f.l_type = F_UNLCK;
223 f.l_whence = SEEK_SET;
224 f.l_start = 0;
225 f.l_len = 1;
226 rc = fcntl(pSummary->fd, F_SETLK, &f);
227 if( rc!=0 ){
228 return SQLITE_IOERR;
229 }
230 return SQLITE_OK;
231}
232
233/*
234** Memory map the first nByte bytes of the summary file opened with
235** pSummary->fd at pSummary->aData. If the summary file is smaller than
236** nByte bytes in size when this function is called, ftruncate() is
237** used to expand it before it is mapped.
238**
239** It is assumed that an exclusive lock is held on the summary file
240** by the caller (to protect the ftruncate()).
241*/
242static int logSummaryMap(LogSummary *pSummary, int nByte){
243 struct stat sStat;
244 int rc;
245 int fd = pSummary->fd;
246 void *pMap;
247
248 assert( pSummary->aData==0 );
249
250 /* If the file is less than nByte bytes in size, cause it to grow. */
251 rc = fstat(fd, &sStat);
252 if( rc!=0 ) return SQLITE_IOERR;
253 if( sStat.st_size<nByte ){
254 rc = ftruncate(fd, nByte);
255 if( rc!=0 ) return SQLITE_IOERR;
256 }
257
258 /* Map the file. */
259 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
260 if( pMap==MAP_FAILED ){
261 return SQLITE_IOERR;
262 }
263 pSummary->aData = (u32 *)pMap;
264 pSummary->nData = nByte;
265
266 return SQLITE_OK;
267}
268
269/*
270** Unmap the log-summary mapping and close the file-descriptor. If
271** the isTruncate argument is non-zero, truncate the log-summary file
272** region to zero bytes.
273**
274** Regardless of the value of isTruncate, close the file-descriptor
275** opened on the log-summary file.
276*/
277static int logSummaryUnmap(LogSummary *pSummary, int isTruncate){
278 int rc = SQLITE_OK;
279 if( pSummary->aData ){
280 assert( pSummary->fd>0 );
281 munmap(pSummary->aData, pSummary->nData);
282 pSummary->aData = 0;
283 if( isTruncate ){
284 rc = (ftruncate(pSummary->fd, 0) ? SQLITE_IOERR : SQLITE_OK);
285 }
286 }
287 if( pSummary->fd>0 ){
288 close(pSummary->fd);
289 pSummary->fd = -1;
290 }
291 return rc;
292}
293
294
295static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
296 u32 *aData = pSummary->aData;
297 memcpy(aData, pHdr, sizeof(LogSummaryHdr));
298 aData[LOGSUMMARY_HDR_NFIELD] = 1;
299 aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
300 logChecksumBytes(
301 (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
302 );
303}
304
305/*
306** This function encodes a single frame header and writes it to a buffer
307** supplied by the caller. A log frame-header is made up of a series of
308** 4-byte big-endian integers, as follows:
309**
310** 0: Database page size in bytes.
311** 4: Page number.
312** 8: New database size (for commit frames, otherwise zero).
313** 12: Frame checksum 1.
314** 16: Frame checksum 2.
315*/
316static void logEncodeFrame(
317 u32 *aCksum, /* IN/OUT: Checksum values */
318 u32 iPage, /* Database page number for frame */
319 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
320 int nData, /* Database page size (size of aData[]) */
321 u8 *aData, /* Pointer to page data (for checksum) */
322 u8 *aFrame /* OUT: Write encoded frame here */
323){
324 assert( LOG_FRAME_HDRSIZE==20 );
325
326 sqlite3Put4byte(&aFrame[0], nData);
327 sqlite3Put4byte(&aFrame[4], iPage);
328 sqlite3Put4byte(&aFrame[8], nTruncate);
329
330 logChecksumBytes(aFrame, 12, aCksum);
331 logChecksumBytes(aData, nData, aCksum);
332
333 sqlite3Put4byte(&aFrame[12], aCksum[0]);
334 sqlite3Put4byte(&aFrame[16], aCksum[1]);
335}
336
337/*
338** Return 1 and populate *piPage, *pnTruncate and aCksum if the
339** frame checksum looks Ok. Otherwise return 0.
340*/
341static int logDecodeFrame(
342 u32 *aCksum, /* IN/OUT: Checksum values */
343 u32 *piPage, /* OUT: Database page number for frame */
344 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
345 int nData, /* Database page size (size of aData[]) */
346 u8 *aData, /* Pointer to page data (for checksum) */
347 u8 *aFrame /* Frame data */
348){
349 logChecksumBytes(aFrame, 12, aCksum);
350 logChecksumBytes(aData, nData, aCksum);
351
352 if( aCksum[0]!=sqlite3Get4byte(&aFrame[12])
353 || aCksum[1]!=sqlite3Get4byte(&aFrame[16])
354 ){
355 /* Checksum failed. */
356 return 0;
357 }
358
359 *piPage = sqlite3Get4byte(&aFrame[4]);
360 *pnTruncate = sqlite3Get4byte(&aFrame[8]);
361 return 1;
362}
363
364static void logMergesort8(
365 Pgno *aContent, /* Pages in log */
366 u8 *aBuffer, /* Buffer of at least *pnList items to use */
367 u8 *aList, /* IN/OUT: List to sort */
368 int *pnList /* IN/OUT: Number of elements in aList[] */
369){
370 int nList = *pnList;
371 if( nList>1 ){
372 int nLeft = nList / 2; /* Elements in left list */
373 int nRight = nList - nLeft; /* Elements in right list */
374 u8 *aLeft = aList; /* Left list */
375 u8 *aRight = &aList[nLeft]; /* Right list */
376 int iLeft = 0; /* Current index in aLeft */
377 int iRight = 0; /* Current index in aright */
378 int iOut = 0; /* Current index in output buffer */
379
380 /* TODO: Change to non-recursive version. */
381 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
382 logMergesort8(aContent, aBuffer, aRight, &nRight);
383
384 while( iRight<nRight || iLeft<nLeft ){
385 u8 logpage;
386 Pgno dbpage;
387
388 if( (iLeft<nLeft)
389 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
390 ){
391 logpage = aLeft[iLeft++];
392 }else{
393 logpage = aRight[iRight++];
394 }
395 dbpage = aContent[logpage];
396
397 aBuffer[iOut++] = logpage;
398 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
399
400 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
401 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
402 }
403 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
404 *pnList = iOut;
405 }
406
407#ifdef SQLITE_DEBUG
408 {
409 int i;
410 for(i=1; i<*pnList; i++){
411 assert( aContent[aList[i]] > aContent[aList[i-1]] );
412 }
413 }
414#endif
415}
416
417
418/*
419** Return the index in the LogSummary.aData array that corresponds to
420** frame iFrame. The log-summary file consists of a header, followed by
421** alternating "map" and "index" blocks.
422*/
423static int logSummaryEntry(u32 iFrame){
424 return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
425}
426
427
428/*
429** Set an entry in the log-summary map to map log frame iFrame to db
430** page iPage. Values are always appended to the log-summary (i.e. the
431** value of iFrame is always exactly one more than the value passed to
432** the previous call), but that restriction is not enforced or asserted
433** here.
434*/
435static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
436 u32 iSlot = logSummaryEntry(iFrame);
437
438 /* Set the log-summary entry itself */
439 pSummary->aData[iSlot] = iPage;
440
441 /* If the frame number is a multiple of 256 (frames are numbered starting
442 ** at 1), build an index of the most recently added 256 frames.
443 */
444 if( (iFrame&0x000000FF)==0 ){
445 int i; /* Iterator used while initializing aIndex */
446 u32 *aFrame; /* Pointer to array of 256 frames */
447 int nIndex; /* Number of entries in index */
448 u8 *aIndex; /* 256 bytes to build index in */
449 u8 *aTmp; /* Scratch space to use while sorting */
450
451 aFrame = &pSummary->aData[iSlot-255];
452 aIndex = (u8 *)&pSummary->aData[iSlot+1];
453 aTmp = &aIndex[256];
454
455 nIndex = 256;
456 for(i=0; i<256; i++) aIndex[i] = (u8)i;
457 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
458 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
459 }
460}
461
462
463/*
464** Recover the log-summary by reading the log file. The caller must hold
465** an exclusive lock on the log-summary file.
466*/
467static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
468 int rc; /* Return Code */
469 i64 nSize; /* Size of log file */
470 LogSummaryHdr hdr; /* Recovered log-summary header */
471
472 memset(&hdr, 0, sizeof(hdr));
473
474 rc = sqlite3OsFileSize(pFd, &nSize);
475 if( rc!=SQLITE_OK ){
476 return rc;
477 }
478
479 if( nSize>LOG_FRAME_HDRSIZE ){
480 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
481 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
482 int nFrame; /* Number of bytes at aFrame */
483 u8 *aData; /* Pointer to data part of aFrame buffer */
484 int iFrame; /* Index of last frame read */
485 i64 iOffset; /* Next offset to read from log file */
486 int nPgsz; /* Page size according to the log */
487 u32 aCksum[2] = {2, 3}; /* Running checksum */
488
489 /* Read in the first frame header in the file (to determine the
490 ** database page size).
491 */
492 rc = sqlite3OsRead(pFd, aBuf, LOG_FRAME_HDRSIZE, 0);
493 if( rc!=SQLITE_OK ){
494 return rc;
495 }
496
497 /* If the database page size is not a power of two, or is greater than
498 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
499 */
500 nPgsz = sqlite3Get4byte(&aBuf[0]);
501 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
502 goto finished;
503 }
504
505 /* Malloc a buffer to read frames into. */
506 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
507 aFrame = (u8 *)sqlite3_malloc(nFrame);
508 if( !aFrame ){
509 return SQLITE_NOMEM;
510 }
511 aData = &aFrame[LOG_FRAME_HDRSIZE];
512
513 /* Read all frames from the log file. */
514 iFrame = 0;
515 iOffset = 0;
516 for(iOffset=0; (iOffset+nFrame)<nSize; iOffset+=nFrame){
517 u32 pgno; /* Database page number for frame */
518 u32 nTruncate; /* dbsize field from frame header */
519 int isValid; /* True if this frame is valid */
520
521 /* Read and decode the next log frame. */
522 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
523 if( rc!=SQLITE_OK ) break;
524 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
525 if( !isValid ) break;
526 logSummaryAppend(pSummary, ++iFrame, pgno);
527
528 /* If nTruncate is non-zero, this is a commit record. */
529 if( nTruncate ){
530 hdr.iCheck1 = aCksum[0];
531 hdr.iCheck2 = aCksum[1];
532 hdr.iLastPg = iFrame;
533 hdr.nPage = nTruncate;
534 hdr.pgsz = nPgsz;
535 }
536 }
537
538 sqlite3_free(aFrame);
539 }else{
540 hdr.iCheck1 = 2;
541 hdr.iCheck2 = 3;
542 }
543
544finished:
545 logSummaryWriteHdr(pSummary, &hdr);
546 return rc;
547}
548
549
550/*
551** This function intializes the connection to the log-summary identified
552** by struct pSummary.
553*/
554static int logSummaryInit(LogSummary *pSummary, sqlite3_file *pFd){
555 int rc; /* Return Code */
556 char *zFile; /* File name for summary file */
557
558 assert( pSummary->fd<0 );
559 assert( pSummary->aData==0 );
560 assert( pSummary->nRef>0 );
561 assert( pSummary->zPath );
562
563 /* Open a file descriptor on the summary file. */
564 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
565 if( !zFile ){
566 return SQLITE_NOMEM;
567 }
568 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
569 sqlite3_free(zFile);
570 if( pSummary->fd<0 ){
571 return SQLITE_IOERR;
572 }
573
574 /* Grab an exclusive lock the summary file. Then mmap() it. TODO: This
575 ** code needs to be enhanced to support a growable mapping. For now, just
576 ** make the mapping very large to start with.
577 */
578 rc = logSummaryLock(pSummary);
579 if( rc!=SQLITE_OK ) return rc;
580 rc = logSummaryMap(pSummary, 512*1024);
581 if( rc!=SQLITE_OK ) goto out;
582
583 /* Grab a SHARED lock on the log file. Then try to upgrade to an EXCLUSIVE
584 ** lock. If successful, then this is the first (and only) connection to
585 ** the database. In this case assume the contents of the log-summary
586 ** cannot be trusted. Zero the log-summary header to make sure.
587 **
588 ** The SHARED lock on the log file is not released until the connection
589 ** to the database is closed.
590 */
591 rc = sqlite3OsLock(pFd, SQLITE_LOCK_SHARED);
592 if( rc!=SQLITE_OK ) goto out;
593 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
594 if( rc==SQLITE_OK ){
595 /* This is the first and only connection. */
596 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
597 rc = sqlite3OsUnlock(pFd, SQLITE_LOCK_SHARED);
598 }else if( rc==SQLITE_BUSY ){
599 rc = SQLITE_OK;
600 }
601
602 out:
603 logSummaryUnlock(pSummary);
604 return rc;
605}
606
607/*
608** Open a connection to the log file associated with database zDb. The
609** database file does not actually have to exist. zDb is used only to
610** figure out the name of the log file to open. If the log file does not
611** exist it is created by this call.
612*/
613int sqlite3LogOpen(
614 sqlite3_vfs *pVfs, /* vfs module to open log file with */
615 const char *zDb, /* Name of database file */
616 Log **ppLog /* OUT: Allocated Log handle */
617){
danb9bf16b2010-04-14 11:23:30 +0000618 int rc = SQLITE_OK; /* Return Code */
dan7c246102010-04-12 19:00:29 +0000619 Log *pRet; /* Object to allocate and return */
620 LogSummary *pSummary = 0; /* Summary object */
621 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
622 int flags; /* Flags passed to OsOpen() */
623 char *zWal = 0; /* Path to WAL file */
624 int nWal; /* Length of zWal in bytes */
625
626 /* Zero output variables */
627 assert( zDb );
628 *ppLog = 0;
629
630 /* Allocate an instance of struct Log to return. */
631 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
632 if( !pRet ) goto out;
633 pRet->pVfs = pVfs;
634 pRet->pFd = (sqlite3_file *)&pRet[1];
635 pRet->sync_flags = SQLITE_SYNC_NORMAL;
636
637 /* Normalize the path name. */
638 zWal = sqlite3_mprintf("%s-wal", zDb);
639 if( !zWal ) goto out;
640 logNormalizePath(zWal);
641 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_DB);
642 nWal = sqlite3Strlen30(zWal);
643
644 /* Enter the mutex that protects the linked-list of LogSummary structures */
645 if( sqlite3GlobalConfig.bCoreMutex ){
646 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
647 }
648 sqlite3_mutex_enter(mutex);
649
650 /* Search for an existing log summary object in the linked list. If one
651 ** cannot be found, allocate and initialize a new object.
652 */
653 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
654 int nPath = sqlite3Strlen30(pSummary->zPath);
655 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
656 }
657 if( !pSummary ){
658 int nByte = sizeof(LogSummary) + nWal + 1;
659 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
660 if( !pSummary ){
661 rc = SQLITE_NOMEM;
662 goto out;
663 }
664 if( sqlite3GlobalConfig.bCoreMutex ){
665 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
666 }
667 pSummary->zPath = (char *)&pSummary[1];
668 pSummary->fd = -1;
669 memcpy(pSummary->zPath, zWal, nWal);
670 pSummary->pNext = pLogSummary;
671 pLogSummary = pSummary;
672 }
673 pSummary->nRef++;
674 pRet->pSummary = pSummary;
675
676 /* Exit the mutex protecting the linked-list of LogSummary objects. */
677 sqlite3_mutex_leave(mutex);
678 mutex = 0;
679
680 /* Open file handle on the log file. */
681 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
682 if( rc!=SQLITE_OK ) goto out;
683
684 /* Object pSummary is shared between all connections to the database made
685 ** by this process. So at this point it may or may not be connected to
686 ** the log-summary. If it is not, connect it. Otherwise, just take the
687 ** SHARED lock on the log file.
688 */
689 sqlite3_mutex_enter(pSummary->mutex);
690 mutex = pSummary->mutex;
691 if( pSummary->fd<0 ){
692 rc = logSummaryInit(pSummary, pRet->pFd);
693 }else{
694 rc = sqlite3OsLock(pRet->pFd, SQLITE_LOCK_SHARED);
695 }
696
dan64d039e2010-04-13 19:27:31 +0000697 pRet->lock.pNext = pSummary->pLock;
698 pSummary->pLock = &pRet->lock;
699
dan7c246102010-04-12 19:00:29 +0000700 out:
701 sqlite3_mutex_leave(mutex);
702 sqlite3_free(zWal);
703 if( rc!=SQLITE_OK ){
704 assert(0);
705 if( pRet ){
706 sqlite3OsClose(pRet->pFd);
707 sqlite3_free(pRet);
708 }
709 assert( !pSummary || pSummary->nRef==0 );
710 sqlite3_free(pSummary);
711 }
712 *ppLog = pRet;
713 return rc;
714}
715
716static int logCheckpointNext(
717 LogCheckpoint *p, /* Iterator */
718 u32 *piPage, /* OUT: Next db page to write */
719 u32 *piFrame /* OUT: Log frame to read from */
720){
721 u32 iMin = *piPage;
722 u32 iRet = 0xFFFFFFFF;
723 int i;
724 int nBlock = p->nFinal;
725
726 for(i=p->nSegment-1; i>=0; i--){
727 struct LogSegment *pSegment = &p->aSegment[i];
728 while( pSegment->iNext<nBlock ){
729 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
730 if( iPg>iMin ){
731 if( iPg<iRet ){
732 iRet = iPg;
733 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
734 }
735 break;
736 }
737 pSegment->iNext++;
738 }
739
740 nBlock = 256;
741 }
742
743 *piPage = iRet;
744 return (iRet==0xFFFFFFFF);
745}
746
747static LogCheckpoint *logCheckpointInit(Log *pLog){
748 u32 *aData = pLog->pSummary->aData;
749 LogCheckpoint *p; /* Return value */
750 int nSegment; /* Number of segments to merge */
751 u32 iLast; /* Last frame in log */
752 int nByte; /* Number of bytes to allocate */
753 int i; /* Iterator variable */
754 int nFinal; /* Number of unindexed entries */
755 struct LogSegment *pFinal; /* Final (unindexed) segment */
756 u8 *aTmp; /* Temp space used by merge-sort */
757
758 iLast = pLog->hdr.iLastPg;
759 nSegment = (iLast >> 8) + 1;
760 nFinal = (iLast & 0x000000FF);
761
762 nByte = sizeof(LogCheckpoint) + (nSegment-1)*sizeof(struct LogSegment) + 512;
763 p = (LogCheckpoint *)sqlite3_malloc(nByte);
764 if( p ){
765 memset(p, 0, nByte);
766 p->nSegment = nSegment;
767 p->nFinal = nFinal;
768 }
769
770 for(i=0; i<nSegment-1; i++){
771 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
772 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
773 }
774 pFinal = &p->aSegment[nSegment-1];
775
776 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
777 pFinal->aIndex = (u8 *)&pFinal[1];
778 aTmp = &pFinal->aIndex[256];
779 for(i=0; i<nFinal; i++){
780 pFinal->aIndex[i] = i;
781 }
782 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
783 p->nFinal = nFinal;
784
785 return p;
786}
787
/*
** Free a log iterator allocated by logCheckpointInit(). The iterator is
** a single allocation, so one call to sqlite3_free() releases it.
*/
static void logCheckpointFree(LogCheckpoint *p){
  sqlite3_free(p);
}
794
795/*
796** Checkpoint the contents of the log file.
797*/
798static int logCheckpoint(
799 Log *pLog, /* Log connection */
800 sqlite3_file *pFd, /* File descriptor open on db file */
801 u8 *zBuf /* Temporary buffer to use */
802){
803 int rc; /* Return code */
804 int pgsz = pLog->hdr.pgsz; /* Database page-size */
805 LogCheckpoint *pIter = 0; /* Log iterator context */
806 u32 iDbpage = 0; /* Next database page to write */
danb9bf16b2010-04-14 11:23:30 +0000807 u32 iFrame = 0; /* Log frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +0000808
809 /* Allocate the iterator */
810 pIter = logCheckpointInit(pLog);
811 if( !pIter ) return SQLITE_NOMEM;
812
813 /* Sync the log file to disk */
814 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
815 if( rc!=SQLITE_OK ) goto out;
816
817 /* Iterate through the contents of the log, copying data to the db file. */
818 while( 0==logCheckpointNext(pIter, &iDbpage, &iFrame) ){
819 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
820 (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
821 );
822 if( rc!=SQLITE_OK ) goto out;
823 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
824 if( rc!=SQLITE_OK ) goto out;
825 }
826
827 /* Truncate the database file */
828 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
829 if( rc!=SQLITE_OK ) goto out;
830
831 /* Sync the database file. If successful, update the log-summary. */
832 rc = sqlite3OsSync(pFd, pLog->sync_flags);
833 if( rc!=SQLITE_OK ) goto out;
834 pLog->hdr.iLastPg = 0;
835 pLog->hdr.iCheck1 = 2;
836 pLog->hdr.iCheck2 = 3;
837 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
838
839 /* TODO: If a crash occurs and the current log is copied into the
840 ** database there is no problem. However, if a crash occurs while
841 ** writing the next transaction into the start of the log, such that:
842 **
843 ** * The first transaction currently in the log is left intact, but
844 ** * The second (or subsequent) transaction is damaged,
845 **
846 ** then the database could become corrupt.
847 **
848 ** The easiest thing to do would be to write and sync a dummy header
849 ** into the log at this point. Unfortunately, that turns out to be
850 ** an unwelcome performance hit. Alternatives are...
851 */
852#if 0
853 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
854 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
855 if( rc!=SQLITE_OK ) goto out;
856 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
857#endif
858
859 out:
860 logCheckpointFree(pIter);
861 return rc;
862}
863
864/*
865** Close a connection to a log file.
866*/
867int sqlite3LogClose(
868 Log *pLog, /* Log to close */
869 sqlite3_file *pFd, /* Database file */
870 u8 *zBuf /* Buffer of at least page-size bytes */
871){
872 int rc = SQLITE_OK;
873 if( pLog ){
dan64d039e2010-04-13 19:27:31 +0000874 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +0000875 LogSummary *pSummary = pLog->pSummary;
876 sqlite3_mutex *mutex = 0;
877
dan64d039e2010-04-13 19:27:31 +0000878 sqlite3_mutex_enter(pSummary->mutex);
879 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
880 *ppL = pLog->lock.pNext;
881 sqlite3_mutex_leave(pSummary->mutex);
882
dan7c246102010-04-12 19:00:29 +0000883 if( sqlite3GlobalConfig.bCoreMutex ){
884 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
885 }
886 sqlite3_mutex_enter(mutex);
887
888 /* Decrement the reference count on the log summary. If this is the last
889 ** reference to the log summary object in this process, the object will
890 ** be freed. If this is also the last connection to the database, then
891 ** checkpoint the database and truncate the log and log-summary files
892 ** to zero bytes in size.
893 **/
894 pSummary->nRef--;
895 if( pSummary->nRef==0 ){
896 LogSummary **pp;
897
898 rc = logSummaryLock(pSummary);
899 if( rc==SQLITE_OK ){
900 int isTruncate = 0;
901 int rc2 = sqlite3OsLock(pLog->pFd, SQLITE_LOCK_EXCLUSIVE);
902 if( rc2==SQLITE_OK ){
903 /* This is the last connection to the database (including other
904 ** processes). Do three things:
905 **
906 ** 1. Checkpoint the db.
907 ** 2. Truncate the log file to zero bytes.
908 ** 3. Truncate the log-summary file to zero bytes.
909 */
910 rc2 = logCheckpoint(pLog, pFd, zBuf);
911 if( rc2==SQLITE_OK ){
912 rc2 = sqlite3OsTruncate(pLog->pFd, 0);
913 }
914 isTruncate = 1;
915 }else if( rc2==SQLITE_BUSY ){
916 rc2 = SQLITE_OK;
917 }
918 logSummaryUnmap(pSummary, isTruncate);
919 sqlite3OsUnlock(pLog->pFd, SQLITE_LOCK_NONE);
920 rc = logSummaryUnlock(pSummary);
921 if( rc2!=SQLITE_OK ) rc = rc2;
922 }
923
924 /* Remove the LogSummary object from the global list. Then free the
925 ** mutex and the object itself.
926 */
927 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
928 *pp = (*pp)->pNext;
929 sqlite3_mutex_free(pSummary->mutex);
930 sqlite3_free(pSummary);
931 }
932
933 sqlite3_mutex_leave(mutex);
934
935 /* Close the connection to the log file and free the Log handle. */
936 sqlite3OsClose(pLog->pFd);
937 sqlite3_free(pLog);
938 }
939 return rc;
940}
941
942/*
943** Set the flags to pass to the sqlite3OsSync() function when syncing
944** the log file.
945*/
946#if 0
947void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
948 assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL );
949 pLog->sync_flags = sync_flags;
950}
951#endif
952
953/*
954** Enter and leave the log-summary mutex. In this context, entering the
955** log-summary mutex means:
956**
957** 1. Obtaining mutex pLog->pSummary->mutex, and
958** 2. Taking an exclusive lock on the log-summary file.
959**
960** i.e. this mutex locks out other processes as well as other threads
961** hosted in this address space.
962*/
963static int logEnterMutex(Log *pLog){
964 LogSummary *pSummary = pLog->pSummary;
965 int rc;
966
967 sqlite3_mutex_enter(pSummary->mutex);
968 rc = logSummaryLock(pSummary);
969 if( rc!=SQLITE_OK ){
970 sqlite3_mutex_leave(pSummary->mutex);
971 }
972 return rc;
973}
974static void logLeaveMutex(Log *pLog){
975 LogSummary *pSummary = pLog->pSummary;
976 logSummaryUnlock(pSummary);
977 sqlite3_mutex_leave(pSummary->mutex);
978}
979
980/*
dan64d039e2010-04-13 19:27:31 +0000981** Values for the second parameter to logLockRegion().
982*/
983#define LOG_UNLOCK 0
984#define LOG_RDLOCK 1
985#define LOG_WRLOCK 2
986
987static int logLockRegion(Log *pLog, u32 mRegion, int op){
988 LogSummary *pSummary = pLog->pSummary;
989 LogLock *p; /* Used to iterate through in-process locks */
dan02bb5962010-04-14 15:49:40 +0000990 u32 mOther; /* Locks held by other connections */
991 u32 mNew; /* New mask for pLog */
dan64d039e2010-04-13 19:27:31 +0000992
993 assert(
994 /* Writer lock operations */
995 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
996 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
997
dan02bb5962010-04-14 15:49:40 +0000998 /* Normal reader lock operations */
dan64d039e2010-04-13 19:27:31 +0000999 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
dan64d039e2010-04-13 19:27:31 +00001000 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
1001 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
dan02bb5962010-04-14 15:49:40 +00001002
1003 /* Region D reader lock operations */
1004 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
dan49320f82010-04-14 18:50:08 +00001005 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
dan64d039e2010-04-13 19:27:31 +00001006 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
1007
1008 /* Checkpointer lock operations */
1009 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
1010 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
dan64d039e2010-04-13 19:27:31 +00001011 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
dan02bb5962010-04-14 15:49:40 +00001012 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
dan64d039e2010-04-13 19:27:31 +00001013 );
1014
dan02bb5962010-04-14 15:49:40 +00001015 /* Assert that a connection never tries to go from an EXCLUSIVE to a
1016 ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
1017 ** happens though (when a region D reader upgrades to a writer).
1018 */
1019 assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
1020
dan64d039e2010-04-13 19:27:31 +00001021 sqlite3_mutex_enter(pSummary->mutex);
1022
dan02bb5962010-04-14 15:49:40 +00001023 /* Calculate a mask of logs held by all connections in this process apart
1024 ** from this one. The least significant byte of the mask contains a mask
1025 ** of the SHARED logs held. The next least significant byte of the mask
1026 ** indicates the EXCLUSIVE locks held. For example, to test if some other
1027 ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
1028 ** on region C, do:
1029 **
1030 ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
1031 ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
1032 **
1033 ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
1034 ** corresponding bit in the SHARED mask.
dan64d039e2010-04-13 19:27:31 +00001035 */
dan02bb5962010-04-14 15:49:40 +00001036 mOther = 0;
1037 for(p=pSummary->pLock; p; p=p->pNext){
1038 assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
1039 if( p!=&pLog->lock ){
1040 mOther |= p->mLock;
dan64d039e2010-04-13 19:27:31 +00001041 }
1042 }
1043
dan02bb5962010-04-14 15:49:40 +00001044 /* If this call is to lock a region (not to unlock one), test if locks held
1045 ** by any other connection in this process prevent the new locks from
1046 ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
1047 */
1048 if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
1049 sqlite3_mutex_leave(pSummary->mutex);
1050 return SQLITE_BUSY;
1051 }
1052
1053 /* Figure out the new log mask for this connection. */
dan64d039e2010-04-13 19:27:31 +00001054 switch( op ){
1055 case LOG_UNLOCK:
dan02bb5962010-04-14 15:49:40 +00001056 mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
dan64d039e2010-04-13 19:27:31 +00001057 break;
1058 case LOG_RDLOCK:
dan02bb5962010-04-14 15:49:40 +00001059 mNew = (pLog->lock.mLock | mRegion);
dan64d039e2010-04-13 19:27:31 +00001060 break;
1061 default:
1062 assert( op==LOG_WRLOCK );
dan02bb5962010-04-14 15:49:40 +00001063 mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
dan64d039e2010-04-13 19:27:31 +00001064 break;
1065 }
1066
dan02bb5962010-04-14 15:49:40 +00001067 /* Now modify the locks held on the log-summary file descriptor. This
1068 ** file descriptor is shared by all log connections in this process.
1069 ** Therefore:
1070 **
1071 ** + If one or more log connections in this process hold a SHARED lock
1072 ** on a region, the file-descriptor should hold a SHARED lock on
1073 ** the file region.
1074 **
1075 ** + If a log connection in this process holds an EXCLUSIVE lock on a
1076 ** region, the file-descriptor should also hold an EXCLUSIVE lock on
1077 ** the region in question.
1078 **
1079 ** If this is an LOG_UNLOCK operation, only regions for which no other
1080 ** connection holds a lock should actually be unlocked. And if this
1081 ** is a LOG_RDLOCK operation and other connections already hold all
1082 ** the required SHARED locks, then no system call is required.
1083 */
1084 if( op==LOG_UNLOCK ){
1085 mRegion = (mRegion & ~mOther);
dan64d039e2010-04-13 19:27:31 +00001086 }
dan02bb5962010-04-14 15:49:40 +00001087 if( (op==LOG_WRLOCK)
1088 || (op==LOG_UNLOCK && mRegion)
1089 || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
1090 ){
1091 struct LockMap {
1092 int iStart; /* Byte offset to start locking operation */
1093 int iLen; /* Length field for locking operation */
1094 } aMap[] = {
1095 /* 0000 */ {0, 0}, /* 0001 */ {4, 1},
1096 /* 0010 */ {3, 1}, /* 0011 */ {3, 2},
1097 /* 0100 */ {2, 1}, /* 0101 */ {0, 0},
1098 /* 0110 */ {2, 2}, /* 0111 */ {2, 3},
1099 /* 1000 */ {1, 1}, /* 1001 */ {0, 0},
1100 /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
1101 /* 1100 */ {1, 2}, /* 1101 */ {0, 0},
dane264d982010-04-14 18:06:50 +00001102 /* 1110 */ {0, 0}, /* 1111 */ {0, 0}
dan02bb5962010-04-14 15:49:40 +00001103 };
1104 int rc; /* Return code of fcntl() */
1105 struct flock f; /* Locking operation */
dan64d039e2010-04-13 19:27:31 +00001106
dan02bb5962010-04-14 15:49:40 +00001107 assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
1108
dan64d039e2010-04-13 19:27:31 +00001109 memset(&f, 0, sizeof(f));
1110 f.l_type = (op==LOG_WRLOCK?F_WRLCK:(op==LOG_RDLOCK?F_RDLCK:F_UNLCK));
1111 f.l_whence = SEEK_SET;
dan02bb5962010-04-14 15:49:40 +00001112 f.l_start = 32 + aMap[mRegion].iStart;
1113 f.l_len = aMap[mRegion].iLen;
dan64d039e2010-04-13 19:27:31 +00001114
1115 rc = fcntl(pSummary->fd, F_SETLK, &f);
1116 if( rc!=0 ){
1117 sqlite3_mutex_leave(pSummary->mutex);
1118 return SQLITE_BUSY;
1119 }
1120 }
1121
dan02bb5962010-04-14 15:49:40 +00001122 pLog->lock.mLock = mNew;
dan64d039e2010-04-13 19:27:31 +00001123 sqlite3_mutex_leave(pSummary->mutex);
1124 return SQLITE_OK;
1125}
1126
1127/*
danb9bf16b2010-04-14 11:23:30 +00001128** Try to read the log-summary header. Attempt to verify the header
1129** checksum. If the checksum can be verified, copy the log-summary
1130** header into structure pLog->hdr. If the contents of pLog->hdr are
1131** modified by this and pChanged is not NULL, set *pChanged to 1.
1132** Otherwise leave *pChanged unmodified.
1133**
1134** If the checksum cannot be verified return SQLITE_ERROR.
1135*/
1136int logSummaryTryHdr(Log *pLog, int *pChanged){
1137 u32 aCksum[2] = {1, 1};
1138 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1139
1140 /* First try to read the header without a lock. Verify the checksum
1141 ** before returning. This will almost always work.
1142 */
1143 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1144 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1145 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1146 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1147 ){
1148 return SQLITE_ERROR;
1149 }
1150
1151 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1152 if( pChanged ){
1153 *pChanged = 1;
1154 }
1155 memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
1156 }
1157 return SQLITE_OK;
1158}
1159
1160/*
1161** Read the log-summary header from the log-summary file into structure
1162** pLog->hdr. If attempting to verify the header checksum fails, try
1163** to recover the log before returning.
1164**
1165** If the log-summary header is successfully read, return SQLITE_OK.
1166** Otherwise an SQLite error code.
1167*/
1168int logSummaryReadHdr(Log *pLog, int *pChanged){
1169 int rc;
1170
1171 /* First try to read the header without a lock. Verify the checksum
1172 ** before returning. This will almost always work.
1173 */
1174 if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
1175 return SQLITE_OK;
1176 }
1177
1178 /* If the first attempt to read the header failed, lock the log-summary
1179 ** file and try again. If the header checksum verification fails this
1180 ** time as well, run log recovery.
1181 */
1182 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1183 if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
1184 if( pChanged ){
1185 *pChanged = 1;
1186 }
1187 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1188 if( rc==SQLITE_OK ){
1189 rc = logSummaryTryHdr(pLog, 0);
1190 }
1191 }
1192 logLeaveMutex(pLog);
1193 }
1194
1195 return rc;
1196}
1197
1198/*
dan64d039e2010-04-13 19:27:31 +00001199** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001200**
1201** If this call obtains a new read-lock and the database contents have been
1202** modified since the most recent call to LogCloseSnapshot() on this Log
1203** connection, then *pChanged is set to 1 before returning. Otherwise, it
1204** is left unmodified. This is used by the pager layer to determine whether
1205** or not any cached pages may be safely reused.
1206*/
1207int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
1208 int rc = SQLITE_OK;
1209 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001210 int nAttempt;
1211
1212 /* Obtain a snapshot-lock on the log-summary file. The procedure
1213 ** for obtaining the snapshot log is:
1214 **
1215 ** 1. Attempt a SHARED lock on regions A and B.
1216 ** 2a. If step 1 is successful, drop the lock on region B.
1217 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1218 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1219 ** successful.
1220 **
1221 ** If neither of the locks can be obtained after 5 tries, presumably
1222 ** something is wrong (i.e. a process not following the locking protocol).
1223 ** Return an error code in this case.
1224 */
1225 rc = SQLITE_BUSY;
1226 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1227 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1228 if( rc==SQLITE_BUSY ){
1229 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1230 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1231 }else{
1232 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1233 pLog->isLocked = LOG_REGION_A;
1234 }
1235 }
1236 if( rc!=SQLITE_OK ){
1237 return rc;
1238 }
1239
danb9bf16b2010-04-14 11:23:30 +00001240 rc = logSummaryReadHdr(pLog, pChanged);
dan64d039e2010-04-13 19:27:31 +00001241 if( rc!=SQLITE_OK ){
1242 /* An error occured while attempting log recovery. */
1243 sqlite3LogCloseSnapshot(pLog);
1244 }
dan7c246102010-04-12 19:00:29 +00001245 }
1246 return rc;
1247}
1248
1249/*
1250** Unlock the current snapshot.
1251*/
1252void sqlite3LogCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001253 if( pLog->isLocked ){
1254 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1255 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1256 }
dan7c246102010-04-12 19:00:29 +00001257 pLog->isLocked = 0;
1258}
1259
dan7c246102010-04-12 19:00:29 +00001260/*
1261** Read a page from the log, if it is present.
1262*/
1263int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
1264 u32 iRead = 0;
1265 u32 *aData = pLog->pSummary->aData;
1266 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1267
dan39c79f52010-04-15 10:58:51 +00001268 assert( pLog->isLocked );
1269
dan7c246102010-04-12 19:00:29 +00001270 /* Do a linear search of the unindexed block of page-numbers (if any)
1271 ** at the end of the log-summary. An alternative to this would be to
1272 ** build an index in private memory each time a read transaction is
1273 ** opened on a new snapshot.
1274 */
1275 if( pLog->hdr.iLastPg ){
1276 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1277 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1278 while( *pi!=pgno && pi!=piStop ) pi--;
1279 if( pi!=piStop ){
1280 iRead = (pi-piStop) + iFrame;
1281 }
1282 }
1283 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1284
1285 while( iRead==0 && iFrame>0 ){
1286 int iLow = 0;
1287 int iHigh = 255;
1288 u32 *aFrame;
1289 u8 *aIndex;
1290
1291 iFrame -= 256;
1292 aFrame = &aData[logSummaryEntry(iFrame+1)];
1293 aIndex = (u8 *)&aFrame[256];
1294
1295 while( iLow<=iHigh ){
1296 int iTest = (iLow+iHigh)>>1;
1297 u32 iPg = aFrame[aIndex[iTest]];
1298
1299 if( iPg==pgno ){
1300 iRead = iFrame + 1 + aIndex[iTest];
1301 break;
1302 }
1303 else if( iPg<pgno ){
1304 iLow = iTest+1;
1305 }else{
1306 iHigh = iTest-1;
1307 }
1308 }
1309 }
1310 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1311
1312 /* If iRead is non-zero, then it is the log frame number that contains the
1313 ** required page. Read and return data from the log file.
1314 */
1315 if( iRead ){
1316 i64 iOffset = (iRead-1) * (pLog->hdr.pgsz+LOG_FRAME_HDRSIZE);
1317 iOffset += LOG_FRAME_HDRSIZE;
1318 *pInLog = 1;
1319 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1320 }
1321
1322 *pInLog = 0;
1323 return SQLITE_OK;
1324}
1325
1326
1327/*
1328** Set *pPgno to the size of the database file (or zero, if unknown).
1329*/
1330void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
1331 assert( pLog->isLocked );
1332 *pPgno = pLog->hdr.nPage;
1333}
1334
1335/*
dan7c246102010-04-12 19:00:29 +00001336** This function returns SQLITE_OK if the caller may write to the database.
1337** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001338** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001339*/
1340int sqlite3LogWriteLock(Log *pLog, int op){
1341 assert( pLog->isLocked );
1342 if( op ){
dan64d039e2010-04-13 19:27:31 +00001343
1344 /* Obtain the writer lock */
1345 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1346 if( rc!=SQLITE_OK ){
1347 return rc;
1348 }
1349
dan39c79f52010-04-15 10:58:51 +00001350 /* If this is connection is a region D reader, then the SHARED lock on
1351 ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
1352 ** held on region A. This means that if the write-transaction is committed
dan49320f82010-04-14 18:50:08 +00001353 ** and this connection downgrades to a reader, it will be left with no
dan39c79f52010-04-15 10:58:51 +00001354 ** lock at all. And so its snapshot could get clobbered by a checkpoint
dan49320f82010-04-14 18:50:08 +00001355 ** operation.
1356 **
1357 ** To stop this from happening, grab a SHARED lock on region A now.
1358 ** This should always be successful, as the only time a client holds
1359 ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
1360 ** lock on region C (a checkpointer does this). This is not possible,
1361 ** as this connection currently has the EXCLUSIVE lock on region C.
dan02bb5962010-04-14 15:49:40 +00001362 */
dan49320f82010-04-14 18:50:08 +00001363 if( pLog->isLocked==LOG_REGION_D ){
1364 logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
1365 pLog->isLocked = LOG_REGION_A;
1366 }
dan02bb5962010-04-14 15:49:40 +00001367
dan39c79f52010-04-15 10:58:51 +00001368 /* If this connection is not reading the most recent database snapshot,
1369 ** it is not possible to write to the database. In this case release
1370 ** the write locks and return SQLITE_BUSY.
1371 */
dan7c246102010-04-12 19:00:29 +00001372 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
dan49320f82010-04-14 18:50:08 +00001373 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001374 return SQLITE_BUSY;
1375 }
1376 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001377
dan7c246102010-04-12 19:00:29 +00001378 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001379 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001380 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1381 pLog->isWriteLocked = 0;
1382 }
1383 return SQLITE_OK;
1384}
1385
1386/*
1387** Write a set of frames to the log. The caller must hold at least a
1388** RESERVED lock on the database file.
1389*/
1390int sqlite3LogFrames(
1391 Log *pLog, /* Log handle to write to */
1392 int nPgsz, /* Database page-size in bytes */
1393 PgHdr *pList, /* List of dirty pages to write */
1394 Pgno nTruncate, /* Database size after this commit */
1395 int isCommit, /* True if this is a commit */
1396 int isSync /* True to sync the log file */
1397){
1398 /* Each frame has a 20 byte header, as follows:
1399 **
1400 ** + Pseudo-random salt (4 bytes)
1401 ** + Page number (4 bytes)
1402 ** + New database size, or 0 if not a commit frame (4 bytes)
1403 ** + Checksum (CHECKSUM_BYTES bytes);
1404 **
1405 ** The checksum is computed based on the following:
1406 **
1407 ** + The previous checksum, or {2, 3} for the first frame in the log.
1408 ** + The non-checksum fields of the frame header, and
1409 ** + The frame contents (page data).
1410 **
1411 ** This format must also be understood by the code in logSummaryRecover().
1412 ** The size of the frame header is used by LogRead() and LogCheckpoint().
1413 */
1414 int rc; /* Used to catch return codes */
1415 u32 iFrame; /* Next frame address */
1416 u8 aFrame[LOG_FRAME_HDRSIZE];
1417 PgHdr *p; /* Iterator to run through pList with. */
1418 u32 aCksum[2];
1419
1420 PgHdr *pLast; /* Last frame in list */
1421 int nLast = 0; /* Number of extra copies of last page */
1422
1423 assert( LOG_FRAME_HDRSIZE==(4 * 3 + LOG_CKSM_BYTES) );
1424 assert( pList );
1425
1426 aCksum[0] = pLog->hdr.iCheck1;
1427 aCksum[1] = pLog->hdr.iCheck2;
1428
1429 /* Write the log file. */
1430 iFrame = pLog->hdr.iLastPg;
1431 for(p=pList; p; p=p->pDirty){
1432 u32 nDbsize; /* Db-size field for frame header */
1433 i64 iOffset; /* Write offset in log file */
1434
1435 iFrame++;
1436 iOffset = (iFrame-1) * (nPgsz+sizeof(aFrame));
1437
1438 /* Populate and write the frame header */
1439 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1440 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1441 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1442 if( rc!=SQLITE_OK ){
1443 return rc;
1444 }
1445
1446 /* Write the page data */
1447 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1448 if( rc!=SQLITE_OK ){
1449 return rc;
1450 }
1451 pLast = p;
1452 }
1453
1454 /* Sync the log file if the 'isSync' flag was specified. */
1455 if( isSync ){
1456#if 0
1457 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
1458 i64 iOffset = iFrame * (nPgsz+sizeof(aFrame));
1459
1460 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1461 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1462 }
1463 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1464 while( iOffset<iSegment ){
1465 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1466 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1467 if( rc!=SQLITE_OK ){
1468 return rc;
1469 }
1470
1471 iOffset += LOG_FRAME_HDRSIZE;
1472 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1473 if( rc!=SQLITE_OK ){
1474 return rc;
1475 }
1476 nLast++;
1477 iOffset += nPgsz;
1478 }
1479#endif
1480
1481 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1482 if( rc!=SQLITE_OK ){
1483 return rc;
1484 }
1485 }
1486
1487 /* Append data to the log summary. It is not necessary to lock the
1488 ** log-summary to do this as the RESERVED lock held on the db file
1489 ** guarantees that there are no other writers, and no data that may
1490 ** be in use by existing readers is being overwritten.
1491 */
1492 iFrame = pLog->hdr.iLastPg;
1493 for(p=pList; p; p=p->pDirty){
1494 iFrame++;
1495 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1496 }
1497 while( nLast>0 ){
1498 iFrame++;
1499 nLast--;
1500 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1501 }
1502
1503 /* Update the private copy of the header. */
1504 pLog->hdr.pgsz = nPgsz;
1505 pLog->hdr.iLastPg = iFrame;
1506 if( isCommit ){
1507 pLog->hdr.iChange++;
1508 pLog->hdr.nPage = nTruncate;
1509 }
1510 pLog->hdr.iCheck1 = aCksum[0];
1511 pLog->hdr.iCheck2 = aCksum[1];
1512
1513 /* If this is a commit, update the log-summary header too. */
1514 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1515 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1516 logLeaveMutex(pLog);
1517 }
1518
1519 return SQLITE_OK;
1520}
1521
1522/*
danb9bf16b2010-04-14 11:23:30 +00001523** Checkpoint the database:
1524**
1525** 1. Wait for an EXCLUSIVE lock on regions B and C.
1526** 2. Wait for an EXCLUSIVE lock on region A.
1527** 3. Copy the contents of the log into the database file.
1528** 4. Zero the log-summary header (so new readers will ignore the log).
1529** 5. Drop the locks obtained in steps 1 and 2.
dan7c246102010-04-12 19:00:29 +00001530*/
1531int sqlite3LogCheckpoint(
1532 Log *pLog, /* Log connection */
1533 sqlite3_file *pFd, /* File descriptor open on db file */
dan64d039e2010-04-13 19:27:31 +00001534 u8 *zBuf, /* Temporary buffer to use */
1535 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1536 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001537){
danb9bf16b2010-04-14 11:23:30 +00001538 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001539
dan39c79f52010-04-15 10:58:51 +00001540 assert( !pLog->isLocked );
1541
1542 /* Wait for an EXCLUSIVE lock on regions B and C. */
dan64d039e2010-04-13 19:27:31 +00001543 do {
1544 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1545 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1546 if( rc!=SQLITE_OK ) return rc;
1547
dan39c79f52010-04-15 10:58:51 +00001548 /* Wait for an EXCLUSIVE lock on region A. */
dan64d039e2010-04-13 19:27:31 +00001549 do {
1550 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1551 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001552 if( rc!=SQLITE_OK ){
1553 logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1554 return rc;
1555 }
dan64d039e2010-04-13 19:27:31 +00001556
danb9bf16b2010-04-14 11:23:30 +00001557 /* Copy data from the log to the database file. */
1558 rc = logSummaryReadHdr(pLog, 0);
1559 if( rc==SQLITE_OK ){
1560 rc = logCheckpoint(pLog, pFd, zBuf);
1561 }
1562
1563 /* Release the locks. */
dan64d039e2010-04-13 19:27:31 +00001564 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1565 return rc;
dan7c246102010-04-12 19:00:29 +00001566}
1567