blob: 9b25216df40894720ef33db0fc13e451a9d14f1f [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
7#include "log.h"
8
9#include <unistd.h>
10#include <fcntl.h>
11#include <sys/mman.h>
12
13typedef struct LogSummaryHdr LogSummaryHdr;
14typedef struct LogSummary LogSummary;
15typedef struct LogCheckpoint LogCheckpoint;
dan64d039e2010-04-13 19:27:31 +000016typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000017
18
19/*
20** The following structure may be used to store the same data that
21** is stored in the log-summary header.
22**
23** Member variables iCheck1 and iCheck2 contain the checksum for the
24** last frame written to the log, or 2 and 3 respectively if the log
25** is currently empty.
26*/
27struct LogSummaryHdr {
28 u32 iChange; /* Counter incremented each transaction */
29 u32 pgsz; /* Database page size in bytes */
30 u32 iLastPg; /* Address of last valid frame in log */
31 u32 nPage; /* Size of database in pages */
32 u32 iCheck1; /* Checkpoint value 1 */
33 u32 iCheck2; /* Checkpoint value 2 */
34};
35
/* Number of u32 fields in a serialized LogSummaryHdr object. */
37#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))
38
39#define LOGSUMMARY_FRAME_OFFSET \
40 (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))
41
42/* Size of frame header */
43#define LOG_FRAME_HDRSIZE 20
44
45/*
46** There is one instance of this structure for each log-summary object
47** that this process has a connection to. They are stored in a linked
48** list starting at pLogSummary (global variable).
49**
50** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
51** directly in this implementation because the VFS does not support
52** the required blocking file-locks.
53*/
54struct LogSummary {
55 sqlite3_mutex *mutex; /* Mutex used to protect this object */
56 int nRef; /* Number of pointers to this structure */
57 int fd; /* File descriptor open on log-summary */
58 char *zPath; /* Path to associated WAL file */
dan64d039e2010-04-13 19:27:31 +000059 LogLock *pLock; /* Linked list of locks on this object */
dan7c246102010-04-12 19:00:29 +000060 LogSummary *pNext; /* Next in global list */
61 int nData; /* Size of aData allocation/mapping */
62 u32 *aData; /* File body */
63};
64
dan64d039e2010-04-13 19:27:31 +000065
dan7c246102010-04-12 19:00:29 +000066/*
dan64d039e2010-04-13 19:27:31 +000067** The four lockable regions associated with each log-summary. A connection
68** may take either a SHARED or EXCLUSIVE lock on each.
dan7c246102010-04-12 19:00:29 +000069*/
dan64d039e2010-04-13 19:27:31 +000070#define LOG_REGION_A 0x01
71#define LOG_REGION_B 0x02
72#define LOG_REGION_C 0x04
73#define LOG_REGION_D 0x08
74
75/*
76** A single instance of this structure is allocated as part of each
77** connection to a database log. All structures associated with the
78** same log file are linked together into a list using LogLock.pNext
79** starting at LogSummary.pLock.
80**
81** The mLock field of the structure describes the locks (if any)
82** currently held by the connection. If a SHARED lock is held on
83** any of the four locking regions, then the associated LOG_REGION_X
84** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
85** then the (LOG_REGION_X << 8) bit is set.
86*/
87struct LogLock {
88 LogLock *pNext; /* Next lock on the same log */
89 u32 mLock; /* Mask of locks */
90};
dan7c246102010-04-12 19:00:29 +000091
92struct Log {
93 LogSummary *pSummary; /* Log file summary data */
94 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
95 sqlite3_file *pFd; /* File handle for log file */
96 int sync_flags; /* Flags to use with OsSync() */
dan64d039e2010-04-13 19:27:31 +000097 int isLocked; /* Non-zero if a snapshot is held open */
dan7c246102010-04-12 19:00:29 +000098 int isWriteLocked; /* True if this is the writer connection */
99 LogSummaryHdr hdr; /* Log summary header for current snapshot */
dan64d039e2010-04-13 19:27:31 +0000100 LogLock lock; /* Lock held by this connection (if any) */
dan7c246102010-04-12 19:00:29 +0000101};
102
dan64d039e2010-04-13 19:27:31 +0000103
dan7c246102010-04-12 19:00:29 +0000104/*
105** This structure is used to implement an iterator that iterates through
106** all frames in the log in database page order. Where two or more frames
107** correspond to the same database page, the iterator visits only the
108** frame most recently written to the log.
109**
110** The internals of this structure are only accessed by:
111**
112** logCheckpointInit() - Create a new iterator,
113** logCheckpointNext() - Step an iterator,
114** logCheckpointFree() - Free an iterator.
115**
116** This functionality is used by the checkpoint code (see logCheckpoint()).
117*/
118struct LogCheckpoint {
119 int nSegment; /* Size of LogCheckpoint.aSummary[] array */
120 int nFinal; /* Elements in segment nSegment-1 */
121 struct LogSegment {
122 int iNext; /* Next aIndex index */
123 u8 *aIndex; /* Pointer to index array */
124 u32 *aDbPage; /* Pointer to db page array */
125 } aSegment[1];
126};
127
dan64d039e2010-04-13 19:27:31 +0000128
129/*
130** List of all LogSummary objects created by this process. Protected by
131** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
132** here instead of borrowing the LRU mutex.
133*/
134#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
135static LogSummary *pLogSummary = 0;
136
dan7c246102010-04-12 19:00:29 +0000137/*
138** Generate an 8 byte checksum based on the data in array aByte[] and the
139** initial values of aCksum[0] and aCksum[1]. The checksum is written into
140** aCksum[] before returning.
141*/
142#define LOG_CKSM_BYTES 8
143static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
144 u32 *z32 = (u32 *)aByte;
145 int n32 = nByte / sizeof(u32);
146 int i;
147
148 assert( LOG_CKSM_BYTES==2*sizeof(u32) );
149 assert( (nByte&0x00000003)==0 );
150
151 u32 cksum0 = aCksum[0];
152 u32 cksum1 = aCksum[1];
153
154 for(i=0; i<n32; i++){
155 cksum0 = (cksum0 >> 8) + (cksum0 ^ z32[i]);
156 cksum1 = (cksum1 >> 8) + (cksum1 ^ z32[i]);
157 }
158
159 aCksum[0] = cksum0;
160 aCksum[1] = cksum1;
161}
162
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by collapsing repeated '/'
** characters, dropping trailing '/' characters and removing any "/./" or
** "/../" elements in the path. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** An absolute path stays absolute ("//" becomes "/") and a relative path
** stays relative: "a/../b" becomes "b", and a "../" that has no preceding
** component to cancel is retained ("a/../../b" becomes "../b").
**
** A "." or ".." element that is the final component of the path is left
** untouched.
*/
static void logNormalizePath(char *zPath){
  char *z = zPath;
  int n = (int)strlen(z);
  int isAbs = (z[0]=='/');        /* True if the path starts at the root */
  int i, j;                       /* Read (i) and write (j) offsets; j<=i */

  /* Ignore trailing separators, but never shrink "/" to nothing. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    if( z[i]=='/' ){
      /* "//": drop this separator, the next one is handled in turn. The
      ** i+1<n test keeps the look-ahead inside the trimmed path.  */
      if( i+1<n && z[i+1]=='/' ) continue;

      /* "/./": skip the "/." and resume at the following '/'. */
      if( i+2<n && z[i+1]=='.' && z[i+2]=='/' ){
        i += 1;
        continue;
      }

      /* "/../": cancel the previous output component, if there is one. */
      if( i+3<n && z[i+1]=='.' && z[i+2]=='.' && z[i+3]=='/' ){
        int k = j;                /* Start of last component written */
        int nComp;                /* Length of that component */
        int isDots;               /* True if it is "." or ".." */
        while( k>0 && z[k-1]!='/' ){ k--; }
        nComp = j - k;
        isDots = (nComp==1 && z[k]=='.')
              || (nComp==2 && z[k]=='.' && z[k+1]=='.');
        if( isAbs || (j>0 && !isDots) ){
          j = (k>0 ? k-1 : 0);    /* Also drop the '/' before the component */
          i += 2;
          continue;
        }
        /* Relative path with nothing left to cancel: fall through so that
        ** the ".." is copied to the output unchanged. */
      }

      /* Never emit a leading '/' for a path that was relative on entry. */
      if( j==0 && !isAbs ) continue;
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
198
199/*
200** Lock the summary file pSummary->fd.
201*/
202static int logSummaryLock(LogSummary *pSummary){
203 int rc;
204 struct flock f;
205 memset(&f, 0, sizeof(f));
206 f.l_type = F_WRLCK;
207 f.l_whence = SEEK_SET;
208 f.l_start = 0;
209 f.l_len = 1;
210 rc = fcntl(pSummary->fd, F_SETLKW, &f);
211 if( rc!=0 ){
212 return SQLITE_IOERR;
213 }
214 return SQLITE_OK;
215}
216
217/*
218** Unlock the summary file pSummary->fd.
219*/
220static int logSummaryUnlock(LogSummary *pSummary){
221 int rc;
222 struct flock f;
223 memset(&f, 0, sizeof(f));
224 f.l_type = F_UNLCK;
225 f.l_whence = SEEK_SET;
226 f.l_start = 0;
227 f.l_len = 1;
228 rc = fcntl(pSummary->fd, F_SETLK, &f);
229 if( rc!=0 ){
230 return SQLITE_IOERR;
231 }
232 return SQLITE_OK;
233}
234
235/*
236** Memory map the first nByte bytes of the summary file opened with
237** pSummary->fd at pSummary->aData. If the summary file is smaller than
238** nByte bytes in size when this function is called, ftruncate() is
239** used to expand it before it is mapped.
240**
241** It is assumed that an exclusive lock is held on the summary file
242** by the caller (to protect the ftruncate()).
243*/
244static int logSummaryMap(LogSummary *pSummary, int nByte){
245 struct stat sStat;
246 int rc;
247 int fd = pSummary->fd;
248 void *pMap;
249
250 assert( pSummary->aData==0 );
251
252 /* If the file is less than nByte bytes in size, cause it to grow. */
253 rc = fstat(fd, &sStat);
254 if( rc!=0 ) return SQLITE_IOERR;
255 if( sStat.st_size<nByte ){
256 rc = ftruncate(fd, nByte);
257 if( rc!=0 ) return SQLITE_IOERR;
258 }
259
260 /* Map the file. */
261 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
262 if( pMap==MAP_FAILED ){
263 return SQLITE_IOERR;
264 }
265 pSummary->aData = (u32 *)pMap;
266 pSummary->nData = nByte;
267
268 return SQLITE_OK;
269}
270
271/*
272** Unmap the log-summary mapping and close the file-descriptor. If
273** the isTruncate argument is non-zero, truncate the log-summary file
274** region to zero bytes.
275**
276** Regardless of the value of isTruncate, close the file-descriptor
277** opened on the log-summary file.
278*/
279static int logSummaryUnmap(LogSummary *pSummary, int isTruncate){
280 int rc = SQLITE_OK;
281 if( pSummary->aData ){
282 assert( pSummary->fd>0 );
283 munmap(pSummary->aData, pSummary->nData);
284 pSummary->aData = 0;
285 if( isTruncate ){
286 rc = (ftruncate(pSummary->fd, 0) ? SQLITE_IOERR : SQLITE_OK);
287 }
288 }
289 if( pSummary->fd>0 ){
290 close(pSummary->fd);
291 pSummary->fd = -1;
292 }
293 return rc;
294}
295
296
297static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
298 u32 *aData = pSummary->aData;
299 memcpy(aData, pHdr, sizeof(LogSummaryHdr));
300 aData[LOGSUMMARY_HDR_NFIELD] = 1;
301 aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
302 logChecksumBytes(
303 (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
304 );
305}
306
307/*
308** This function encodes a single frame header and writes it to a buffer
309** supplied by the caller. A log frame-header is made up of a series of
310** 4-byte big-endian integers, as follows:
311**
312** 0: Database page size in bytes.
313** 4: Page number.
314** 8: New database size (for commit frames, otherwise zero).
315** 12: Frame checksum 1.
316** 16: Frame checksum 2.
317*/
318static void logEncodeFrame(
319 u32 *aCksum, /* IN/OUT: Checksum values */
320 u32 iPage, /* Database page number for frame */
321 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
322 int nData, /* Database page size (size of aData[]) */
323 u8 *aData, /* Pointer to page data (for checksum) */
324 u8 *aFrame /* OUT: Write encoded frame here */
325){
326 assert( LOG_FRAME_HDRSIZE==20 );
327
328 sqlite3Put4byte(&aFrame[0], nData);
329 sqlite3Put4byte(&aFrame[4], iPage);
330 sqlite3Put4byte(&aFrame[8], nTruncate);
331
332 logChecksumBytes(aFrame, 12, aCksum);
333 logChecksumBytes(aData, nData, aCksum);
334
335 sqlite3Put4byte(&aFrame[12], aCksum[0]);
336 sqlite3Put4byte(&aFrame[16], aCksum[1]);
337}
338
339/*
340** Return 1 and populate *piPage, *pnTruncate and aCksum if the
341** frame checksum looks Ok. Otherwise return 0.
342*/
343static int logDecodeFrame(
344 u32 *aCksum, /* IN/OUT: Checksum values */
345 u32 *piPage, /* OUT: Database page number for frame */
346 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
347 int nData, /* Database page size (size of aData[]) */
348 u8 *aData, /* Pointer to page data (for checksum) */
349 u8 *aFrame /* Frame data */
350){
351 logChecksumBytes(aFrame, 12, aCksum);
352 logChecksumBytes(aData, nData, aCksum);
353
354 if( aCksum[0]!=sqlite3Get4byte(&aFrame[12])
355 || aCksum[1]!=sqlite3Get4byte(&aFrame[16])
356 ){
357 /* Checksum failed. */
358 return 0;
359 }
360
361 *piPage = sqlite3Get4byte(&aFrame[4]);
362 *pnTruncate = sqlite3Get4byte(&aFrame[8]);
363 return 1;
364}
365
366static void logMergesort8(
367 Pgno *aContent, /* Pages in log */
368 u8 *aBuffer, /* Buffer of at least *pnList items to use */
369 u8 *aList, /* IN/OUT: List to sort */
370 int *pnList /* IN/OUT: Number of elements in aList[] */
371){
372 int nList = *pnList;
373 if( nList>1 ){
374 int nLeft = nList / 2; /* Elements in left list */
375 int nRight = nList - nLeft; /* Elements in right list */
376 u8 *aLeft = aList; /* Left list */
377 u8 *aRight = &aList[nLeft]; /* Right list */
378 int iLeft = 0; /* Current index in aLeft */
379 int iRight = 0; /* Current index in aright */
380 int iOut = 0; /* Current index in output buffer */
381
382 /* TODO: Change to non-recursive version. */
383 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
384 logMergesort8(aContent, aBuffer, aRight, &nRight);
385
386 while( iRight<nRight || iLeft<nLeft ){
387 u8 logpage;
388 Pgno dbpage;
389
390 if( (iLeft<nLeft)
391 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
392 ){
393 logpage = aLeft[iLeft++];
394 }else{
395 logpage = aRight[iRight++];
396 }
397 dbpage = aContent[logpage];
398
399 aBuffer[iOut++] = logpage;
400 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
401
402 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
403 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
404 }
405 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
406 *pnList = iOut;
407 }
408
409#ifdef SQLITE_DEBUG
410 {
411 int i;
412 for(i=1; i<*pnList; i++){
413 assert( aContent[aList[i]] > aContent[aList[i-1]] );
414 }
415 }
416#endif
417}
418
419
420/*
421** Return the index in the LogSummary.aData array that corresponds to
422** frame iFrame. The log-summary file consists of a header, followed by
423** alternating "map" and "index" blocks.
424*/
425static int logSummaryEntry(u32 iFrame){
426 return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
427}
428
429
430/*
431** Set an entry in the log-summary map to map log frame iFrame to db
432** page iPage. Values are always appended to the log-summary (i.e. the
433** value of iFrame is always exactly one more than the value passed to
434** the previous call), but that restriction is not enforced or asserted
435** here.
436*/
437static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
438 u32 iSlot = logSummaryEntry(iFrame);
439
440 /* Set the log-summary entry itself */
441 pSummary->aData[iSlot] = iPage;
442
443 /* If the frame number is a multiple of 256 (frames are numbered starting
444 ** at 1), build an index of the most recently added 256 frames.
445 */
446 if( (iFrame&0x000000FF)==0 ){
447 int i; /* Iterator used while initializing aIndex */
448 u32 *aFrame; /* Pointer to array of 256 frames */
449 int nIndex; /* Number of entries in index */
450 u8 *aIndex; /* 256 bytes to build index in */
451 u8 *aTmp; /* Scratch space to use while sorting */
452
453 aFrame = &pSummary->aData[iSlot-255];
454 aIndex = (u8 *)&pSummary->aData[iSlot+1];
455 aTmp = &aIndex[256];
456
457 nIndex = 256;
458 for(i=0; i<256; i++) aIndex[i] = (u8)i;
459 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
460 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
461 }
462}
463
464
465/*
466** Recover the log-summary by reading the log file. The caller must hold
467** an exclusive lock on the log-summary file.
468*/
469static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
470 int rc; /* Return Code */
471 i64 nSize; /* Size of log file */
472 LogSummaryHdr hdr; /* Recovered log-summary header */
473
474 memset(&hdr, 0, sizeof(hdr));
475
476 rc = sqlite3OsFileSize(pFd, &nSize);
477 if( rc!=SQLITE_OK ){
478 return rc;
479 }
480
481 if( nSize>LOG_FRAME_HDRSIZE ){
482 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
483 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
484 int nFrame; /* Number of bytes at aFrame */
485 u8 *aData; /* Pointer to data part of aFrame buffer */
486 int iFrame; /* Index of last frame read */
487 i64 iOffset; /* Next offset to read from log file */
488 int nPgsz; /* Page size according to the log */
489 u32 aCksum[2] = {2, 3}; /* Running checksum */
490
491 /* Read in the first frame header in the file (to determine the
492 ** database page size).
493 */
494 rc = sqlite3OsRead(pFd, aBuf, LOG_FRAME_HDRSIZE, 0);
495 if( rc!=SQLITE_OK ){
496 return rc;
497 }
498
499 /* If the database page size is not a power of two, or is greater than
500 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
501 */
502 nPgsz = sqlite3Get4byte(&aBuf[0]);
503 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
504 goto finished;
505 }
506
507 /* Malloc a buffer to read frames into. */
508 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
509 aFrame = (u8 *)sqlite3_malloc(nFrame);
510 if( !aFrame ){
511 return SQLITE_NOMEM;
512 }
513 aData = &aFrame[LOG_FRAME_HDRSIZE];
514
515 /* Read all frames from the log file. */
516 iFrame = 0;
517 iOffset = 0;
518 for(iOffset=0; (iOffset+nFrame)<nSize; iOffset+=nFrame){
519 u32 pgno; /* Database page number for frame */
520 u32 nTruncate; /* dbsize field from frame header */
521 int isValid; /* True if this frame is valid */
522
523 /* Read and decode the next log frame. */
524 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
525 if( rc!=SQLITE_OK ) break;
526 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
527 if( !isValid ) break;
528 logSummaryAppend(pSummary, ++iFrame, pgno);
529
530 /* If nTruncate is non-zero, this is a commit record. */
531 if( nTruncate ){
532 hdr.iCheck1 = aCksum[0];
533 hdr.iCheck2 = aCksum[1];
534 hdr.iLastPg = iFrame;
535 hdr.nPage = nTruncate;
536 hdr.pgsz = nPgsz;
537 }
538 }
539
540 sqlite3_free(aFrame);
541 }else{
542 hdr.iCheck1 = 2;
543 hdr.iCheck2 = 3;
544 }
545
546finished:
547 logSummaryWriteHdr(pSummary, &hdr);
548 return rc;
549}
550
551
552/*
553** This function intializes the connection to the log-summary identified
554** by struct pSummary.
555*/
556static int logSummaryInit(LogSummary *pSummary, sqlite3_file *pFd){
557 int rc; /* Return Code */
558 char *zFile; /* File name for summary file */
559
560 assert( pSummary->fd<0 );
561 assert( pSummary->aData==0 );
562 assert( pSummary->nRef>0 );
563 assert( pSummary->zPath );
564
565 /* Open a file descriptor on the summary file. */
566 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
567 if( !zFile ){
568 return SQLITE_NOMEM;
569 }
570 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
571 sqlite3_free(zFile);
572 if( pSummary->fd<0 ){
573 return SQLITE_IOERR;
574 }
575
576 /* Grab an exclusive lock the summary file. Then mmap() it. TODO: This
577 ** code needs to be enhanced to support a growable mapping. For now, just
578 ** make the mapping very large to start with.
579 */
580 rc = logSummaryLock(pSummary);
581 if( rc!=SQLITE_OK ) return rc;
582 rc = logSummaryMap(pSummary, 512*1024);
583 if( rc!=SQLITE_OK ) goto out;
584
585 /* Grab a SHARED lock on the log file. Then try to upgrade to an EXCLUSIVE
586 ** lock. If successful, then this is the first (and only) connection to
587 ** the database. In this case assume the contents of the log-summary
588 ** cannot be trusted. Zero the log-summary header to make sure.
589 **
590 ** The SHARED lock on the log file is not released until the connection
591 ** to the database is closed.
592 */
593 rc = sqlite3OsLock(pFd, SQLITE_LOCK_SHARED);
594 if( rc!=SQLITE_OK ) goto out;
595 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
596 if( rc==SQLITE_OK ){
597 /* This is the first and only connection. */
598 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
599 rc = sqlite3OsUnlock(pFd, SQLITE_LOCK_SHARED);
600 }else if( rc==SQLITE_BUSY ){
601 rc = SQLITE_OK;
602 }
603
604 out:
605 logSummaryUnlock(pSummary);
606 return rc;
607}
608
609/*
610** Open a connection to the log file associated with database zDb. The
611** database file does not actually have to exist. zDb is used only to
612** figure out the name of the log file to open. If the log file does not
613** exist it is created by this call.
614*/
615int sqlite3LogOpen(
616 sqlite3_vfs *pVfs, /* vfs module to open log file with */
617 const char *zDb, /* Name of database file */
618 Log **ppLog /* OUT: Allocated Log handle */
619){
danb9bf16b2010-04-14 11:23:30 +0000620 int rc = SQLITE_OK; /* Return Code */
dan7c246102010-04-12 19:00:29 +0000621 Log *pRet; /* Object to allocate and return */
622 LogSummary *pSummary = 0; /* Summary object */
623 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
624 int flags; /* Flags passed to OsOpen() */
625 char *zWal = 0; /* Path to WAL file */
626 int nWal; /* Length of zWal in bytes */
627
628 /* Zero output variables */
629 assert( zDb );
630 *ppLog = 0;
631
632 /* Allocate an instance of struct Log to return. */
633 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
634 if( !pRet ) goto out;
635 pRet->pVfs = pVfs;
636 pRet->pFd = (sqlite3_file *)&pRet[1];
637 pRet->sync_flags = SQLITE_SYNC_NORMAL;
638
639 /* Normalize the path name. */
640 zWal = sqlite3_mprintf("%s-wal", zDb);
641 if( !zWal ) goto out;
642 logNormalizePath(zWal);
643 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_DB);
644 nWal = sqlite3Strlen30(zWal);
645
646 /* Enter the mutex that protects the linked-list of LogSummary structures */
647 if( sqlite3GlobalConfig.bCoreMutex ){
648 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
649 }
650 sqlite3_mutex_enter(mutex);
651
652 /* Search for an existing log summary object in the linked list. If one
653 ** cannot be found, allocate and initialize a new object.
654 */
655 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
656 int nPath = sqlite3Strlen30(pSummary->zPath);
657 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
658 }
659 if( !pSummary ){
660 int nByte = sizeof(LogSummary) + nWal + 1;
661 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
662 if( !pSummary ){
663 rc = SQLITE_NOMEM;
664 goto out;
665 }
666 if( sqlite3GlobalConfig.bCoreMutex ){
667 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
668 }
669 pSummary->zPath = (char *)&pSummary[1];
670 pSummary->fd = -1;
671 memcpy(pSummary->zPath, zWal, nWal);
672 pSummary->pNext = pLogSummary;
673 pLogSummary = pSummary;
674 }
675 pSummary->nRef++;
676 pRet->pSummary = pSummary;
677
678 /* Exit the mutex protecting the linked-list of LogSummary objects. */
679 sqlite3_mutex_leave(mutex);
680 mutex = 0;
681
682 /* Open file handle on the log file. */
683 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
684 if( rc!=SQLITE_OK ) goto out;
685
686 /* Object pSummary is shared between all connections to the database made
687 ** by this process. So at this point it may or may not be connected to
688 ** the log-summary. If it is not, connect it. Otherwise, just take the
689 ** SHARED lock on the log file.
690 */
691 sqlite3_mutex_enter(pSummary->mutex);
692 mutex = pSummary->mutex;
693 if( pSummary->fd<0 ){
694 rc = logSummaryInit(pSummary, pRet->pFd);
695 }else{
696 rc = sqlite3OsLock(pRet->pFd, SQLITE_LOCK_SHARED);
697 }
698
dan64d039e2010-04-13 19:27:31 +0000699 pRet->lock.pNext = pSummary->pLock;
700 pSummary->pLock = &pRet->lock;
701
dan7c246102010-04-12 19:00:29 +0000702 out:
703 sqlite3_mutex_leave(mutex);
704 sqlite3_free(zWal);
705 if( rc!=SQLITE_OK ){
706 assert(0);
707 if( pRet ){
708 sqlite3OsClose(pRet->pFd);
709 sqlite3_free(pRet);
710 }
711 assert( !pSummary || pSummary->nRef==0 );
712 sqlite3_free(pSummary);
713 }
714 *ppLog = pRet;
715 return rc;
716}
717
718static int logCheckpointNext(
719 LogCheckpoint *p, /* Iterator */
720 u32 *piPage, /* OUT: Next db page to write */
721 u32 *piFrame /* OUT: Log frame to read from */
722){
723 u32 iMin = *piPage;
724 u32 iRet = 0xFFFFFFFF;
725 int i;
726 int nBlock = p->nFinal;
727
728 for(i=p->nSegment-1; i>=0; i--){
729 struct LogSegment *pSegment = &p->aSegment[i];
730 while( pSegment->iNext<nBlock ){
731 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
732 if( iPg>iMin ){
733 if( iPg<iRet ){
734 iRet = iPg;
735 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
736 }
737 break;
738 }
739 pSegment->iNext++;
740 }
741
742 nBlock = 256;
743 }
744
745 *piPage = iRet;
746 return (iRet==0xFFFFFFFF);
747}
748
749static LogCheckpoint *logCheckpointInit(Log *pLog){
750 u32 *aData = pLog->pSummary->aData;
751 LogCheckpoint *p; /* Return value */
752 int nSegment; /* Number of segments to merge */
753 u32 iLast; /* Last frame in log */
754 int nByte; /* Number of bytes to allocate */
755 int i; /* Iterator variable */
756 int nFinal; /* Number of unindexed entries */
757 struct LogSegment *pFinal; /* Final (unindexed) segment */
758 u8 *aTmp; /* Temp space used by merge-sort */
759
760 iLast = pLog->hdr.iLastPg;
761 nSegment = (iLast >> 8) + 1;
762 nFinal = (iLast & 0x000000FF);
763
764 nByte = sizeof(LogCheckpoint) + (nSegment-1)*sizeof(struct LogSegment) + 512;
765 p = (LogCheckpoint *)sqlite3_malloc(nByte);
766 if( p ){
767 memset(p, 0, nByte);
768 p->nSegment = nSegment;
769 p->nFinal = nFinal;
770 }
771
772 for(i=0; i<nSegment-1; i++){
773 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
774 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
775 }
776 pFinal = &p->aSegment[nSegment-1];
777
778 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
779 pFinal->aIndex = (u8 *)&pFinal[1];
780 aTmp = &pFinal->aIndex[256];
781 for(i=0; i<nFinal; i++){
782 pFinal->aIndex[i] = i;
783 }
784 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
785 p->nFinal = nFinal;
786
787 return p;
788}
789
790/*
791** Free a log iterator allocated by logCheckpointInit().
792*/
793static void logCheckpointFree(LogCheckpoint *p){
794 sqlite3_free(p);
795}
796
797/*
798** Checkpoint the contents of the log file.
799*/
800static int logCheckpoint(
801 Log *pLog, /* Log connection */
802 sqlite3_file *pFd, /* File descriptor open on db file */
803 u8 *zBuf /* Temporary buffer to use */
804){
805 int rc; /* Return code */
806 int pgsz = pLog->hdr.pgsz; /* Database page-size */
807 LogCheckpoint *pIter = 0; /* Log iterator context */
808 u32 iDbpage = 0; /* Next database page to write */
danb9bf16b2010-04-14 11:23:30 +0000809 u32 iFrame = 0; /* Log frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +0000810
811 /* Allocate the iterator */
812 pIter = logCheckpointInit(pLog);
813 if( !pIter ) return SQLITE_NOMEM;
814
815 /* Sync the log file to disk */
816 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
817 if( rc!=SQLITE_OK ) goto out;
818
819 /* Iterate through the contents of the log, copying data to the db file. */
820 while( 0==logCheckpointNext(pIter, &iDbpage, &iFrame) ){
821 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
822 (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
823 );
824 if( rc!=SQLITE_OK ) goto out;
825 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
826 if( rc!=SQLITE_OK ) goto out;
827 }
828
829 /* Truncate the database file */
830 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
831 if( rc!=SQLITE_OK ) goto out;
832
833 /* Sync the database file. If successful, update the log-summary. */
834 rc = sqlite3OsSync(pFd, pLog->sync_flags);
835 if( rc!=SQLITE_OK ) goto out;
836 pLog->hdr.iLastPg = 0;
837 pLog->hdr.iCheck1 = 2;
838 pLog->hdr.iCheck2 = 3;
839 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
840
841 /* TODO: If a crash occurs and the current log is copied into the
842 ** database there is no problem. However, if a crash occurs while
843 ** writing the next transaction into the start of the log, such that:
844 **
845 ** * The first transaction currently in the log is left intact, but
846 ** * The second (or subsequent) transaction is damaged,
847 **
848 ** then the database could become corrupt.
849 **
850 ** The easiest thing to do would be to write and sync a dummy header
851 ** into the log at this point. Unfortunately, that turns out to be
852 ** an unwelcome performance hit. Alternatives are...
853 */
854#if 0
855 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
856 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
857 if( rc!=SQLITE_OK ) goto out;
858 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
859#endif
860
861 out:
862 logCheckpointFree(pIter);
863 return rc;
864}
865
866/*
867** Close a connection to a log file.
868*/
869int sqlite3LogClose(
870 Log *pLog, /* Log to close */
871 sqlite3_file *pFd, /* Database file */
872 u8 *zBuf /* Buffer of at least page-size bytes */
873){
874 int rc = SQLITE_OK;
875 if( pLog ){
dan64d039e2010-04-13 19:27:31 +0000876 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +0000877 LogSummary *pSummary = pLog->pSummary;
878 sqlite3_mutex *mutex = 0;
879
dan64d039e2010-04-13 19:27:31 +0000880 sqlite3_mutex_enter(pSummary->mutex);
881 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
882 *ppL = pLog->lock.pNext;
883 sqlite3_mutex_leave(pSummary->mutex);
884
dan7c246102010-04-12 19:00:29 +0000885 if( sqlite3GlobalConfig.bCoreMutex ){
886 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
887 }
888 sqlite3_mutex_enter(mutex);
889
890 /* Decrement the reference count on the log summary. If this is the last
891 ** reference to the log summary object in this process, the object will
892 ** be freed. If this is also the last connection to the database, then
893 ** checkpoint the database and truncate the log and log-summary files
894 ** to zero bytes in size.
895 **/
896 pSummary->nRef--;
897 if( pSummary->nRef==0 ){
898 LogSummary **pp;
899
900 rc = logSummaryLock(pSummary);
901 if( rc==SQLITE_OK ){
902 int isTruncate = 0;
903 int rc2 = sqlite3OsLock(pLog->pFd, SQLITE_LOCK_EXCLUSIVE);
904 if( rc2==SQLITE_OK ){
905 /* This is the last connection to the database (including other
906 ** processes). Do three things:
907 **
908 ** 1. Checkpoint the db.
909 ** 2. Truncate the log file to zero bytes.
910 ** 3. Truncate the log-summary file to zero bytes.
911 */
912 rc2 = logCheckpoint(pLog, pFd, zBuf);
913 if( rc2==SQLITE_OK ){
914 rc2 = sqlite3OsTruncate(pLog->pFd, 0);
915 }
916 isTruncate = 1;
917 }else if( rc2==SQLITE_BUSY ){
918 rc2 = SQLITE_OK;
919 }
920 logSummaryUnmap(pSummary, isTruncate);
921 sqlite3OsUnlock(pLog->pFd, SQLITE_LOCK_NONE);
922 rc = logSummaryUnlock(pSummary);
923 if( rc2!=SQLITE_OK ) rc = rc2;
924 }
925
926 /* Remove the LogSummary object from the global list. Then free the
927 ** mutex and the object itself.
928 */
929 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
930 *pp = (*pp)->pNext;
931 sqlite3_mutex_free(pSummary->mutex);
932 sqlite3_free(pSummary);
933 }
934
935 sqlite3_mutex_leave(mutex);
936
937 /* Close the connection to the log file and free the Log handle. */
938 sqlite3OsClose(pLog->pFd);
939 sqlite3_free(pLog);
940 }
941 return rc;
942}
943
944/*
945** Set the flags to pass to the sqlite3OsSync() function when syncing
946** the log file.
947*/
948#if 0
949void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
950 assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL );
951 pLog->sync_flags = sync_flags;
952}
953#endif
954
955/*
956** Enter and leave the log-summary mutex. In this context, entering the
957** log-summary mutex means:
958**
959** 1. Obtaining mutex pLog->pSummary->mutex, and
960** 2. Taking an exclusive lock on the log-summary file.
961**
962** i.e. this mutex locks out other processes as well as other threads
963** hosted in this address space.
964*/
965static int logEnterMutex(Log *pLog){
966 LogSummary *pSummary = pLog->pSummary;
967 int rc;
968
969 sqlite3_mutex_enter(pSummary->mutex);
970 rc = logSummaryLock(pSummary);
971 if( rc!=SQLITE_OK ){
972 sqlite3_mutex_leave(pSummary->mutex);
973 }
974 return rc;
975}
976static void logLeaveMutex(Log *pLog){
977 LogSummary *pSummary = pLog->pSummary;
978 logSummaryUnlock(pSummary);
979 sqlite3_mutex_leave(pSummary->mutex);
980}
981
982/*
dan64d039e2010-04-13 19:27:31 +0000983** Values for the second parameter to logLockRegion().
984*/
985#define LOG_UNLOCK 0
986#define LOG_RDLOCK 1
987#define LOG_WRLOCK 2
988
989static int logLockRegion(Log *pLog, u32 mRegion, int op){
990 LogSummary *pSummary = pLog->pSummary;
991 LogLock *p; /* Used to iterate through in-process locks */
dan02bb5962010-04-14 15:49:40 +0000992 u32 mOther; /* Locks held by other connections */
993 u32 mNew; /* New mask for pLog */
dan64d039e2010-04-13 19:27:31 +0000994
995 assert(
996 /* Writer lock operations */
997 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
998 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
999
dan02bb5962010-04-14 15:49:40 +00001000 /* Normal reader lock operations */
dan64d039e2010-04-13 19:27:31 +00001001 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
dan64d039e2010-04-13 19:27:31 +00001002 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
1003 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
dan02bb5962010-04-14 15:49:40 +00001004
1005 /* Region D reader lock operations */
1006 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
dan64d039e2010-04-13 19:27:31 +00001007 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
1008
1009 /* Checkpointer lock operations */
1010 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
1011 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
dan64d039e2010-04-13 19:27:31 +00001012 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
dan02bb5962010-04-14 15:49:40 +00001013 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
dan64d039e2010-04-13 19:27:31 +00001014 );
1015
dan02bb5962010-04-14 15:49:40 +00001016 /* Assert that a connection never tries to go from an EXCLUSIVE to a
1017 ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
1018 ** happens though (when a region D reader upgrades to a writer).
1019 */
1020 assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );
1021
dan64d039e2010-04-13 19:27:31 +00001022 sqlite3_mutex_enter(pSummary->mutex);
1023
dan02bb5962010-04-14 15:49:40 +00001024 /* Calculate a mask of logs held by all connections in this process apart
1025 ** from this one. The least significant byte of the mask contains a mask
1026 ** of the SHARED logs held. The next least significant byte of the mask
1027 ** indicates the EXCLUSIVE locks held. For example, to test if some other
1028 ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
1029 ** on region C, do:
1030 **
1031 ** hasSharedOnA = (mOther & (LOG_REGION_A<<0));
1032 ** hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
1033 **
1034 ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
1035 ** corresponding bit in the SHARED mask.
dan64d039e2010-04-13 19:27:31 +00001036 */
dan02bb5962010-04-14 15:49:40 +00001037 mOther = 0;
1038 for(p=pSummary->pLock; p; p=p->pNext){
1039 assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
1040 if( p!=&pLog->lock ){
1041 mOther |= p->mLock;
dan64d039e2010-04-13 19:27:31 +00001042 }
1043 }
1044
dan02bb5962010-04-14 15:49:40 +00001045 /* If this call is to lock a region (not to unlock one), test if locks held
1046 ** by any other connection in this process prevent the new locks from
1047 ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
1048 */
1049 if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
1050 sqlite3_mutex_leave(pSummary->mutex);
1051 return SQLITE_BUSY;
1052 }
1053
1054 /* Figure out the new log mask for this connection. */
dan64d039e2010-04-13 19:27:31 +00001055 switch( op ){
1056 case LOG_UNLOCK:
dan02bb5962010-04-14 15:49:40 +00001057 mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
dan64d039e2010-04-13 19:27:31 +00001058 break;
1059 case LOG_RDLOCK:
dan02bb5962010-04-14 15:49:40 +00001060 mNew = (pLog->lock.mLock | mRegion);
dan64d039e2010-04-13 19:27:31 +00001061 break;
1062 default:
1063 assert( op==LOG_WRLOCK );
dan02bb5962010-04-14 15:49:40 +00001064 mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
dan64d039e2010-04-13 19:27:31 +00001065 break;
1066 }
1067
dan02bb5962010-04-14 15:49:40 +00001068 /* Now modify the locks held on the log-summary file descriptor. This
1069 ** file descriptor is shared by all log connections in this process.
1070 ** Therefore:
1071 **
1072 ** + If one or more log connections in this process hold a SHARED lock
1073 ** on a region, the file-descriptor should hold a SHARED lock on
1074 ** the file region.
1075 **
1076 ** + If a log connection in this process holds an EXCLUSIVE lock on a
1077 ** region, the file-descriptor should also hold an EXCLUSIVE lock on
1078 ** the region in question.
1079 **
1080 ** If this is an LOG_UNLOCK operation, only regions for which no other
1081 ** connection holds a lock should actually be unlocked. And if this
1082 ** is a LOG_RDLOCK operation and other connections already hold all
1083 ** the required SHARED locks, then no system call is required.
1084 */
1085 if( op==LOG_UNLOCK ){
1086 mRegion = (mRegion & ~mOther);
dan64d039e2010-04-13 19:27:31 +00001087 }
dan02bb5962010-04-14 15:49:40 +00001088 if( (op==LOG_WRLOCK)
1089 || (op==LOG_UNLOCK && mRegion)
1090 || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
1091 ){
1092 struct LockMap {
1093 int iStart; /* Byte offset to start locking operation */
1094 int iLen; /* Length field for locking operation */
1095 } aMap[] = {
1096 /* 0000 */ {0, 0}, /* 0001 */ {4, 1},
1097 /* 0010 */ {3, 1}, /* 0011 */ {3, 2},
1098 /* 0100 */ {2, 1}, /* 0101 */ {0, 0},
1099 /* 0110 */ {2, 2}, /* 0111 */ {2, 3},
1100 /* 1000 */ {1, 1}, /* 1001 */ {0, 0},
1101 /* 1010 */ {0, 0}, /* 1011 */ {0, 0},
1102 /* 1100 */ {1, 2}, /* 1101 */ {0, 0},
1103 /* 1110 */ {1, 3}, /* 1111 */ {0, 0}
1104 };
1105 int rc; /* Return code of fcntl() */
1106 struct flock f; /* Locking operation */
dan64d039e2010-04-13 19:27:31 +00001107
dan02bb5962010-04-14 15:49:40 +00001108 assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );
1109
dan64d039e2010-04-13 19:27:31 +00001110 memset(&f, 0, sizeof(f));
1111 f.l_type = (op==LOG_WRLOCK?F_WRLCK:(op==LOG_RDLOCK?F_RDLCK:F_UNLCK));
1112 f.l_whence = SEEK_SET;
dan02bb5962010-04-14 15:49:40 +00001113 f.l_start = 32 + aMap[mRegion].iStart;
1114 f.l_len = aMap[mRegion].iLen;
dan64d039e2010-04-13 19:27:31 +00001115
1116 rc = fcntl(pSummary->fd, F_SETLK, &f);
1117 if( rc!=0 ){
1118 sqlite3_mutex_leave(pSummary->mutex);
1119 return SQLITE_BUSY;
1120 }
1121 }
1122
dan02bb5962010-04-14 15:49:40 +00001123 pLog->lock.mLock = mNew;
dan64d039e2010-04-13 19:27:31 +00001124 sqlite3_mutex_leave(pSummary->mutex);
1125 return SQLITE_OK;
1126}
1127
1128/*
danb9bf16b2010-04-14 11:23:30 +00001129** Try to read the log-summary header. Attempt to verify the header
1130** checksum. If the checksum can be verified, copy the log-summary
1131** header into structure pLog->hdr. If the contents of pLog->hdr are
1132** modified by this and pChanged is not NULL, set *pChanged to 1.
1133** Otherwise leave *pChanged unmodified.
1134**
1135** If the checksum cannot be verified return SQLITE_ERROR.
1136*/
1137int logSummaryTryHdr(Log *pLog, int *pChanged){
1138 u32 aCksum[2] = {1, 1};
1139 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1140
1141 /* First try to read the header without a lock. Verify the checksum
1142 ** before returning. This will almost always work.
1143 */
1144 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1145 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1146 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1147 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1148 ){
1149 return SQLITE_ERROR;
1150 }
1151
1152 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1153 if( pChanged ){
1154 *pChanged = 1;
1155 }
1156 memcpy(&pLog->hdr, aHdr, sizeof(LogSummaryHdr));
1157 }
1158 return SQLITE_OK;
1159}
1160
1161/*
1162** Read the log-summary header from the log-summary file into structure
1163** pLog->hdr. If attempting to verify the header checksum fails, try
1164** to recover the log before returning.
1165**
1166** If the log-summary header is successfully read, return SQLITE_OK.
1167** Otherwise an SQLite error code.
1168*/
1169int logSummaryReadHdr(Log *pLog, int *pChanged){
1170 int rc;
1171
1172 /* First try to read the header without a lock. Verify the checksum
1173 ** before returning. This will almost always work.
1174 */
1175 if( SQLITE_OK==logSummaryTryHdr(pLog, pChanged) ){
1176 return SQLITE_OK;
1177 }
1178
1179 /* If the first attempt to read the header failed, lock the log-summary
1180 ** file and try again. If the header checksum verification fails this
1181 ** time as well, run log recovery.
1182 */
1183 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1184 if( SQLITE_OK!=logSummaryTryHdr(pLog, pChanged) ){
1185 if( pChanged ){
1186 *pChanged = 1;
1187 }
1188 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1189 if( rc==SQLITE_OK ){
1190 rc = logSummaryTryHdr(pLog, 0);
1191 }
1192 }
1193 logLeaveMutex(pLog);
1194 }
1195
1196 return rc;
1197}
1198
1199/*
dan64d039e2010-04-13 19:27:31 +00001200** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001201**
1202** If this call obtains a new read-lock and the database contents have been
1203** modified since the most recent call to LogCloseSnapshot() on this Log
1204** connection, then *pChanged is set to 1 before returning. Otherwise, it
1205** is left unmodified. This is used by the pager layer to determine whether
1206** or not any cached pages may be safely reused.
1207*/
1208int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
1209 int rc = SQLITE_OK;
1210 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001211 int nAttempt;
1212
1213 /* Obtain a snapshot-lock on the log-summary file. The procedure
1214 ** for obtaining the snapshot log is:
1215 **
1216 ** 1. Attempt a SHARED lock on regions A and B.
1217 ** 2a. If step 1 is successful, drop the lock on region B.
1218 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1219 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1220 ** successful.
1221 **
1222 ** If neither of the locks can be obtained after 5 tries, presumably
1223 ** something is wrong (i.e. a process not following the locking protocol).
1224 ** Return an error code in this case.
1225 */
1226 rc = SQLITE_BUSY;
1227 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1228 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1229 if( rc==SQLITE_BUSY ){
1230 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1231 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1232 }else{
1233 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1234 pLog->isLocked = LOG_REGION_A;
1235 }
1236 }
1237 if( rc!=SQLITE_OK ){
1238 return rc;
1239 }
1240
danb9bf16b2010-04-14 11:23:30 +00001241 rc = logSummaryReadHdr(pLog, pChanged);
dan64d039e2010-04-13 19:27:31 +00001242 if( rc!=SQLITE_OK ){
1243 /* An error occured while attempting log recovery. */
1244 sqlite3LogCloseSnapshot(pLog);
1245 }
dan7c246102010-04-12 19:00:29 +00001246 }
1247 return rc;
1248}
1249
1250/*
1251** Unlock the current snapshot.
1252*/
1253void sqlite3LogCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001254 if( pLog->isLocked ){
1255 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1256 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1257 }
dan7c246102010-04-12 19:00:29 +00001258 pLog->isLocked = 0;
1259}
1260
1261
1262
1263/*
1264** Read a page from the log, if it is present.
1265*/
1266int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
1267 u32 iRead = 0;
1268 u32 *aData = pLog->pSummary->aData;
1269 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1270
1271 /* Do a linear search of the unindexed block of page-numbers (if any)
1272 ** at the end of the log-summary. An alternative to this would be to
1273 ** build an index in private memory each time a read transaction is
1274 ** opened on a new snapshot.
1275 */
1276 if( pLog->hdr.iLastPg ){
1277 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1278 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1279 while( *pi!=pgno && pi!=piStop ) pi--;
1280 if( pi!=piStop ){
1281 iRead = (pi-piStop) + iFrame;
1282 }
1283 }
1284 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1285
1286 while( iRead==0 && iFrame>0 ){
1287 int iLow = 0;
1288 int iHigh = 255;
1289 u32 *aFrame;
1290 u8 *aIndex;
1291
1292 iFrame -= 256;
1293 aFrame = &aData[logSummaryEntry(iFrame+1)];
1294 aIndex = (u8 *)&aFrame[256];
1295
1296 while( iLow<=iHigh ){
1297 int iTest = (iLow+iHigh)>>1;
1298 u32 iPg = aFrame[aIndex[iTest]];
1299
1300 if( iPg==pgno ){
1301 iRead = iFrame + 1 + aIndex[iTest];
1302 break;
1303 }
1304 else if( iPg<pgno ){
1305 iLow = iTest+1;
1306 }else{
1307 iHigh = iTest-1;
1308 }
1309 }
1310 }
1311 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1312
1313 /* If iRead is non-zero, then it is the log frame number that contains the
1314 ** required page. Read and return data from the log file.
1315 */
1316 if( iRead ){
1317 i64 iOffset = (iRead-1) * (pLog->hdr.pgsz+LOG_FRAME_HDRSIZE);
1318 iOffset += LOG_FRAME_HDRSIZE;
1319 *pInLog = 1;
1320 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1321 }
1322
1323 *pInLog = 0;
1324 return SQLITE_OK;
1325}
1326
1327
1328/*
1329** Set *pPgno to the size of the database file (or zero, if unknown).
1330*/
1331void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
1332 assert( pLog->isLocked );
1333 *pPgno = pLog->hdr.nPage;
1334}
1335
1336/*
1337** The caller must hold at least a RESERVED lock on the database file
1338** when invoking this function.
1339**
1340** This function returns SQLITE_OK if the caller may write to the database.
1341** Otherwise, if the caller is operating on a snapshot that has already
1342** been overwritten by another writer, SQLITE_OBE is returned.
1343*/
1344int sqlite3LogWriteLock(Log *pLog, int op){
1345 assert( pLog->isLocked );
1346 if( op ){
dan64d039e2010-04-13 19:27:31 +00001347
1348 /* Obtain the writer lock */
1349 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1350 if( rc!=SQLITE_OK ){
1351 return rc;
1352 }
1353
dan02bb5962010-04-14 15:49:40 +00001354 /* TODO: What if this is a region D reader? And after writing this
1355 ** transaction it continues to hold a read-lock on the db? Maybe we
1356 ** need to switch it to a region A reader here so that unlocking C|D
1357 ** does not leave the connection with no lock at all.
1358 */
1359 assert( pLog->isLocked!=LOG_REGION_D );
1360
dan7c246102010-04-12 19:00:29 +00001361 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
1362 return SQLITE_BUSY;
1363 }
1364 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001365
dan7c246102010-04-12 19:00:29 +00001366 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001367 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001368 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1369 pLog->isWriteLocked = 0;
1370 }
1371 return SQLITE_OK;
1372}
1373
1374/*
1375** Write a set of frames to the log. The caller must hold at least a
1376** RESERVED lock on the database file.
1377*/
1378int sqlite3LogFrames(
1379 Log *pLog, /* Log handle to write to */
1380 int nPgsz, /* Database page-size in bytes */
1381 PgHdr *pList, /* List of dirty pages to write */
1382 Pgno nTruncate, /* Database size after this commit */
1383 int isCommit, /* True if this is a commit */
1384 int isSync /* True to sync the log file */
1385){
1386 /* Each frame has a 20 byte header, as follows:
1387 **
1388 ** + Pseudo-random salt (4 bytes)
1389 ** + Page number (4 bytes)
1390 ** + New database size, or 0 if not a commit frame (4 bytes)
1391 ** + Checksum (CHECKSUM_BYTES bytes);
1392 **
1393 ** The checksum is computed based on the following:
1394 **
1395 ** + The previous checksum, or {2, 3} for the first frame in the log.
1396 ** + The non-checksum fields of the frame header, and
1397 ** + The frame contents (page data).
1398 **
1399 ** This format must also be understood by the code in logSummaryRecover().
1400 ** The size of the frame header is used by LogRead() and LogCheckpoint().
1401 */
1402 int rc; /* Used to catch return codes */
1403 u32 iFrame; /* Next frame address */
1404 u8 aFrame[LOG_FRAME_HDRSIZE];
1405 PgHdr *p; /* Iterator to run through pList with. */
1406 u32 aCksum[2];
1407
1408 PgHdr *pLast; /* Last frame in list */
1409 int nLast = 0; /* Number of extra copies of last page */
1410
1411 assert( LOG_FRAME_HDRSIZE==(4 * 3 + LOG_CKSM_BYTES) );
1412 assert( pList );
1413
1414 aCksum[0] = pLog->hdr.iCheck1;
1415 aCksum[1] = pLog->hdr.iCheck2;
1416
1417 /* Write the log file. */
1418 iFrame = pLog->hdr.iLastPg;
1419 for(p=pList; p; p=p->pDirty){
1420 u32 nDbsize; /* Db-size field for frame header */
1421 i64 iOffset; /* Write offset in log file */
1422
1423 iFrame++;
1424 iOffset = (iFrame-1) * (nPgsz+sizeof(aFrame));
1425
1426 /* Populate and write the frame header */
1427 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1428 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1429 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1430 if( rc!=SQLITE_OK ){
1431 return rc;
1432 }
1433
1434 /* Write the page data */
1435 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1436 if( rc!=SQLITE_OK ){
1437 return rc;
1438 }
1439 pLast = p;
1440 }
1441
1442 /* Sync the log file if the 'isSync' flag was specified. */
1443 if( isSync ){
1444#if 0
1445 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
1446 i64 iOffset = iFrame * (nPgsz+sizeof(aFrame));
1447
1448 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1449 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1450 }
1451 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1452 while( iOffset<iSegment ){
1453 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1454 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1455 if( rc!=SQLITE_OK ){
1456 return rc;
1457 }
1458
1459 iOffset += LOG_FRAME_HDRSIZE;
1460 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1461 if( rc!=SQLITE_OK ){
1462 return rc;
1463 }
1464 nLast++;
1465 iOffset += nPgsz;
1466 }
1467#endif
1468
1469 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1470 if( rc!=SQLITE_OK ){
1471 return rc;
1472 }
1473 }
1474
1475 /* Append data to the log summary. It is not necessary to lock the
1476 ** log-summary to do this as the RESERVED lock held on the db file
1477 ** guarantees that there are no other writers, and no data that may
1478 ** be in use by existing readers is being overwritten.
1479 */
1480 iFrame = pLog->hdr.iLastPg;
1481 for(p=pList; p; p=p->pDirty){
1482 iFrame++;
1483 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1484 }
1485 while( nLast>0 ){
1486 iFrame++;
1487 nLast--;
1488 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1489 }
1490
1491 /* Update the private copy of the header. */
1492 pLog->hdr.pgsz = nPgsz;
1493 pLog->hdr.iLastPg = iFrame;
1494 if( isCommit ){
1495 pLog->hdr.iChange++;
1496 pLog->hdr.nPage = nTruncate;
1497 }
1498 pLog->hdr.iCheck1 = aCksum[0];
1499 pLog->hdr.iCheck2 = aCksum[1];
1500
1501 /* If this is a commit, update the log-summary header too. */
1502 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1503 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1504 logLeaveMutex(pLog);
1505 }
1506
1507 return SQLITE_OK;
1508}
1509
1510/*
danb9bf16b2010-04-14 11:23:30 +00001511** Checkpoint the database:
1512**
1513** 1. Wait for an EXCLUSIVE lock on regions B and C.
1514** 2. Wait for an EXCLUSIVE lock on region A.
1515** 3. Copy the contents of the log into the database file.
1516** 4. Zero the log-summary header (so new readers will ignore the log).
1517** 5. Drop the locks obtained in steps 1 and 2.
dan7c246102010-04-12 19:00:29 +00001518*/
1519int sqlite3LogCheckpoint(
1520 Log *pLog, /* Log connection */
1521 sqlite3_file *pFd, /* File descriptor open on db file */
dan64d039e2010-04-13 19:27:31 +00001522 u8 *zBuf, /* Temporary buffer to use */
1523 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1524 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001525){
danb9bf16b2010-04-14 11:23:30 +00001526 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001527
danb9bf16b2010-04-14 11:23:30 +00001528 /* Wait for a write-lock on regions B and C. */
dan64d039e2010-04-13 19:27:31 +00001529 do {
1530 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1531 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1532 if( rc!=SQLITE_OK ) return rc;
1533
danb9bf16b2010-04-14 11:23:30 +00001534 /* Wait for a write-lock on region A. */
dan64d039e2010-04-13 19:27:31 +00001535 do {
1536 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1537 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001538 if( rc!=SQLITE_OK ){
1539 logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1540 return rc;
1541 }
dan64d039e2010-04-13 19:27:31 +00001542
danb9bf16b2010-04-14 11:23:30 +00001543 /* Copy data from the log to the database file. */
1544 rc = logSummaryReadHdr(pLog, 0);
1545 if( rc==SQLITE_OK ){
1546 rc = logCheckpoint(pLog, pFd, zBuf);
1547 }
1548
1549 /* Release the locks. */
dan64d039e2010-04-13 19:27:31 +00001550 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1551 return rc;
dan7c246102010-04-12 19:00:29 +00001552}
1553