blob: 2b2764f361c39931c30758b07b7667e1215c71ba [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001
2/*
3** This file contains the implementation of a log file used in
4** "journal_mode=wal" mode.
5*/
6
7#include "log.h"
8
9#include <unistd.h>
10#include <fcntl.h>
11#include <sys/mman.h>
12
13typedef struct LogSummaryHdr LogSummaryHdr;
14typedef struct LogSummary LogSummary;
15typedef struct LogCheckpoint LogCheckpoint;
dan64d039e2010-04-13 19:27:31 +000016typedef struct LogLock LogLock;
dan7c246102010-04-12 19:00:29 +000017
18
19/*
20** The following structure may be used to store the same data that
21** is stored in the log-summary header.
22**
23** Member variables iCheck1 and iCheck2 contain the checksum for the
24** last frame written to the log, or 2 and 3 respectively if the log
25** is currently empty.
26*/
27struct LogSummaryHdr {
28 u32 iChange; /* Counter incremented each transaction */
29 u32 pgsz; /* Database page size in bytes */
30 u32 iLastPg; /* Address of last valid frame in log */
31 u32 nPage; /* Size of database in pages */
32 u32 iCheck1; /* Checkpoint value 1 */
33 u32 iCheck2; /* Checkpoint value 2 */
34};
35
36/* Size of serialized LogSummaryHdr object. */
37#define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))
38
39#define LOGSUMMARY_FRAME_OFFSET \
40 (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32))
41
42/* Size of frame header */
43#define LOG_FRAME_HDRSIZE 20
44
45/*
46** There is one instance of this structure for each log-summary object
47** that this process has a connection to. They are stored in a linked
48** list starting at pLogSummary (global variable).
49**
50** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
51** directly in this implementation because the VFS does not support
52** the required blocking file-locks.
53*/
54struct LogSummary {
55 sqlite3_mutex *mutex; /* Mutex used to protect this object */
56 int nRef; /* Number of pointers to this structure */
57 int fd; /* File descriptor open on log-summary */
58 char *zPath; /* Path to associated WAL file */
dan64d039e2010-04-13 19:27:31 +000059 LogLock *pLock; /* Linked list of locks on this object */
dan7c246102010-04-12 19:00:29 +000060 LogSummary *pNext; /* Next in global list */
61 int nData; /* Size of aData allocation/mapping */
62 u32 *aData; /* File body */
63};
64
dan64d039e2010-04-13 19:27:31 +000065
dan7c246102010-04-12 19:00:29 +000066/*
dan64d039e2010-04-13 19:27:31 +000067** The four lockable regions associated with each log-summary. A connection
68** may take either a SHARED or EXCLUSIVE lock on each.
dan7c246102010-04-12 19:00:29 +000069*/
dan64d039e2010-04-13 19:27:31 +000070#define LOG_REGION_A 0x01
71#define LOG_REGION_B 0x02
72#define LOG_REGION_C 0x04
73#define LOG_REGION_D 0x08
74
75/*
76** A single instance of this structure is allocated as part of each
77** connection to a database log. All structures associated with the
78** same log file are linked together into a list using LogLock.pNext
79** starting at LogSummary.pLock.
80**
81** The mLock field of the structure describes the locks (if any)
82** currently held by the connection. If a SHARED lock is held on
83** any of the four locking regions, then the associated LOG_REGION_X
84** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
85** then the (LOG_REGION_X << 8) bit is set.
86*/
87struct LogLock {
88 LogLock *pNext; /* Next lock on the same log */
89 u32 mLock; /* Mask of locks */
90};
dan7c246102010-04-12 19:00:29 +000091
92struct Log {
93 LogSummary *pSummary; /* Log file summary data */
94 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
95 sqlite3_file *pFd; /* File handle for log file */
96 int sync_flags; /* Flags to use with OsSync() */
dan64d039e2010-04-13 19:27:31 +000097 int isLocked; /* Non-zero if a snapshot is held open */
dan7c246102010-04-12 19:00:29 +000098 int isWriteLocked; /* True if this is the writer connection */
99 LogSummaryHdr hdr; /* Log summary header for current snapshot */
dan64d039e2010-04-13 19:27:31 +0000100 LogLock lock; /* Lock held by this connection (if any) */
dan7c246102010-04-12 19:00:29 +0000101};
102
dan64d039e2010-04-13 19:27:31 +0000103
dan7c246102010-04-12 19:00:29 +0000104/*
105** This structure is used to implement an iterator that iterates through
106** all frames in the log in database page order. Where two or more frames
107** correspond to the same database page, the iterator visits only the
108** frame most recently written to the log.
109**
110** The internals of this structure are only accessed by:
111**
112** logCheckpointInit() - Create a new iterator,
113** logCheckpointNext() - Step an iterator,
114** logCheckpointFree() - Free an iterator.
115**
116** This functionality is used by the checkpoint code (see logCheckpoint()).
117*/
118struct LogCheckpoint {
119 int nSegment; /* Size of LogCheckpoint.aSummary[] array */
120 int nFinal; /* Elements in segment nSegment-1 */
121 struct LogSegment {
122 int iNext; /* Next aIndex index */
123 u8 *aIndex; /* Pointer to index array */
124 u32 *aDbPage; /* Pointer to db page array */
125 } aSegment[1];
126};
127
dan64d039e2010-04-13 19:27:31 +0000128
129/*
130** List of all LogSummary objects created by this process. Protected by
131** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
132** here instead of borrowing the LRU mutex.
133*/
134#define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
135static LogSummary *pLogSummary = 0;
136
dan7c246102010-04-12 19:00:29 +0000137/*
138** Generate an 8 byte checksum based on the data in array aByte[] and the
139** initial values of aCksum[0] and aCksum[1]. The checksum is written into
140** aCksum[] before returning.
141*/
142#define LOG_CKSM_BYTES 8
143static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
144 u32 *z32 = (u32 *)aByte;
145 int n32 = nByte / sizeof(u32);
146 int i;
147
148 assert( LOG_CKSM_BYTES==2*sizeof(u32) );
149 assert( (nByte&0x00000003)==0 );
150
151 u32 cksum0 = aCksum[0];
152 u32 cksum1 = aCksum[1];
153
154 for(i=0; i<n32; i++){
155 cksum0 = (cksum0 >> 8) + (cksum0 ^ z32[i]);
156 cksum1 = (cksum1 >> 8) + (cksum1 ^ z32[i]);
157 }
158
159 aCksum[0] = cksum0;
160 aCksum[1] = cksum1;
161}
162
/*
** Argument zPath must be a nul-terminated string containing a path-name.
** This function modifies the string in-place by removing any "./" or "../"
** elements in the path, collapsing runs of '/' and dropping trailing '/'
** characters. For example, the following input:
**
**   "/home/user/plans/good/../evil/./world_domination.txt"
**
** is overwritten with the 'normalized' version:
**
**   "/home/user/plans/evil/world_domination.txt"
**
** Only the first n bytes of the string (its length after trailing '/'
** characters have been trimmed) are examined. In particular a path made
** up entirely of '/' characters normalizes to "/", not to "".
*/
static void logNormalizePath(char *zPath){
  int i, j;
  char *z = zPath;
  int n = (int)strlen(z);

  /* Trim trailing slashes, but never reduce "/" to the empty string. */
  while( n>1 && z[n-1]=='/' ){ n--; }

  for(i=j=0; i<n; i++){
    /* The look-ahead tests below only apply when there is at least one
    ** more in-range character. Without the i+1<n guard a trimmed trailing
    ** '/' would be mistaken for the second half of a "//" pair. */
    if( z[i]=='/' && i+1<n ){
      if( z[i+1]=='/' ) continue;
      if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
        i += 1;
        continue;
      }
      if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
        /* Discard the previous path element already copied to the output */
        while( j>0 && z[j-1]!='/' ){ j--; }
        if( j>0 ){ j--; }
        i += 2;
        continue;
      }
    }
    z[j++] = z[i];
  }
  z[j] = 0;
}
198
199/*
200** Lock the summary file pSummary->fd.
201*/
202static int logSummaryLock(LogSummary *pSummary){
203 int rc;
204 struct flock f;
205 memset(&f, 0, sizeof(f));
206 f.l_type = F_WRLCK;
207 f.l_whence = SEEK_SET;
208 f.l_start = 0;
209 f.l_len = 1;
210 rc = fcntl(pSummary->fd, F_SETLKW, &f);
211 if( rc!=0 ){
212 return SQLITE_IOERR;
213 }
214 return SQLITE_OK;
215}
216
217/*
218** Unlock the summary file pSummary->fd.
219*/
220static int logSummaryUnlock(LogSummary *pSummary){
221 int rc;
222 struct flock f;
223 memset(&f, 0, sizeof(f));
224 f.l_type = F_UNLCK;
225 f.l_whence = SEEK_SET;
226 f.l_start = 0;
227 f.l_len = 1;
228 rc = fcntl(pSummary->fd, F_SETLK, &f);
229 if( rc!=0 ){
230 return SQLITE_IOERR;
231 }
232 return SQLITE_OK;
233}
234
235/*
236** Memory map the first nByte bytes of the summary file opened with
237** pSummary->fd at pSummary->aData. If the summary file is smaller than
238** nByte bytes in size when this function is called, ftruncate() is
239** used to expand it before it is mapped.
240**
241** It is assumed that an exclusive lock is held on the summary file
242** by the caller (to protect the ftruncate()).
243*/
244static int logSummaryMap(LogSummary *pSummary, int nByte){
245 struct stat sStat;
246 int rc;
247 int fd = pSummary->fd;
248 void *pMap;
249
250 assert( pSummary->aData==0 );
251
252 /* If the file is less than nByte bytes in size, cause it to grow. */
253 rc = fstat(fd, &sStat);
254 if( rc!=0 ) return SQLITE_IOERR;
255 if( sStat.st_size<nByte ){
256 rc = ftruncate(fd, nByte);
257 if( rc!=0 ) return SQLITE_IOERR;
258 }
259
260 /* Map the file. */
261 pMap = mmap(0, nByte, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
262 if( pMap==MAP_FAILED ){
263 return SQLITE_IOERR;
264 }
265 pSummary->aData = (u32 *)pMap;
266 pSummary->nData = nByte;
267
268 return SQLITE_OK;
269}
270
271/*
272** Unmap the log-summary mapping and close the file-descriptor. If
273** the isTruncate argument is non-zero, truncate the log-summary file
274** region to zero bytes.
275**
276** Regardless of the value of isTruncate, close the file-descriptor
277** opened on the log-summary file.
278*/
279static int logSummaryUnmap(LogSummary *pSummary, int isTruncate){
280 int rc = SQLITE_OK;
281 if( pSummary->aData ){
282 assert( pSummary->fd>0 );
283 munmap(pSummary->aData, pSummary->nData);
284 pSummary->aData = 0;
285 if( isTruncate ){
286 rc = (ftruncate(pSummary->fd, 0) ? SQLITE_IOERR : SQLITE_OK);
287 }
288 }
289 if( pSummary->fd>0 ){
290 close(pSummary->fd);
291 pSummary->fd = -1;
292 }
293 return rc;
294}
295
296
297static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
298 u32 *aData = pSummary->aData;
299 memcpy(aData, pHdr, sizeof(LogSummaryHdr));
300 aData[LOGSUMMARY_HDR_NFIELD] = 1;
301 aData[LOGSUMMARY_HDR_NFIELD+1] = 1;
302 logChecksumBytes(
303 (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD]
304 );
305}
306
307/*
308** This function encodes a single frame header and writes it to a buffer
309** supplied by the caller. A log frame-header is made up of a series of
310** 4-byte big-endian integers, as follows:
311**
312** 0: Database page size in bytes.
313** 4: Page number.
314** 8: New database size (for commit frames, otherwise zero).
315** 12: Frame checksum 1.
316** 16: Frame checksum 2.
317*/
318static void logEncodeFrame(
319 u32 *aCksum, /* IN/OUT: Checksum values */
320 u32 iPage, /* Database page number for frame */
321 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
322 int nData, /* Database page size (size of aData[]) */
323 u8 *aData, /* Pointer to page data (for checksum) */
324 u8 *aFrame /* OUT: Write encoded frame here */
325){
326 assert( LOG_FRAME_HDRSIZE==20 );
327
328 sqlite3Put4byte(&aFrame[0], nData);
329 sqlite3Put4byte(&aFrame[4], iPage);
330 sqlite3Put4byte(&aFrame[8], nTruncate);
331
332 logChecksumBytes(aFrame, 12, aCksum);
333 logChecksumBytes(aData, nData, aCksum);
334
335 sqlite3Put4byte(&aFrame[12], aCksum[0]);
336 sqlite3Put4byte(&aFrame[16], aCksum[1]);
337}
338
339/*
340** Return 1 and populate *piPage, *pnTruncate and aCksum if the
341** frame checksum looks Ok. Otherwise return 0.
342*/
343static int logDecodeFrame(
344 u32 *aCksum, /* IN/OUT: Checksum values */
345 u32 *piPage, /* OUT: Database page number for frame */
346 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
347 int nData, /* Database page size (size of aData[]) */
348 u8 *aData, /* Pointer to page data (for checksum) */
349 u8 *aFrame /* Frame data */
350){
351 logChecksumBytes(aFrame, 12, aCksum);
352 logChecksumBytes(aData, nData, aCksum);
353
354 if( aCksum[0]!=sqlite3Get4byte(&aFrame[12])
355 || aCksum[1]!=sqlite3Get4byte(&aFrame[16])
356 ){
357 /* Checksum failed. */
358 return 0;
359 }
360
361 *piPage = sqlite3Get4byte(&aFrame[4]);
362 *pnTruncate = sqlite3Get4byte(&aFrame[8]);
363 return 1;
364}
365
366static void logMergesort8(
367 Pgno *aContent, /* Pages in log */
368 u8 *aBuffer, /* Buffer of at least *pnList items to use */
369 u8 *aList, /* IN/OUT: List to sort */
370 int *pnList /* IN/OUT: Number of elements in aList[] */
371){
372 int nList = *pnList;
373 if( nList>1 ){
374 int nLeft = nList / 2; /* Elements in left list */
375 int nRight = nList - nLeft; /* Elements in right list */
376 u8 *aLeft = aList; /* Left list */
377 u8 *aRight = &aList[nLeft]; /* Right list */
378 int iLeft = 0; /* Current index in aLeft */
379 int iRight = 0; /* Current index in aright */
380 int iOut = 0; /* Current index in output buffer */
381
382 /* TODO: Change to non-recursive version. */
383 logMergesort8(aContent, aBuffer, aLeft, &nLeft);
384 logMergesort8(aContent, aBuffer, aRight, &nRight);
385
386 while( iRight<nRight || iLeft<nLeft ){
387 u8 logpage;
388 Pgno dbpage;
389
390 if( (iLeft<nLeft)
391 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
392 ){
393 logpage = aLeft[iLeft++];
394 }else{
395 logpage = aRight[iRight++];
396 }
397 dbpage = aContent[logpage];
398
399 aBuffer[iOut++] = logpage;
400 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
401
402 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
403 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
404 }
405 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
406 *pnList = iOut;
407 }
408
409#ifdef SQLITE_DEBUG
410 {
411 int i;
412 for(i=1; i<*pnList; i++){
413 assert( aContent[aList[i]] > aContent[aList[i-1]] );
414 }
415 }
416#endif
417}
418
419
420/*
421** Return the index in the LogSummary.aData array that corresponds to
422** frame iFrame. The log-summary file consists of a header, followed by
423** alternating "map" and "index" blocks.
424*/
425static int logSummaryEntry(u32 iFrame){
426 return ((((iFrame-1)>>8)<<6) + iFrame-1 + 2 + LOGSUMMARY_HDR_NFIELD);
427}
428
429
430/*
431** Set an entry in the log-summary map to map log frame iFrame to db
432** page iPage. Values are always appended to the log-summary (i.e. the
433** value of iFrame is always exactly one more than the value passed to
434** the previous call), but that restriction is not enforced or asserted
435** here.
436*/
437static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
438 u32 iSlot = logSummaryEntry(iFrame);
439
440 /* Set the log-summary entry itself */
441 pSummary->aData[iSlot] = iPage;
442
443 /* If the frame number is a multiple of 256 (frames are numbered starting
444 ** at 1), build an index of the most recently added 256 frames.
445 */
446 if( (iFrame&0x000000FF)==0 ){
447 int i; /* Iterator used while initializing aIndex */
448 u32 *aFrame; /* Pointer to array of 256 frames */
449 int nIndex; /* Number of entries in index */
450 u8 *aIndex; /* 256 bytes to build index in */
451 u8 *aTmp; /* Scratch space to use while sorting */
452
453 aFrame = &pSummary->aData[iSlot-255];
454 aIndex = (u8 *)&pSummary->aData[iSlot+1];
455 aTmp = &aIndex[256];
456
457 nIndex = 256;
458 for(i=0; i<256; i++) aIndex[i] = (u8)i;
459 logMergesort8(aFrame, aTmp, aIndex, &nIndex);
460 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
461 }
462}
463
464
465/*
466** Recover the log-summary by reading the log file. The caller must hold
467** an exclusive lock on the log-summary file.
468*/
469static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
470 int rc; /* Return Code */
471 i64 nSize; /* Size of log file */
472 LogSummaryHdr hdr; /* Recovered log-summary header */
473
474 memset(&hdr, 0, sizeof(hdr));
475
476 rc = sqlite3OsFileSize(pFd, &nSize);
477 if( rc!=SQLITE_OK ){
478 return rc;
479 }
480
481 if( nSize>LOG_FRAME_HDRSIZE ){
482 u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
483 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
484 int nFrame; /* Number of bytes at aFrame */
485 u8 *aData; /* Pointer to data part of aFrame buffer */
486 int iFrame; /* Index of last frame read */
487 i64 iOffset; /* Next offset to read from log file */
488 int nPgsz; /* Page size according to the log */
489 u32 aCksum[2] = {2, 3}; /* Running checksum */
490
491 /* Read in the first frame header in the file (to determine the
492 ** database page size).
493 */
494 rc = sqlite3OsRead(pFd, aBuf, LOG_FRAME_HDRSIZE, 0);
495 if( rc!=SQLITE_OK ){
496 return rc;
497 }
498
499 /* If the database page size is not a power of two, or is greater than
500 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
501 */
502 nPgsz = sqlite3Get4byte(&aBuf[0]);
503 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){
504 goto finished;
505 }
506
507 /* Malloc a buffer to read frames into. */
508 nFrame = nPgsz + LOG_FRAME_HDRSIZE;
509 aFrame = (u8 *)sqlite3_malloc(nFrame);
510 if( !aFrame ){
511 return SQLITE_NOMEM;
512 }
513 aData = &aFrame[LOG_FRAME_HDRSIZE];
514
515 /* Read all frames from the log file. */
516 iFrame = 0;
517 iOffset = 0;
518 for(iOffset=0; (iOffset+nFrame)<nSize; iOffset+=nFrame){
519 u32 pgno; /* Database page number for frame */
520 u32 nTruncate; /* dbsize field from frame header */
521 int isValid; /* True if this frame is valid */
522
523 /* Read and decode the next log frame. */
524 rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
525 if( rc!=SQLITE_OK ) break;
526 isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
527 if( !isValid ) break;
528 logSummaryAppend(pSummary, ++iFrame, pgno);
529
530 /* If nTruncate is non-zero, this is a commit record. */
531 if( nTruncate ){
532 hdr.iCheck1 = aCksum[0];
533 hdr.iCheck2 = aCksum[1];
534 hdr.iLastPg = iFrame;
535 hdr.nPage = nTruncate;
536 hdr.pgsz = nPgsz;
537 }
538 }
539
540 sqlite3_free(aFrame);
541 }else{
542 hdr.iCheck1 = 2;
543 hdr.iCheck2 = 3;
544 }
545
546finished:
547 logSummaryWriteHdr(pSummary, &hdr);
548 return rc;
549}
550
551
552/*
553** This function intializes the connection to the log-summary identified
554** by struct pSummary.
555*/
556static int logSummaryInit(LogSummary *pSummary, sqlite3_file *pFd){
557 int rc; /* Return Code */
558 char *zFile; /* File name for summary file */
559
560 assert( pSummary->fd<0 );
561 assert( pSummary->aData==0 );
562 assert( pSummary->nRef>0 );
563 assert( pSummary->zPath );
564
565 /* Open a file descriptor on the summary file. */
566 zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
567 if( !zFile ){
568 return SQLITE_NOMEM;
569 }
570 pSummary->fd = open(zFile, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
571 sqlite3_free(zFile);
572 if( pSummary->fd<0 ){
573 return SQLITE_IOERR;
574 }
575
576 /* Grab an exclusive lock the summary file. Then mmap() it. TODO: This
577 ** code needs to be enhanced to support a growable mapping. For now, just
578 ** make the mapping very large to start with.
579 */
580 rc = logSummaryLock(pSummary);
581 if( rc!=SQLITE_OK ) return rc;
582 rc = logSummaryMap(pSummary, 512*1024);
583 if( rc!=SQLITE_OK ) goto out;
584
585 /* Grab a SHARED lock on the log file. Then try to upgrade to an EXCLUSIVE
586 ** lock. If successful, then this is the first (and only) connection to
587 ** the database. In this case assume the contents of the log-summary
588 ** cannot be trusted. Zero the log-summary header to make sure.
589 **
590 ** The SHARED lock on the log file is not released until the connection
591 ** to the database is closed.
592 */
593 rc = sqlite3OsLock(pFd, SQLITE_LOCK_SHARED);
594 if( rc!=SQLITE_OK ) goto out;
595 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
596 if( rc==SQLITE_OK ){
597 /* This is the first and only connection. */
598 memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
599 rc = sqlite3OsUnlock(pFd, SQLITE_LOCK_SHARED);
600 }else if( rc==SQLITE_BUSY ){
601 rc = SQLITE_OK;
602 }
603
604 out:
605 logSummaryUnlock(pSummary);
606 return rc;
607}
608
609/*
610** Open a connection to the log file associated with database zDb. The
611** database file does not actually have to exist. zDb is used only to
612** figure out the name of the log file to open. If the log file does not
613** exist it is created by this call.
614*/
615int sqlite3LogOpen(
616 sqlite3_vfs *pVfs, /* vfs module to open log file with */
617 const char *zDb, /* Name of database file */
618 Log **ppLog /* OUT: Allocated Log handle */
619){
620 int rc; /* Return Code */
621 Log *pRet; /* Object to allocate and return */
622 LogSummary *pSummary = 0; /* Summary object */
623 sqlite3_mutex *mutex = 0; /* LOG_SUMMARY_MUTEX mutex */
624 int flags; /* Flags passed to OsOpen() */
625 char *zWal = 0; /* Path to WAL file */
626 int nWal; /* Length of zWal in bytes */
627
628 /* Zero output variables */
629 assert( zDb );
630 *ppLog = 0;
631
632 /* Allocate an instance of struct Log to return. */
633 pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
634 if( !pRet ) goto out;
635 pRet->pVfs = pVfs;
636 pRet->pFd = (sqlite3_file *)&pRet[1];
637 pRet->sync_flags = SQLITE_SYNC_NORMAL;
638
639 /* Normalize the path name. */
640 zWal = sqlite3_mprintf("%s-wal", zDb);
641 if( !zWal ) goto out;
642 logNormalizePath(zWal);
643 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_DB);
644 nWal = sqlite3Strlen30(zWal);
645
646 /* Enter the mutex that protects the linked-list of LogSummary structures */
647 if( sqlite3GlobalConfig.bCoreMutex ){
648 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
649 }
650 sqlite3_mutex_enter(mutex);
651
652 /* Search for an existing log summary object in the linked list. If one
653 ** cannot be found, allocate and initialize a new object.
654 */
655 for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
656 int nPath = sqlite3Strlen30(pSummary->zPath);
657 if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
658 }
659 if( !pSummary ){
660 int nByte = sizeof(LogSummary) + nWal + 1;
661 pSummary = (LogSummary *)sqlite3MallocZero(nByte);
662 if( !pSummary ){
663 rc = SQLITE_NOMEM;
664 goto out;
665 }
666 if( sqlite3GlobalConfig.bCoreMutex ){
667 pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
668 }
669 pSummary->zPath = (char *)&pSummary[1];
670 pSummary->fd = -1;
671 memcpy(pSummary->zPath, zWal, nWal);
672 pSummary->pNext = pLogSummary;
673 pLogSummary = pSummary;
674 }
675 pSummary->nRef++;
676 pRet->pSummary = pSummary;
677
678 /* Exit the mutex protecting the linked-list of LogSummary objects. */
679 sqlite3_mutex_leave(mutex);
680 mutex = 0;
681
682 /* Open file handle on the log file. */
683 rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
684 if( rc!=SQLITE_OK ) goto out;
685
686 /* Object pSummary is shared between all connections to the database made
687 ** by this process. So at this point it may or may not be connected to
688 ** the log-summary. If it is not, connect it. Otherwise, just take the
689 ** SHARED lock on the log file.
690 */
691 sqlite3_mutex_enter(pSummary->mutex);
692 mutex = pSummary->mutex;
693 if( pSummary->fd<0 ){
694 rc = logSummaryInit(pSummary, pRet->pFd);
695 }else{
696 rc = sqlite3OsLock(pRet->pFd, SQLITE_LOCK_SHARED);
697 }
698
dan64d039e2010-04-13 19:27:31 +0000699 pRet->lock.pNext = pSummary->pLock;
700 pSummary->pLock = &pRet->lock;
701
dan7c246102010-04-12 19:00:29 +0000702 out:
703 sqlite3_mutex_leave(mutex);
704 sqlite3_free(zWal);
705 if( rc!=SQLITE_OK ){
706 assert(0);
707 if( pRet ){
708 sqlite3OsClose(pRet->pFd);
709 sqlite3_free(pRet);
710 }
711 assert( !pSummary || pSummary->nRef==0 );
712 sqlite3_free(pSummary);
713 }
714 *ppLog = pRet;
715 return rc;
716}
717
718static int logCheckpointNext(
719 LogCheckpoint *p, /* Iterator */
720 u32 *piPage, /* OUT: Next db page to write */
721 u32 *piFrame /* OUT: Log frame to read from */
722){
723 u32 iMin = *piPage;
724 u32 iRet = 0xFFFFFFFF;
725 int i;
726 int nBlock = p->nFinal;
727
728 for(i=p->nSegment-1; i>=0; i--){
729 struct LogSegment *pSegment = &p->aSegment[i];
730 while( pSegment->iNext<nBlock ){
731 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
732 if( iPg>iMin ){
733 if( iPg<iRet ){
734 iRet = iPg;
735 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
736 }
737 break;
738 }
739 pSegment->iNext++;
740 }
741
742 nBlock = 256;
743 }
744
745 *piPage = iRet;
746 return (iRet==0xFFFFFFFF);
747}
748
749static LogCheckpoint *logCheckpointInit(Log *pLog){
750 u32 *aData = pLog->pSummary->aData;
751 LogCheckpoint *p; /* Return value */
752 int nSegment; /* Number of segments to merge */
753 u32 iLast; /* Last frame in log */
754 int nByte; /* Number of bytes to allocate */
755 int i; /* Iterator variable */
756 int nFinal; /* Number of unindexed entries */
757 struct LogSegment *pFinal; /* Final (unindexed) segment */
758 u8 *aTmp; /* Temp space used by merge-sort */
759
760 iLast = pLog->hdr.iLastPg;
761 nSegment = (iLast >> 8) + 1;
762 nFinal = (iLast & 0x000000FF);
763
764 nByte = sizeof(LogCheckpoint) + (nSegment-1)*sizeof(struct LogSegment) + 512;
765 p = (LogCheckpoint *)sqlite3_malloc(nByte);
766 if( p ){
767 memset(p, 0, nByte);
768 p->nSegment = nSegment;
769 p->nFinal = nFinal;
770 }
771
772 for(i=0; i<nSegment-1; i++){
773 p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
774 p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
775 }
776 pFinal = &p->aSegment[nSegment-1];
777
778 pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
779 pFinal->aIndex = (u8 *)&pFinal[1];
780 aTmp = &pFinal->aIndex[256];
781 for(i=0; i<nFinal; i++){
782 pFinal->aIndex[i] = i;
783 }
784 logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
785 p->nFinal = nFinal;
786
787 return p;
788}
789
790/*
791** Free a log iterator allocated by logCheckpointInit().
792*/
793static void logCheckpointFree(LogCheckpoint *p){
794 sqlite3_free(p);
795}
796
797/*
798** Checkpoint the contents of the log file.
799*/
800static int logCheckpoint(
801 Log *pLog, /* Log connection */
802 sqlite3_file *pFd, /* File descriptor open on db file */
803 u8 *zBuf /* Temporary buffer to use */
804){
805 int rc; /* Return code */
806 int pgsz = pLog->hdr.pgsz; /* Database page-size */
807 LogCheckpoint *pIter = 0; /* Log iterator context */
808 u32 iDbpage = 0; /* Next database page to write */
809 u32 iFrame; /* Log frame containing data for iDbpage */
810
811 /* Allocate the iterator */
812 pIter = logCheckpointInit(pLog);
813 if( !pIter ) return SQLITE_NOMEM;
814
815 /* Sync the log file to disk */
816 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
817 if( rc!=SQLITE_OK ) goto out;
818
819 /* Iterate through the contents of the log, copying data to the db file. */
820 while( 0==logCheckpointNext(pIter, &iDbpage, &iFrame) ){
821 rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
822 (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
823 );
824 if( rc!=SQLITE_OK ) goto out;
825 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
826 if( rc!=SQLITE_OK ) goto out;
827 }
828
829 /* Truncate the database file */
830 rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
831 if( rc!=SQLITE_OK ) goto out;
832
833 /* Sync the database file. If successful, update the log-summary. */
834 rc = sqlite3OsSync(pFd, pLog->sync_flags);
835 if( rc!=SQLITE_OK ) goto out;
836 pLog->hdr.iLastPg = 0;
837 pLog->hdr.iCheck1 = 2;
838 pLog->hdr.iCheck2 = 3;
839 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
840
841 /* TODO: If a crash occurs and the current log is copied into the
842 ** database there is no problem. However, if a crash occurs while
843 ** writing the next transaction into the start of the log, such that:
844 **
845 ** * The first transaction currently in the log is left intact, but
846 ** * The second (or subsequent) transaction is damaged,
847 **
848 ** then the database could become corrupt.
849 **
850 ** The easiest thing to do would be to write and sync a dummy header
851 ** into the log at this point. Unfortunately, that turns out to be
852 ** an unwelcome performance hit. Alternatives are...
853 */
854#if 0
855 memset(zBuf, 0, LOG_FRAME_HDRSIZE);
856 rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
857 if( rc!=SQLITE_OK ) goto out;
858 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
859#endif
860
861 out:
862 logCheckpointFree(pIter);
863 return rc;
864}
865
866/*
867** Close a connection to a log file.
868*/
869int sqlite3LogClose(
870 Log *pLog, /* Log to close */
871 sqlite3_file *pFd, /* Database file */
872 u8 *zBuf /* Buffer of at least page-size bytes */
873){
874 int rc = SQLITE_OK;
875 if( pLog ){
dan64d039e2010-04-13 19:27:31 +0000876 LogLock **ppL;
dan7c246102010-04-12 19:00:29 +0000877 LogSummary *pSummary = pLog->pSummary;
878 sqlite3_mutex *mutex = 0;
879
dan64d039e2010-04-13 19:27:31 +0000880 sqlite3_mutex_enter(pSummary->mutex);
881 for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
882 *ppL = pLog->lock.pNext;
883 sqlite3_mutex_leave(pSummary->mutex);
884
dan7c246102010-04-12 19:00:29 +0000885 if( sqlite3GlobalConfig.bCoreMutex ){
886 mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
887 }
888 sqlite3_mutex_enter(mutex);
889
890 /* Decrement the reference count on the log summary. If this is the last
891 ** reference to the log summary object in this process, the object will
892 ** be freed. If this is also the last connection to the database, then
893 ** checkpoint the database and truncate the log and log-summary files
894 ** to zero bytes in size.
895 **/
896 pSummary->nRef--;
897 if( pSummary->nRef==0 ){
898 LogSummary **pp;
899
900 rc = logSummaryLock(pSummary);
901 if( rc==SQLITE_OK ){
902 int isTruncate = 0;
903 int rc2 = sqlite3OsLock(pLog->pFd, SQLITE_LOCK_EXCLUSIVE);
904 if( rc2==SQLITE_OK ){
905 /* This is the last connection to the database (including other
906 ** processes). Do three things:
907 **
908 ** 1. Checkpoint the db.
909 ** 2. Truncate the log file to zero bytes.
910 ** 3. Truncate the log-summary file to zero bytes.
911 */
912 rc2 = logCheckpoint(pLog, pFd, zBuf);
913 if( rc2==SQLITE_OK ){
914 rc2 = sqlite3OsTruncate(pLog->pFd, 0);
915 }
916 isTruncate = 1;
917 }else if( rc2==SQLITE_BUSY ){
918 rc2 = SQLITE_OK;
919 }
920 logSummaryUnmap(pSummary, isTruncate);
921 sqlite3OsUnlock(pLog->pFd, SQLITE_LOCK_NONE);
922 rc = logSummaryUnlock(pSummary);
923 if( rc2!=SQLITE_OK ) rc = rc2;
924 }
925
926 /* Remove the LogSummary object from the global list. Then free the
927 ** mutex and the object itself.
928 */
929 for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
930 *pp = (*pp)->pNext;
931 sqlite3_mutex_free(pSummary->mutex);
932 sqlite3_free(pSummary);
933 }
934
935 sqlite3_mutex_leave(mutex);
936
937 /* Close the connection to the log file and free the Log handle. */
938 sqlite3OsClose(pLog->pFd);
939 sqlite3_free(pLog);
940 }
941 return rc;
942}
943
/*
** Set the flags passed to sqlite3OsSync() when the log file is synced.
** The value must be either SQLITE_SYNC_NORMAL or SQLITE_SYNC_FULL.
**
** This function is currently compiled out.
*/
#if 0
void sqlite3LogSetSyncflags(Log *pLog, int sync_flags){
  assert( sync_flags==SQLITE_SYNC_NORMAL || sync_flags==SQLITE_SYNC_FULL );
  pLog->sync_flags = sync_flags;
}
#endif
954
955/*
956** Enter and leave the log-summary mutex. In this context, entering the
957** log-summary mutex means:
958**
959** 1. Obtaining mutex pLog->pSummary->mutex, and
960** 2. Taking an exclusive lock on the log-summary file.
961**
962** i.e. this mutex locks out other processes as well as other threads
963** hosted in this address space.
964*/
965static int logEnterMutex(Log *pLog){
966 LogSummary *pSummary = pLog->pSummary;
967 int rc;
968
969 sqlite3_mutex_enter(pSummary->mutex);
970 rc = logSummaryLock(pSummary);
971 if( rc!=SQLITE_OK ){
972 sqlite3_mutex_leave(pSummary->mutex);
973 }
974 return rc;
975}
976static void logLeaveMutex(Log *pLog){
977 LogSummary *pSummary = pLog->pSummary;
978 logSummaryUnlock(pSummary);
979 sqlite3_mutex_leave(pSummary->mutex);
980}
981
982/*
dan64d039e2010-04-13 19:27:31 +0000983** Values for the second parameter to logLockRegion().
984*/
985#define LOG_UNLOCK 0
986#define LOG_RDLOCK 1
987#define LOG_WRLOCK 2
988
989static int logLockRegion(Log *pLog, u32 mRegion, int op){
990 LogSummary *pSummary = pLog->pSummary;
991 LogLock *p; /* Used to iterate through in-process locks */
992 u32 mNew; /* New locks on file */
993 u32 mOld; /* Old locks on file */
994 u32 mNewLock; /* New locks held by pLog */
995
996 assert(
997 /* Writer lock operations */
998 (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
999 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
1000
1001 /* Reader lock operations */
1002 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
1003 || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
1004 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
1005 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))
1006 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))
1007
1008 /* Checkpointer lock operations */
1009 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
1010 || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
1011 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
1012 || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
1013 );
1014
1015 sqlite3_mutex_enter(pSummary->mutex);
1016
1017 /* If obtaining (not releasing) a lock, check if there exist any
1018 ** conflicting locks in process. Return SQLITE_BUSY in this case.
1019 */
1020 if( op ){
1021 u32 mConflict = (mRegion<<8) | ((op==LOG_WRLOCK) ? mRegion : 0);
1022 for(p=pSummary->pLock; p; p=p->pNext){
1023 if( p!=&pLog->lock && (p->mLock & mConflict) ){
1024 sqlite3_mutex_leave(pSummary->mutex);
1025 return SQLITE_BUSY;
1026 }
1027 }
1028 }
1029
1030 /* Determine the new lock mask for this log connection */
1031 switch( op ){
1032 case LOG_UNLOCK:
1033 mNewLock = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
1034 break;
1035 case LOG_RDLOCK:
1036 mNewLock = ((pLog->lock.mLock & ~(mRegion<<8)) | mRegion);
1037 break;
1038 default:
1039 assert( op==LOG_WRLOCK );
1040 mNewLock = (pLog->lock.mLock | (mRegion<<8) | mRegion);
1041 break;
1042 }
1043
1044 /* Determine the current and desired sets of locks at the file level. */
1045 mNew = 0;
1046 for(p=pSummary->pLock; p; p=p->pNext){
1047 assert( (p->mLock & (p->mLock<<8))==(p->mLock & 0x00000F00) );
1048 if( p!=&pLog->lock ) mNew |= p->mLock;
1049 }
1050 mOld = mNew | pLog->lock.mLock;
1051 mNew = mNew | mNewLock;
1052
1053 if( mNew!=mOld ){
1054 int rc;
1055 u32 mChange = (mNew^mOld) | ((mNew^mOld)>>8);
1056 struct flock f;
1057 memset(&f, 0, sizeof(f));
1058 f.l_type = (op==LOG_WRLOCK?F_WRLCK:(op==LOG_RDLOCK?F_RDLCK:F_UNLCK));
1059 f.l_whence = SEEK_SET;
1060
1061 if( mChange & LOG_REGION_A ) f.l_start = 12;
1062 else if( mChange & LOG_REGION_B ) f.l_start = 13;
1063 else if( mChange & LOG_REGION_C ) f.l_start = 14;
1064 else if( mChange & LOG_REGION_D ) f.l_start = 15;
1065
1066 if( mChange & LOG_REGION_D ) f.l_len = 16 - f.l_start;
1067 else if( mChange & LOG_REGION_C ) f.l_len = 15 - f.l_start;
1068 else if( mChange & LOG_REGION_B ) f.l_len = 14 - f.l_start;
1069 else if( mChange & LOG_REGION_A ) f.l_len = 13 - f.l_start;
1070
1071 rc = fcntl(pSummary->fd, F_SETLK, &f);
1072 if( rc!=0 ){
1073 sqlite3_mutex_leave(pSummary->mutex);
1074 return SQLITE_BUSY;
1075 }
1076 }
1077
1078 pLog->lock.mLock = mNewLock;
1079 sqlite3_mutex_leave(pSummary->mutex);
1080 return SQLITE_OK;
1081}
1082
1083/*
1084** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +00001085**
1086** If this call obtains a new read-lock and the database contents have been
1087** modified since the most recent call to LogCloseSnapshot() on this Log
1088** connection, then *pChanged is set to 1 before returning. Otherwise, it
1089** is left unmodified. This is used by the pager layer to determine whether
1090** or not any cached pages may be safely reused.
1091*/
1092int sqlite3LogOpenSnapshot(Log *pLog, int *pChanged){
1093 int rc = SQLITE_OK;
1094 if( pLog->isLocked==0 ){
dan64d039e2010-04-13 19:27:31 +00001095 int nAttempt;
1096
1097 /* Obtain a snapshot-lock on the log-summary file. The procedure
1098 ** for obtaining the snapshot log is:
1099 **
1100 ** 1. Attempt a SHARED lock on regions A and B.
1101 ** 2a. If step 1 is successful, drop the lock on region B.
1102 ** 2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
1103 ** 3. Repeat the above until the lock attempt in step 1 or 2b is
1104 ** successful.
1105 **
1106 ** If neither of the locks can be obtained after 5 tries, presumably
1107 ** something is wrong (i.e. a process not following the locking protocol).
1108 ** Return an error code in this case.
1109 */
1110 rc = SQLITE_BUSY;
1111 for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
1112 rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
1113 if( rc==SQLITE_BUSY ){
1114 rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
1115 if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
1116 }else{
1117 logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
1118 pLog->isLocked = LOG_REGION_A;
1119 }
1120 }
1121 if( rc!=SQLITE_OK ){
1122 return rc;
1123 }
1124
dan7c246102010-04-12 19:00:29 +00001125 if( SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1126 u32 aCksum[2] = {1, 1};
1127 u32 aHdr[LOGSUMMARY_HDR_NFIELD+2];
1128 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1129
1130 /* Verify the checksum on the log-summary header. If it fails,
1131 ** recover the log-summary from the log file.
1132 */
1133 logChecksumBytes((u8*)aHdr, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aCksum);
1134 if( aCksum[0]!=aHdr[LOGSUMMARY_HDR_NFIELD]
1135 || aCksum[1]!=aHdr[LOGSUMMARY_HDR_NFIELD+1]
1136 ){
1137 rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
1138 memcpy(aHdr, pLog->pSummary->aData, sizeof(aHdr));
1139 *pChanged = 1;
1140 }
1141 if( rc==SQLITE_OK ){
dan7c246102010-04-12 19:00:29 +00001142 if( memcmp(&pLog->hdr, aHdr, sizeof(LogSummaryHdr)) ){
1143 *pChanged = 1;
1144 memcpy(&pLog->hdr, aHdr, LOGSUMMARY_HDR_NFIELD*sizeof(u32));
1145 }
1146 }
1147 logLeaveMutex(pLog);
1148 }
dan64d039e2010-04-13 19:27:31 +00001149
1150 if( rc!=SQLITE_OK ){
1151 /* An error occured while attempting log recovery. */
1152 sqlite3LogCloseSnapshot(pLog);
1153 }
dan7c246102010-04-12 19:00:29 +00001154 }
1155 return rc;
1156}
1157
1158/*
1159** Unlock the current snapshot.
1160*/
1161void sqlite3LogCloseSnapshot(Log *pLog){
dan64d039e2010-04-13 19:27:31 +00001162 if( pLog->isLocked ){
1163 assert( pLog->isLocked==LOG_REGION_A || pLog->isLocked==LOG_REGION_D );
1164 logLockRegion(pLog, pLog->isLocked, LOG_UNLOCK);
1165 }
dan7c246102010-04-12 19:00:29 +00001166 pLog->isLocked = 0;
1167}
1168
1169
1170
1171/*
1172** Read a page from the log, if it is present.
1173*/
1174int sqlite3LogRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
1175 u32 iRead = 0;
1176 u32 *aData = pLog->pSummary->aData;
1177 int iFrame = (pLog->hdr.iLastPg & 0xFFFFFF00);
1178
1179 /* Do a linear search of the unindexed block of page-numbers (if any)
1180 ** at the end of the log-summary. An alternative to this would be to
1181 ** build an index in private memory each time a read transaction is
1182 ** opened on a new snapshot.
1183 */
1184 if( pLog->hdr.iLastPg ){
1185 u32 *pi = &aData[logSummaryEntry(pLog->hdr.iLastPg)];
1186 u32 *piStop = pi - (pLog->hdr.iLastPg & 0xFF);
1187 while( *pi!=pgno && pi!=piStop ) pi--;
1188 if( pi!=piStop ){
1189 iRead = (pi-piStop) + iFrame;
1190 }
1191 }
1192 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1193
1194 while( iRead==0 && iFrame>0 ){
1195 int iLow = 0;
1196 int iHigh = 255;
1197 u32 *aFrame;
1198 u8 *aIndex;
1199
1200 iFrame -= 256;
1201 aFrame = &aData[logSummaryEntry(iFrame+1)];
1202 aIndex = (u8 *)&aFrame[256];
1203
1204 while( iLow<=iHigh ){
1205 int iTest = (iLow+iHigh)>>1;
1206 u32 iPg = aFrame[aIndex[iTest]];
1207
1208 if( iPg==pgno ){
1209 iRead = iFrame + 1 + aIndex[iTest];
1210 break;
1211 }
1212 else if( iPg<pgno ){
1213 iLow = iTest+1;
1214 }else{
1215 iHigh = iTest-1;
1216 }
1217 }
1218 }
1219 assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno );
1220
1221 /* If iRead is non-zero, then it is the log frame number that contains the
1222 ** required page. Read and return data from the log file.
1223 */
1224 if( iRead ){
1225 i64 iOffset = (iRead-1) * (pLog->hdr.pgsz+LOG_FRAME_HDRSIZE);
1226 iOffset += LOG_FRAME_HDRSIZE;
1227 *pInLog = 1;
1228 return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
1229 }
1230
1231 *pInLog = 0;
1232 return SQLITE_OK;
1233}
1234
1235
1236/*
1237** Set *pPgno to the size of the database file (or zero, if unknown).
1238*/
1239void sqlite3LogMaxpgno(Log *pLog, Pgno *pPgno){
1240 assert( pLog->isLocked );
1241 *pPgno = pLog->hdr.nPage;
1242}
1243
1244/*
1245** The caller must hold at least a RESERVED lock on the database file
1246** when invoking this function.
1247**
1248** This function returns SQLITE_OK if the caller may write to the database.
1249** Otherwise, if the caller is operating on a snapshot that has already
1250** been overwritten by another writer, SQLITE_OBE is returned.
1251*/
1252int sqlite3LogWriteLock(Log *pLog, int op){
1253 assert( pLog->isLocked );
1254 if( op ){
dan64d039e2010-04-13 19:27:31 +00001255
1256 /* Obtain the writer lock */
1257 int rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
1258 if( rc!=SQLITE_OK ){
1259 return rc;
1260 }
1261
dan7c246102010-04-12 19:00:29 +00001262 if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr)) ){
1263 return SQLITE_BUSY;
1264 }
1265 pLog->isWriteLocked = 1;
dan64d039e2010-04-13 19:27:31 +00001266
dan7c246102010-04-12 19:00:29 +00001267 }else if( pLog->isWriteLocked ){
dan64d039e2010-04-13 19:27:31 +00001268 logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
dan7c246102010-04-12 19:00:29 +00001269 memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
1270 pLog->isWriteLocked = 0;
1271 }
1272 return SQLITE_OK;
1273}
1274
1275/*
1276** Write a set of frames to the log. The caller must hold at least a
1277** RESERVED lock on the database file.
1278*/
1279int sqlite3LogFrames(
1280 Log *pLog, /* Log handle to write to */
1281 int nPgsz, /* Database page-size in bytes */
1282 PgHdr *pList, /* List of dirty pages to write */
1283 Pgno nTruncate, /* Database size after this commit */
1284 int isCommit, /* True if this is a commit */
1285 int isSync /* True to sync the log file */
1286){
1287 /* Each frame has a 20 byte header, as follows:
1288 **
1289 ** + Pseudo-random salt (4 bytes)
1290 ** + Page number (4 bytes)
1291 ** + New database size, or 0 if not a commit frame (4 bytes)
1292 ** + Checksum (CHECKSUM_BYTES bytes);
1293 **
1294 ** The checksum is computed based on the following:
1295 **
1296 ** + The previous checksum, or {2, 3} for the first frame in the log.
1297 ** + The non-checksum fields of the frame header, and
1298 ** + The frame contents (page data).
1299 **
1300 ** This format must also be understood by the code in logSummaryRecover().
1301 ** The size of the frame header is used by LogRead() and LogCheckpoint().
1302 */
1303 int rc; /* Used to catch return codes */
1304 u32 iFrame; /* Next frame address */
1305 u8 aFrame[LOG_FRAME_HDRSIZE];
1306 PgHdr *p; /* Iterator to run through pList with. */
1307 u32 aCksum[2];
1308
1309 PgHdr *pLast; /* Last frame in list */
1310 int nLast = 0; /* Number of extra copies of last page */
1311
1312 assert( LOG_FRAME_HDRSIZE==(4 * 3 + LOG_CKSM_BYTES) );
1313 assert( pList );
1314
1315 aCksum[0] = pLog->hdr.iCheck1;
1316 aCksum[1] = pLog->hdr.iCheck2;
1317
1318 /* Write the log file. */
1319 iFrame = pLog->hdr.iLastPg;
1320 for(p=pList; p; p=p->pDirty){
1321 u32 nDbsize; /* Db-size field for frame header */
1322 i64 iOffset; /* Write offset in log file */
1323
1324 iFrame++;
1325 iOffset = (iFrame-1) * (nPgsz+sizeof(aFrame));
1326
1327 /* Populate and write the frame header */
1328 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
1329 logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1330 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1331 if( rc!=SQLITE_OK ){
1332 return rc;
1333 }
1334
1335 /* Write the page data */
1336 rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
1337 if( rc!=SQLITE_OK ){
1338 return rc;
1339 }
1340 pLast = p;
1341 }
1342
1343 /* Sync the log file if the 'isSync' flag was specified. */
1344 if( isSync ){
1345#if 0
1346 i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
1347 i64 iOffset = iFrame * (nPgsz+sizeof(aFrame));
1348
1349 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1350 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1351 }
1352 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1353 while( iOffset<iSegment ){
1354 logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1355 rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
1356 if( rc!=SQLITE_OK ){
1357 return rc;
1358 }
1359
1360 iOffset += LOG_FRAME_HDRSIZE;
1361 rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
1362 if( rc!=SQLITE_OK ){
1363 return rc;
1364 }
1365 nLast++;
1366 iOffset += nPgsz;
1367 }
1368#endif
1369
1370 rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
1371 if( rc!=SQLITE_OK ){
1372 return rc;
1373 }
1374 }
1375
1376 /* Append data to the log summary. It is not necessary to lock the
1377 ** log-summary to do this as the RESERVED lock held on the db file
1378 ** guarantees that there are no other writers, and no data that may
1379 ** be in use by existing readers is being overwritten.
1380 */
1381 iFrame = pLog->hdr.iLastPg;
1382 for(p=pList; p; p=p->pDirty){
1383 iFrame++;
1384 logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
1385 }
1386 while( nLast>0 ){
1387 iFrame++;
1388 nLast--;
1389 logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
1390 }
1391
1392 /* Update the private copy of the header. */
1393 pLog->hdr.pgsz = nPgsz;
1394 pLog->hdr.iLastPg = iFrame;
1395 if( isCommit ){
1396 pLog->hdr.iChange++;
1397 pLog->hdr.nPage = nTruncate;
1398 }
1399 pLog->hdr.iCheck1 = aCksum[0];
1400 pLog->hdr.iCheck2 = aCksum[1];
1401
1402 /* If this is a commit, update the log-summary header too. */
1403 if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
1404 logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
1405 logLeaveMutex(pLog);
1406 }
1407
1408 return SQLITE_OK;
1409}
1410
1411/*
1412** Checkpoint the database. When this function is called the caller
1413** must hold an exclusive lock on the database file.
1414*/
1415int sqlite3LogCheckpoint(
1416 Log *pLog, /* Log connection */
1417 sqlite3_file *pFd, /* File descriptor open on db file */
dan64d039e2010-04-13 19:27:31 +00001418 u8 *zBuf, /* Temporary buffer to use */
1419 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1420 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001421){
dan64d039e2010-04-13 19:27:31 +00001422 int rc;
dan7c246102010-04-12 19:00:29 +00001423
dan64d039e2010-04-13 19:27:31 +00001424 do {
1425 rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
1426 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1427 if( rc!=SQLITE_OK ) return rc;
1428
1429 do {
1430 rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
1431 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
1432 if( rc!=SQLITE_OK ) return rc;
dan7c246102010-04-12 19:00:29 +00001433
dan64d039e2010-04-13 19:27:31 +00001434 rc = logCheckpoint(pLog, pFd, zBuf);
1435
1436 logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
1437 return rc;
dan7c246102010-04-12 19:00:29 +00001438}
1439