blob: baa2da2b6498ce09ea22e4723fbbfd8cef18aa92 [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001/*
drh7ed91f22010-04-29 22:34:07 +00002** 2010 February 1
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
13** This file contains the implementation of a write-ahead log file used in
dan7c246102010-04-12 19:00:29 +000014** "journal_mode=wal" mode.
15*/
dan5cf53532010-05-01 16:40:20 +000016#ifndef SQLITE_OMIT_WAL
17
drh7ed91f22010-04-29 22:34:07 +000018#include "wal.h"
dan7c246102010-04-12 19:00:29 +000019
dan4b64c1e2010-04-27 18:49:54 +000020
dan97a31352010-04-16 13:59:31 +000021/*
drh7ed91f22010-04-29 22:34:07 +000022** WRITE-AHEAD LOG (WAL) FILE FORMAT
dan97a31352010-04-16 13:59:31 +000023**
drh7ed91f22010-04-29 22:34:07 +000024** A wal file consists of a header followed by zero or more "frames".
25** The header is 12 bytes in size and consists of the following three
dan97a31352010-04-16 13:59:31 +000026** big-endian 32-bit unsigned integer values:
27**
dan3de777f2010-04-17 12:31:37 +000028** 0: Database page size,
29** 4: Randomly selected salt value 1,
30** 8: Randomly selected salt value 2.
dan97a31352010-04-16 13:59:31 +000031**
drh7ed91f22010-04-29 22:34:07 +000032** Immediately following the header are zero or more frames. Each
dan97a31352010-04-16 13:59:31 +000033** frame itself consists of a 16-byte header followed by a <page-size> bytes
34** of page data. The header is broken into 4 big-endian 32-bit unsigned
35** integer values, as follows:
36**
dan3de777f2010-04-17 12:31:37 +000037** 0: Page number.
38** 4: For commit records, the size of the database image in pages
dan97a31352010-04-16 13:59:31 +000039** after the commit. For all other records, zero.
dan3de777f2010-04-17 12:31:37 +000040** 8: Checksum value 1.
dan97a31352010-04-16 13:59:31 +000041** 12: Checksum value 2.
42*/
43
44/*
drh7ed91f22010-04-29 22:34:07 +000045** WAL-INDEX FILE FORMAT
dan97a31352010-04-16 13:59:31 +000046**
drh7ed91f22010-04-29 22:34:07 +000047** The wal-index file consists of a 32-byte header region, followed by an
48** 8-byte region that contains no useful data (used to apply byte-range locks
danff207012010-04-24 04:49:15 +000049** to), followed by the data region.
50**
51** The contents of both the header and data region are specified in terms
52** of 1, 2 and 4 byte unsigned integers. All integers are stored in
drh7ed91f22010-04-29 22:34:07 +000053** machine-endian order. The wal-index is not a persistent file and
54** so it does not need to be portable across architectures.
danff207012010-04-24 04:49:15 +000055**
drh7ed91f22010-04-29 22:34:07 +000056** A wal-index file is essentially a shadow-pager map. It contains a
57** mapping from database page number to the set of locations in the wal
danff207012010-04-24 04:49:15 +000058** file that contain versions of the database page. When a database
drh7ed91f22010-04-29 22:34:07 +000059** client needs to read a page of data, it first queries the wal-index
danff207012010-04-24 04:49:15 +000060** file to determine if the required version of the page is stored in
drh7ed91f22010-04-29 22:34:07 +000061** the wal. If so, the page is read from the wal. If not, the page is
62** read from the database file.
danff207012010-04-24 04:49:15 +000063**
drh7ed91f22010-04-29 22:34:07 +000064** Whenever a transaction is appended to the wal or a checkpoint transfers
65** data from the wal into the database file, the wal-index is
danff207012010-04-24 04:49:15 +000066** updated accordingly.
67**
drh7ed91f22010-04-29 22:34:07 +000068** The fields in the wal-index file header are described in the comment
69** directly above the definition of struct WalIndexHdr (see below).
70** Immediately following the fields in the WalIndexHdr structure is
danff207012010-04-24 04:49:15 +000071** an 8 byte checksum based on the contents of the header. This field is
drh7ed91f22010-04-29 22:34:07 +000072** not the same as the iCheck1 and iCheck2 fields of the WalIndexHdr.
dan97a31352010-04-16 13:59:31 +000073*/
74
drh7ed91f22010-04-29 22:34:07 +000075/* Object declarations */
76typedef struct WalIndexHdr WalIndexHdr;
77typedef struct WalIterator WalIterator;
dan7c246102010-04-12 19:00:29 +000078
79
80/*
drh7ed91f22010-04-29 22:34:07 +000081** The following object stores a copy of the wal-index header.
dan7c246102010-04-12 19:00:29 +000082**
83** Member variables iCheck1 and iCheck2 contain the checksum for the
drh7ed91f22010-04-29 22:34:07 +000084** last frame written to the wal, or 2 and 3 respectively if the log
dan7c246102010-04-12 19:00:29 +000085** is currently empty.
86*/
drh7ed91f22010-04-29 22:34:07 +000087struct WalIndexHdr {
dan7c246102010-04-12 19:00:29 +000088 u32 iChange; /* Counter incremented each transaction */
89 u32 pgsz; /* Database page size in bytes */
90 u32 iLastPg; /* Address of last valid frame in log */
91 u32 nPage; /* Size of database in pages */
92 u32 iCheck1; /* Checkpoint value 1 */
93 u32 iCheck2; /* Checkpoint value 2 */
94};
95
/* Number of u32 fields in a serialized WalIndexHdr object. */
#define WALINDEX_HDR_NFIELD (sizeof(WalIndexHdr) / sizeof(u32))

/* A block of WALINDEX_LOCK_RESERVED bytes beginning at
** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems only
** feature mandatory file-locks, we do not read or write data from the
** region of the file on which locks are applied.
**
** WALINDEX_LOCK_OFFSET skips over the WalIndexHdr fields and the two
** u32 header-checksum words that follow them.
*/
#define WALINDEX_LOCK_OFFSET   ((sizeof(WalIndexHdr))+2*sizeof(u32))
#define WALINDEX_LOCK_RESERVED 8

/* Size of header before each frame in wal */
#define WAL_FRAME_HDRSIZE 16

/* Size of write ahead log header */
#define WAL_HDRSIZE 12

/*
** Return the offset of frame iFrame in the write-ahead log file,
** assuming a database page size of pgsz bytes. The offset returned
** is to the start of the write-ahead log frame-header.
**
** Frames are numbered starting at 1. The multiplication is carried out
** in 64-bit arithmetic (the result has type i64) so that the offset does
** not wrap around when the log grows past 4GiB.
*/
#define walFrameOffset(iFrame, pgsz) (                               \
  WAL_HDRSIZE + ((iFrame)-1)*(i64)((pgsz)+WAL_FRAME_HDRSIZE)         \
)
dan7c246102010-04-12 19:00:29 +0000121
122/*
drh7ed91f22010-04-29 22:34:07 +0000123** An open write-ahead log file is represented by an instance of the
124** following object.
dance4f05f2010-04-22 19:14:13 +0000125*/
drh7ed91f22010-04-29 22:34:07 +0000126struct Wal {
127 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
128 sqlite3_file *pFd; /* File handle for WAL file */
129 u32 iCallback; /* Value to pass to log callback (or 0) */
130 sqlite3_shm *pWIndex; /* The open wal-index file */
drh5530b762010-04-30 14:39:50 +0000131 int szWIndex; /* Size of the wal-index that is mapped in mem */
drh7ed91f22010-04-29 22:34:07 +0000132 u32 *pWiData; /* Pointer to wal-index content in memory */
133 u8 lockState; /* SQLITE_SHM_xxxx constant showing lock state */
134 u8 readerType; /* SQLITE_SHM_READ or SQLITE_SHM_READ_FULL */
135 WalIndexHdr hdr; /* Wal-index for current snapshot */
drh2d536e12010-05-01 20:17:30 +0000136 char *zName; /* Name of underlying storage */
dan7c246102010-04-12 19:00:29 +0000137};
138
dan64d039e2010-04-13 19:27:31 +0000139
dan7c246102010-04-12 19:00:29 +0000140/*
141** This structure is used to implement an iterator that iterates through
142** all frames in the log in database page order. Where two or more frames
143** correspond to the same database page, the iterator visits only the
144** frame most recently written to the log.
145**
146** The internals of this structure are only accessed by:
147**
drh7ed91f22010-04-29 22:34:07 +0000148** walIteratorInit() - Create a new iterator,
149** walIteratorNext() - Step an iterator,
150** walIteratorFree() - Free an iterator.
dan7c246102010-04-12 19:00:29 +0000151**
drh7ed91f22010-04-29 22:34:07 +0000152** This functionality is used by the checkpoint code (see walCheckpoint()).
dan7c246102010-04-12 19:00:29 +0000153*/
drh7ed91f22010-04-29 22:34:07 +0000154struct WalIterator {
155 int nSegment; /* Size of WalIterator.aSegment[] array */
dan7c246102010-04-12 19:00:29 +0000156 int nFinal; /* Elements in segment nSegment-1 */
drh7ed91f22010-04-29 22:34:07 +0000157 struct WalSegment {
dan7c246102010-04-12 19:00:29 +0000158 int iNext; /* Next aIndex index */
159 u8 *aIndex; /* Pointer to index array */
160 u32 *aDbPage; /* Pointer to db page array */
161 } aSegment[1];
162};
163
dan64d039e2010-04-13 19:27:31 +0000164
dan7c246102010-04-12 19:00:29 +0000165/*
166** Generate an 8 byte checksum based on the data in array aByte[] and the
167** initial values of aCksum[0] and aCksum[1]. The checksum is written into
168** aCksum[] before returning.
dan56d95912010-04-24 19:07:29 +0000169**
170** The range of bytes to checksum is treated as an array of 32-bit
171** little-endian unsigned integers. For each integer X in the array, from
172** start to finish, do the following:
173**
174** aCksum[0] += X;
175** aCksum[1] += aCksum[0];
176**
177** For the calculation above, use 64-bit unsigned accumulators. Before
178** returning, truncate the values to 32-bits as follows:
179**
180** aCksum[0] = (u32)(aCksum[0] + (aCksum[0]>>24));
181** aCksum[1] = (u32)(aCksum[1] + (aCksum[1]>>24));
dan7c246102010-04-12 19:00:29 +0000182*/
drh7ed91f22010-04-29 22:34:07 +0000183static void walChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000184 u64 sum1 = aCksum[0];
185 u64 sum2 = aCksum[1];
186 u32 *a32 = (u32 *)aByte;
187 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000188
dan7c246102010-04-12 19:00:29 +0000189 assert( (nByte&0x00000003)==0 );
190
dance4f05f2010-04-22 19:14:13 +0000191 if( SQLITE_LITTLEENDIAN ){
192#ifdef SQLITE_DEBUG
193 u8 *a = (u8 *)a32;
194 assert( *a32==(a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24)) );
195#endif
196 do {
197 sum1 += *a32;
198 sum2 += sum1;
199 } while( ++a32<aEnd );
200 }else{
201 do {
202 u8 *a = (u8*)a32;
203 sum1 += a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24);
204 sum2 += sum1;
205 } while( ++a32<aEnd );
206 }
dan7c246102010-04-12 19:00:29 +0000207
dan39c79f52010-04-15 10:58:51 +0000208 aCksum[0] = sum1 + (sum1>>24);
209 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000210}
211
212/*
drh7ed91f22010-04-29 22:34:07 +0000213** Attempt to change the lock status.
dan7c246102010-04-12 19:00:29 +0000214**
drh7ed91f22010-04-29 22:34:07 +0000215** When changing the lock status to SQLITE_SHM_READ, store the
216** type of reader lock (either SQLITE_SHM_READ or SQLITE_SHM_READ_FULL)
217** in pWal->readerType.
dan7c246102010-04-12 19:00:29 +0000218*/
drh7ed91f22010-04-29 22:34:07 +0000219static int walSetLock(Wal *pWal, int desiredStatus){
220 int rc, got;
221 if( pWal->lockState==desiredStatus ) return SQLITE_OK;
danff6dfc72010-05-06 12:15:48 +0000222 got = pWal->lockState;
drh1fbe0f22010-05-03 16:30:27 +0000223 rc = pWal->pVfs->xShmLock(pWal->pVfs, pWal->pWIndex, desiredStatus, &got);
drh49156b22010-04-30 16:12:04 +0000224 pWal->lockState = got;
225 if( got==SQLITE_SHM_READ_FULL || got==SQLITE_SHM_READ ){
226 pWal->readerType = got;
227 pWal->lockState = SQLITE_SHM_READ;
dan7c246102010-04-12 19:00:29 +0000228 }
229 return rc;
230}
231
drh7ed91f22010-04-29 22:34:07 +0000232/*
233** Update the header of the wal-index file.
234*/
235static void walIndexWriteHdr(Wal *pWal, WalIndexHdr *pHdr){
236 u32 *aHdr = pWal->pWiData; /* Write header here */
237 u32 *aCksum = &aHdr[WALINDEX_HDR_NFIELD]; /* Write header cksum here */
danff207012010-04-24 04:49:15 +0000238
drh7ed91f22010-04-29 22:34:07 +0000239 assert( WALINDEX_HDR_NFIELD==sizeof(WalIndexHdr)/4 );
240 assert( aHdr!=0 );
241 memcpy(aHdr, pHdr, sizeof(WalIndexHdr));
danff207012010-04-24 04:49:15 +0000242 aCksum[0] = aCksum[1] = 1;
drh7ed91f22010-04-29 22:34:07 +0000243 walChecksumBytes((u8 *)aHdr, sizeof(WalIndexHdr), aCksum);
dan7c246102010-04-12 19:00:29 +0000244}
245
246/*
247** This function encodes a single frame header and writes it to a buffer
drh7ed91f22010-04-29 22:34:07 +0000248** supplied by the caller. A frame-header is made up of a series of
dan7c246102010-04-12 19:00:29 +0000249** 4-byte big-endian integers, as follows:
250**
251** 0: Database page size in bytes.
252** 4: Page number.
253** 8: New database size (for commit frames, otherwise zero).
254** 12: Frame checksum 1.
255** 16: Frame checksum 2.
256*/
drh7ed91f22010-04-29 22:34:07 +0000257static void walEncodeFrame(
dan7c246102010-04-12 19:00:29 +0000258 u32 *aCksum, /* IN/OUT: Checksum values */
259 u32 iPage, /* Database page number for frame */
260 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
261 int nData, /* Database page size (size of aData[]) */
262 u8 *aData, /* Pointer to page data (for checksum) */
263 u8 *aFrame /* OUT: Write encoded frame here */
264){
drh7ed91f22010-04-29 22:34:07 +0000265 assert( WAL_FRAME_HDRSIZE==16 );
dan7c246102010-04-12 19:00:29 +0000266
dan97a31352010-04-16 13:59:31 +0000267 sqlite3Put4byte(&aFrame[0], iPage);
268 sqlite3Put4byte(&aFrame[4], nTruncate);
dan7c246102010-04-12 19:00:29 +0000269
drh7ed91f22010-04-29 22:34:07 +0000270 walChecksumBytes(aFrame, 8, aCksum);
271 walChecksumBytes(aData, nData, aCksum);
dan7c246102010-04-12 19:00:29 +0000272
dan97a31352010-04-16 13:59:31 +0000273 sqlite3Put4byte(&aFrame[8], aCksum[0]);
274 sqlite3Put4byte(&aFrame[12], aCksum[1]);
dan7c246102010-04-12 19:00:29 +0000275}
276
277/*
278** Return 1 and populate *piPage, *pnTruncate and aCksum if the
279** frame checksum looks Ok. Otherwise return 0.
280*/
drh7ed91f22010-04-29 22:34:07 +0000281static int walDecodeFrame(
dan7c246102010-04-12 19:00:29 +0000282 u32 *aCksum, /* IN/OUT: Checksum values */
283 u32 *piPage, /* OUT: Database page number for frame */
284 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
285 int nData, /* Database page size (size of aData[]) */
286 u8 *aData, /* Pointer to page data (for checksum) */
287 u8 *aFrame /* Frame data */
288){
drh7ed91f22010-04-29 22:34:07 +0000289 assert( WAL_FRAME_HDRSIZE==16 );
dan4a4b01d2010-04-16 11:30:18 +0000290
drh7ed91f22010-04-29 22:34:07 +0000291 walChecksumBytes(aFrame, 8, aCksum);
292 walChecksumBytes(aData, nData, aCksum);
dan7c246102010-04-12 19:00:29 +0000293
dan97a31352010-04-16 13:59:31 +0000294 if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
295 || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
dan7c246102010-04-12 19:00:29 +0000296 ){
297 /* Checksum failed. */
298 return 0;
299 }
300
dan97a31352010-04-16 13:59:31 +0000301 *piPage = sqlite3Get4byte(&aFrame[0]);
302 *pnTruncate = sqlite3Get4byte(&aFrame[4]);
dan7c246102010-04-12 19:00:29 +0000303 return 1;
304}
305
drh7ed91f22010-04-29 22:34:07 +0000306static void walMergesort8(
307 Pgno *aContent, /* Pages in wal */
dan7c246102010-04-12 19:00:29 +0000308 u8 *aBuffer, /* Buffer of at least *pnList items to use */
309 u8 *aList, /* IN/OUT: List to sort */
310 int *pnList /* IN/OUT: Number of elements in aList[] */
311){
312 int nList = *pnList;
313 if( nList>1 ){
314 int nLeft = nList / 2; /* Elements in left list */
315 int nRight = nList - nLeft; /* Elements in right list */
316 u8 *aLeft = aList; /* Left list */
317 u8 *aRight = &aList[nLeft]; /* Right list */
318 int iLeft = 0; /* Current index in aLeft */
319 int iRight = 0; /* Current index in aright */
320 int iOut = 0; /* Current index in output buffer */
321
322 /* TODO: Change to non-recursive version. */
drh7ed91f22010-04-29 22:34:07 +0000323 walMergesort8(aContent, aBuffer, aLeft, &nLeft);
324 walMergesort8(aContent, aBuffer, aRight, &nRight);
dan7c246102010-04-12 19:00:29 +0000325
326 while( iRight<nRight || iLeft<nLeft ){
327 u8 logpage;
328 Pgno dbpage;
329
330 if( (iLeft<nLeft)
331 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
332 ){
333 logpage = aLeft[iLeft++];
334 }else{
335 logpage = aRight[iRight++];
336 }
337 dbpage = aContent[logpage];
338
339 aBuffer[iOut++] = logpage;
340 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
341
342 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
343 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
344 }
345 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
346 *pnList = iOut;
347 }
348
349#ifdef SQLITE_DEBUG
350 {
351 int i;
352 for(i=1; i<*pnList; i++){
353 assert( aContent[aList[i]] > aContent[aList[i-1]] );
354 }
355 }
356#endif
357}
358
359
360/*
drh7ed91f22010-04-29 22:34:07 +0000361** Return the index in the WalIndex.aData array that corresponds to
362** frame iFrame. The wal-index file consists of a header, followed by
dan7c246102010-04-12 19:00:29 +0000363** alternating "map" and "index" blocks.
364*/
drh7ed91f22010-04-29 22:34:07 +0000365static int walIndexEntry(u32 iFrame){
danff207012010-04-24 04:49:15 +0000366 return (
drh7ed91f22010-04-29 22:34:07 +0000367 (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
danff207012010-04-24 04:49:15 +0000368 + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
369 + iFrame-1 /* Db page numbers that occur before iFrame */
370 );
dan7c246102010-04-12 19:00:29 +0000371}
372
drh7ed91f22010-04-29 22:34:07 +0000373/*
danb7d53f52010-05-06 17:28:08 +0000374** Return the minimum mapping size in bytes that can be used to read the
375** wal-index up to and including frame iFrame. If iFrame is the last frame
376** in a block of 256 frames, the returned byte-count includes the space
377** required by the 256-byte index block.
378*/
379static int walMappingSize(u32 iFrame){
380 return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED
381 + iFrame*sizeof(u32)
382 + (iFrame>>8)*256
383 );
384}
385
386/*
drh5530b762010-04-30 14:39:50 +0000387** Release our reference to the wal-index memory map, if we are holding
388** it.
drh7ed91f22010-04-29 22:34:07 +0000389*/
390static void walIndexUnmap(Wal *pWal){
391 if( pWal->pWiData ){
drh1fbe0f22010-05-03 16:30:27 +0000392 pWal->pVfs->xShmRelease(pWal->pVfs, pWal->pWIndex);
drh7ed91f22010-04-29 22:34:07 +0000393 pWal->pWiData = 0;
394 }
395}
dan7c246102010-04-12 19:00:29 +0000396
397/*
drh5530b762010-04-30 14:39:50 +0000398** Map the wal-index file into memory if it isn't already.
399**
400** The reqSize parameter is the minimum required size of the mapping.
401** A value of -1 means "don't care". The reqSize parameter is ignored
402** if the mapping is already held.
drh7ed91f22010-04-29 22:34:07 +0000403*/
drh5530b762010-04-30 14:39:50 +0000404static int walIndexMap(Wal *pWal, int reqSize){
405 int rc = SQLITE_OK;
406 if( pWal->pWiData==0 ){
drh1fbe0f22010-05-03 16:30:27 +0000407 rc = pWal->pVfs->xShmGet(pWal->pVfs, pWal->pWIndex, reqSize,
408 &pWal->szWIndex, (void**)(char*)&pWal->pWiData);
drh5530b762010-04-30 14:39:50 +0000409 if( rc==SQLITE_OK && pWal->pWiData==0 ){
410 /* Make sure pWal->pWiData is not NULL while we are holding the
411 ** lock on the mapping. */
412 assert( pWal->szWIndex==0 );
413 pWal->pWiData = &pWal->iCallback;
414 }
dand41a29a2010-05-06 15:56:28 +0000415 assert( rc==SQLITE_OK || pWal->pWiData==0 );
drh79e6c782010-04-30 02:13:26 +0000416 }
417 return rc;
418}
419
420/*
drh5530b762010-04-30 14:39:50 +0000421** Remap the wal-index so that the mapping covers the full size
422** of the underlying file.
423**
424** If enlargeTo is non-negative, then increase the size of the underlying
425** storage to be at least as big as enlargeTo before remapping.
drh79e6c782010-04-30 02:13:26 +0000426*/
drh5530b762010-04-30 14:39:50 +0000427static int walIndexRemap(Wal *pWal, int enlargeTo){
428 int rc;
429 int sz;
drh1fbe0f22010-05-03 16:30:27 +0000430 rc = pWal->pVfs->xShmSize(pWal->pVfs, pWal->pWIndex, enlargeTo, &sz);
drh5530b762010-04-30 14:39:50 +0000431 if( rc==SQLITE_OK && sz>pWal->szWIndex ){
432 walIndexUnmap(pWal);
433 rc = walIndexMap(pWal, sz);
434 }
drh7ed91f22010-04-29 22:34:07 +0000435 return rc;
436}
437
438/*
439** Increment by which to increase the wal-index file size.
440*/
441#define WALINDEX_MMAP_INCREMENT (64*1024)
442
443/*
444** Set an entry in the wal-index map to map log frame iFrame to db
445** page iPage. Values are always appended to the wal-index (i.e. the
dan7c246102010-04-12 19:00:29 +0000446** value of iFrame is always exactly one more than the value passed to
447** the previous call), but that restriction is not enforced or asserted
448** here.
449*/
drh7ed91f22010-04-29 22:34:07 +0000450static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
danc7991bd2010-05-05 19:04:59 +0000451 int rc;
drh7ed91f22010-04-29 22:34:07 +0000452 u32 iSlot = walIndexEntry(iFrame);
453
danc7991bd2010-05-05 19:04:59 +0000454 rc = walIndexMap(pWal, -1);
455 if( rc!=SQLITE_OK ){
456 return rc;
457 }
danc9d53db2010-04-30 16:50:00 +0000458 while( ((iSlot+128)*sizeof(u32))>=pWal->szWIndex ){
danc9d53db2010-04-30 16:50:00 +0000459 int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT;
dance4f05f2010-04-22 19:14:13 +0000460
drh5530b762010-04-30 14:39:50 +0000461 /* Enlarge the storage, then remap it. */
drh7ed91f22010-04-29 22:34:07 +0000462 rc = walIndexRemap(pWal, nByte);
dan31f98fc2010-04-27 05:42:32 +0000463 if( rc!=SQLITE_OK ){
464 return rc;
465 }
dance4f05f2010-04-22 19:14:13 +0000466 }
467
drh7ed91f22010-04-29 22:34:07 +0000468 /* Set the wal-index entry itself */
469 pWal->pWiData[iSlot] = iPage;
dan7c246102010-04-12 19:00:29 +0000470
471 /* If the frame number is a multiple of 256 (frames are numbered starting
472 ** at 1), build an index of the most recently added 256 frames.
473 */
474 if( (iFrame&0x000000FF)==0 ){
475 int i; /* Iterator used while initializing aIndex */
476 u32 *aFrame; /* Pointer to array of 256 frames */
477 int nIndex; /* Number of entries in index */
478 u8 *aIndex; /* 256 bytes to build index in */
479 u8 *aTmp; /* Scratch space to use while sorting */
480
drh7ed91f22010-04-29 22:34:07 +0000481 aFrame = &pWal->pWiData[iSlot-255];
482 aIndex = (u8 *)&pWal->pWiData[iSlot+1];
dan7c246102010-04-12 19:00:29 +0000483 aTmp = &aIndex[256];
484
485 nIndex = 256;
486 for(i=0; i<256; i++) aIndex[i] = (u8)i;
drh7ed91f22010-04-29 22:34:07 +0000487 walMergesort8(aFrame, aTmp, aIndex, &nIndex);
dan7c246102010-04-12 19:00:29 +0000488 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
489 }
dan31f98fc2010-04-27 05:42:32 +0000490
491 return SQLITE_OK;
dan7c246102010-04-12 19:00:29 +0000492}
493
494
495/*
drh7ed91f22010-04-29 22:34:07 +0000496** Recover the wal-index by reading the write-ahead log file.
497** The caller must hold RECOVER lock on the wal-index file.
dan7c246102010-04-12 19:00:29 +0000498*/
drh7ed91f22010-04-29 22:34:07 +0000499static int walIndexRecover(Wal *pWal){
dan7c246102010-04-12 19:00:29 +0000500 int rc; /* Return Code */
501 i64 nSize; /* Size of log file */
drh7ed91f22010-04-29 22:34:07 +0000502 WalIndexHdr hdr; /* Recovered wal-index header */
dan7c246102010-04-12 19:00:29 +0000503
drh7ed91f22010-04-29 22:34:07 +0000504 assert( pWal->lockState==SQLITE_SHM_RECOVER );
dan7c246102010-04-12 19:00:29 +0000505 memset(&hdr, 0, sizeof(hdr));
506
drh7ed91f22010-04-29 22:34:07 +0000507 rc = sqlite3OsFileSize(pWal->pFd, &nSize);
dan7c246102010-04-12 19:00:29 +0000508 if( rc!=SQLITE_OK ){
509 return rc;
510 }
511
drh7ed91f22010-04-29 22:34:07 +0000512 if( nSize>WAL_FRAME_HDRSIZE ){
513 u8 aBuf[WAL_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
dan7c246102010-04-12 19:00:29 +0000514 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
515 int nFrame; /* Number of bytes at aFrame */
516 u8 *aData; /* Pointer to data part of aFrame buffer */
517 int iFrame; /* Index of last frame read */
518 i64 iOffset; /* Next offset to read from log file */
519 int nPgsz; /* Page size according to the log */
dan97a31352010-04-16 13:59:31 +0000520 u32 aCksum[2]; /* Running checksum */
dan7c246102010-04-12 19:00:29 +0000521
522 /* Read in the first frame header in the file (to determine the
523 ** database page size).
524 */
drh7ed91f22010-04-29 22:34:07 +0000525 rc = sqlite3OsRead(pWal->pFd, aBuf, WAL_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000526 if( rc!=SQLITE_OK ){
527 return rc;
528 }
529
530 /* If the database page size is not a power of two, or is greater than
531 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
532 */
533 nPgsz = sqlite3Get4byte(&aBuf[0]);
dance4f05f2010-04-22 19:14:13 +0000534 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE || nPgsz<512 ){
dan7c246102010-04-12 19:00:29 +0000535 goto finished;
536 }
dan97a31352010-04-16 13:59:31 +0000537 aCksum[0] = sqlite3Get4byte(&aBuf[4]);
538 aCksum[1] = sqlite3Get4byte(&aBuf[8]);
dan7c246102010-04-12 19:00:29 +0000539
540 /* Malloc a buffer to read frames into. */
drh7ed91f22010-04-29 22:34:07 +0000541 nFrame = nPgsz + WAL_FRAME_HDRSIZE;
dan7c246102010-04-12 19:00:29 +0000542 aFrame = (u8 *)sqlite3_malloc(nFrame);
543 if( !aFrame ){
544 return SQLITE_NOMEM;
545 }
drh7ed91f22010-04-29 22:34:07 +0000546 aData = &aFrame[WAL_FRAME_HDRSIZE];
dan7c246102010-04-12 19:00:29 +0000547
548 /* Read all frames from the log file. */
549 iFrame = 0;
drh7ed91f22010-04-29 22:34:07 +0000550 for(iOffset=WAL_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
dan7c246102010-04-12 19:00:29 +0000551 u32 pgno; /* Database page number for frame */
552 u32 nTruncate; /* dbsize field from frame header */
553 int isValid; /* True if this frame is valid */
554
555 /* Read and decode the next log frame. */
drh7ed91f22010-04-29 22:34:07 +0000556 rc = sqlite3OsRead(pWal->pFd, aFrame, nFrame, iOffset);
dan7c246102010-04-12 19:00:29 +0000557 if( rc!=SQLITE_OK ) break;
drh7ed91f22010-04-29 22:34:07 +0000558 isValid = walDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
dan7c246102010-04-12 19:00:29 +0000559 if( !isValid ) break;
danc7991bd2010-05-05 19:04:59 +0000560 rc = walIndexAppend(pWal, ++iFrame, pgno);
561 if( rc!=SQLITE_OK ) break;
dan7c246102010-04-12 19:00:29 +0000562
563 /* If nTruncate is non-zero, this is a commit record. */
564 if( nTruncate ){
565 hdr.iCheck1 = aCksum[0];
566 hdr.iCheck2 = aCksum[1];
567 hdr.iLastPg = iFrame;
568 hdr.nPage = nTruncate;
569 hdr.pgsz = nPgsz;
570 }
571 }
572
573 sqlite3_free(aFrame);
574 }else{
575 hdr.iCheck1 = 2;
576 hdr.iCheck2 = 3;
577 }
578
579finished:
dan576bc322010-05-06 18:04:50 +0000580 if( rc==SQLITE_OK && hdr.iLastPg==0 ){
581 rc = walIndexRemap(pWal, WALINDEX_MMAP_INCREMENT);
582 }
583 if( rc==SQLITE_OK ){
584 walIndexWriteHdr(pWal, &hdr);
585 memcpy(&pWal->hdr, &hdr, sizeof(hdr));
586 }
dan7c246102010-04-12 19:00:29 +0000587 return rc;
588}
589
drha8e654e2010-05-04 17:38:42 +0000590/*
dan1018e902010-05-05 15:33:05 +0000591** Close an open wal-index.
drha8e654e2010-05-04 17:38:42 +0000592*/
dan1018e902010-05-05 15:33:05 +0000593static void walIndexClose(Wal *pWal, int isDelete){
drha8e654e2010-05-04 17:38:42 +0000594 sqlite3_shm *pWIndex = pWal->pWIndex;
595 if( pWIndex ){
596 sqlite3_vfs *pVfs = pWal->pVfs;
597 int notUsed;
598 pVfs->xShmLock(pVfs, pWIndex, SQLITE_SHM_UNLOCK, &notUsed);
dan1018e902010-05-05 15:33:05 +0000599 pVfs->xShmClose(pVfs, pWIndex, isDelete);
drha8e654e2010-05-04 17:38:42 +0000600 }
601}
602
dan7c246102010-04-12 19:00:29 +0000603/*
604** Open a connection to the log file associated with database zDb. The
605** database file does not actually have to exist. zDb is used only to
606** figure out the name of the log file to open. If the log file does not
607** exist it is created by this call.
dan3de777f2010-04-17 12:31:37 +0000608**
609** A SHARED lock should be held on the database file when this function
610** is called. The purpose of this SHARED lock is to prevent any other
drh7ed91f22010-04-29 22:34:07 +0000611** client from unlinking the log or wal-index file. If another process
dan3de777f2010-04-17 12:31:37 +0000612** were to do this just after this client opened one of these files, the
613** system would be badly broken.
danef378022010-05-04 11:06:03 +0000614**
615** If the log file is successfully opened, SQLITE_OK is returned and
616** *ppWal is set to point to a new WAL handle. If an error occurs,
617** an SQLite error code is returned and *ppWal is left unmodified.
dan7c246102010-04-12 19:00:29 +0000618*/
drhc438efd2010-04-26 00:19:45 +0000619int sqlite3WalOpen(
drh7ed91f22010-04-29 22:34:07 +0000620 sqlite3_vfs *pVfs, /* vfs module to open wal and wal-index */
dan7c246102010-04-12 19:00:29 +0000621 const char *zDb, /* Name of database file */
drh7ed91f22010-04-29 22:34:07 +0000622 Wal **ppWal /* OUT: Allocated Wal handle */
dan7c246102010-04-12 19:00:29 +0000623){
danef378022010-05-04 11:06:03 +0000624 int rc; /* Return Code */
drh7ed91f22010-04-29 22:34:07 +0000625 Wal *pRet; /* Object to allocate and return */
dan7c246102010-04-12 19:00:29 +0000626 int flags; /* Flags passed to OsOpen() */
danef378022010-05-04 11:06:03 +0000627 char *zWal; /* Path to WAL file */
dan7c246102010-04-12 19:00:29 +0000628 int nWal; /* Length of zWal in bytes */
629
dan7c246102010-04-12 19:00:29 +0000630 assert( zDb );
dan87bfb512010-04-30 11:43:28 +0000631 if( pVfs->xShmOpen==0 ) return SQLITE_CANTOPEN_BKPT;
dan7c246102010-04-12 19:00:29 +0000632
drh7ed91f22010-04-29 22:34:07 +0000633 /* Allocate an instance of struct Wal to return. */
634 *ppWal = 0;
635 nWal = strlen(zDb);
drh2d536e12010-05-01 20:17:30 +0000636 pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile + nWal+5);
dan76ed3bc2010-05-03 17:18:24 +0000637 if( !pRet ){
638 return SQLITE_NOMEM;
639 }
640
dan7c246102010-04-12 19:00:29 +0000641 pRet->pVfs = pVfs;
642 pRet->pFd = (sqlite3_file *)&pRet[1];
drh2d536e12010-05-01 20:17:30 +0000643 pRet->zName = zWal = pVfs->szOsFile + (char*)pRet->pFd;
644 sqlite3_snprintf(nWal+5, zWal, "%s-wal", zDb);
drh7ed91f22010-04-29 22:34:07 +0000645 rc = pVfs->xShmOpen(pVfs, zWal, &pRet->pWIndex);
dan7c246102010-04-12 19:00:29 +0000646
drh7ed91f22010-04-29 22:34:07 +0000647 /* Open file handle on the write-ahead log file. */
dan76ed3bc2010-05-03 17:18:24 +0000648 if( rc==SQLITE_OK ){
649 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
650 rc = sqlite3OsOpen(pVfs, zWal, pRet->pFd, flags, &flags);
651 }
dan7c246102010-04-12 19:00:29 +0000652
dan7c246102010-04-12 19:00:29 +0000653 if( rc!=SQLITE_OK ){
dan1018e902010-05-05 15:33:05 +0000654 walIndexClose(pRet, 0);
danef378022010-05-04 11:06:03 +0000655 sqlite3OsClose(pRet->pFd);
656 sqlite3_free(pRet);
657 }else{
658 *ppWal = pRet;
dan7c246102010-04-12 19:00:29 +0000659 }
dan7c246102010-04-12 19:00:29 +0000660 return rc;
661}
662
drh7ed91f22010-04-29 22:34:07 +0000663static int walIteratorNext(
664 WalIterator *p, /* Iterator */
665 u32 *piPage, /* OUT: Next db page to write */
666 u32 *piFrame /* OUT: Wal frame to read from */
dan7c246102010-04-12 19:00:29 +0000667){
668 u32 iMin = *piPage;
669 u32 iRet = 0xFFFFFFFF;
670 int i;
671 int nBlock = p->nFinal;
672
673 for(i=p->nSegment-1; i>=0; i--){
drh7ed91f22010-04-29 22:34:07 +0000674 struct WalSegment *pSegment = &p->aSegment[i];
dan7c246102010-04-12 19:00:29 +0000675 while( pSegment->iNext<nBlock ){
676 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
677 if( iPg>iMin ){
678 if( iPg<iRet ){
679 iRet = iPg;
680 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
681 }
682 break;
683 }
684 pSegment->iNext++;
685 }
686
687 nBlock = 256;
688 }
689
690 *piPage = iRet;
691 return (iRet==0xFFFFFFFF);
692}
693
dan8f6097c2010-05-06 07:43:58 +0000694static int walIteratorInit(Wal *pWal, WalIterator **pp){
drh7ed91f22010-04-29 22:34:07 +0000695 u32 *aData; /* Content of the wal-index file */
696 WalIterator *p; /* Return value */
dan7c246102010-04-12 19:00:29 +0000697 int nSegment; /* Number of segments to merge */
698 u32 iLast; /* Last frame in log */
699 int nByte; /* Number of bytes to allocate */
700 int i; /* Iterator variable */
701 int nFinal; /* Number of unindexed entries */
drh7ed91f22010-04-29 22:34:07 +0000702 struct WalSegment *pFinal; /* Final (unindexed) segment */
dan7c246102010-04-12 19:00:29 +0000703 u8 *aTmp; /* Temp space used by merge-sort */
dan8f6097c2010-05-06 07:43:58 +0000704 int rc; /* Return code of walIndexMap() */
dan7c246102010-04-12 19:00:29 +0000705
dan576bc322010-05-06 18:04:50 +0000706 rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
dan8f6097c2010-05-06 07:43:58 +0000707 if( rc!=SQLITE_OK ){
708 return rc;
709 }
drh7ed91f22010-04-29 22:34:07 +0000710 aData = pWal->pWiData;
711 iLast = pWal->hdr.iLastPg;
dan7c246102010-04-12 19:00:29 +0000712 nSegment = (iLast >> 8) + 1;
713 nFinal = (iLast & 0x000000FF);
714
drh7ed91f22010-04-29 22:34:07 +0000715 nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512;
716 p = (WalIterator *)sqlite3_malloc(nByte);
dan8f6097c2010-05-06 07:43:58 +0000717 if( !p ){
dan9a6b4e92010-05-06 11:32:09 +0000718 rc = SQLITE_NOMEM;
719 }else{
dan7c246102010-04-12 19:00:29 +0000720 memset(p, 0, nByte);
721 p->nSegment = nSegment;
dan76ed3bc2010-05-03 17:18:24 +0000722
723 for(i=0; i<nSegment-1; i++){
724 p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
725 p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256];
726 }
727 pFinal = &p->aSegment[nSegment-1];
728
729 pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)];
730 pFinal->aIndex = (u8 *)&pFinal[1];
731 aTmp = &pFinal->aIndex[256];
732 for(i=0; i<nFinal; i++){
733 pFinal->aIndex[i] = i;
734 }
735 walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
dan7c246102010-04-12 19:00:29 +0000736 p->nFinal = nFinal;
737 }
738
dan8f6097c2010-05-06 07:43:58 +0000739 *pp = p;
dan9a6b4e92010-05-06 11:32:09 +0000740 return rc;
dan7c246102010-04-12 19:00:29 +0000741}
742
743/*
drh7ed91f22010-04-29 22:34:07 +0000744** Free a log iterator allocated by walIteratorInit().
dan7c246102010-04-12 19:00:29 +0000745*/
drh7ed91f22010-04-29 22:34:07 +0000746static void walIteratorFree(WalIterator *p){
dan7c246102010-04-12 19:00:29 +0000747 sqlite3_free(p);
748}
749
750/*
751** Checkpoint the contents of the log file.
752*/
drh7ed91f22010-04-29 22:34:07 +0000753static int walCheckpoint(
754 Wal *pWal, /* Wal connection */
dan7c246102010-04-12 19:00:29 +0000755 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +0000756 int sync_flags, /* Flags for OsSync() (or 0) */
danb6e099a2010-05-04 14:47:39 +0000757 int nBuf, /* Size of zBuf in bytes */
dan7c246102010-04-12 19:00:29 +0000758 u8 *zBuf /* Temporary buffer to use */
759){
760 int rc; /* Return code */
drh7ed91f22010-04-29 22:34:07 +0000761 int pgsz = pWal->hdr.pgsz; /* Database page-size */
762 WalIterator *pIter = 0; /* Wal iterator context */
dan7c246102010-04-12 19:00:29 +0000763 u32 iDbpage = 0; /* Next database page to write */
drh7ed91f22010-04-29 22:34:07 +0000764 u32 iFrame = 0; /* Wal frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +0000765
766 /* Allocate the iterator */
dan8f6097c2010-05-06 07:43:58 +0000767 rc = walIteratorInit(pWal, &pIter);
768 if( rc!=SQLITE_OK || pWal->hdr.iLastPg==0 ){
danb6e099a2010-05-04 14:47:39 +0000769 goto out;
770 }
771
772 if( pWal->hdr.pgsz!=nBuf ){
773 rc = SQLITE_CORRUPT_BKPT;
774 goto out;
775 }
776
dan7c246102010-04-12 19:00:29 +0000777 /* Sync the log file to disk */
danc5118782010-04-17 17:34:41 +0000778 if( sync_flags ){
drh7ed91f22010-04-29 22:34:07 +0000779 rc = sqlite3OsSync(pWal->pFd, sync_flags);
danc5118782010-04-17 17:34:41 +0000780 if( rc!=SQLITE_OK ) goto out;
781 }
dan7c246102010-04-12 19:00:29 +0000782
783 /* Iterate through the contents of the log, copying data to the db file. */
drh7ed91f22010-04-29 22:34:07 +0000784 while( 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
785 rc = sqlite3OsRead(pWal->pFd, zBuf, pgsz,
786 walFrameOffset(iFrame, pgsz) + WAL_FRAME_HDRSIZE
dan7c246102010-04-12 19:00:29 +0000787 );
788 if( rc!=SQLITE_OK ) goto out;
789 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
790 if( rc!=SQLITE_OK ) goto out;
791 }
792
793 /* Truncate the database file */
drh7ed91f22010-04-29 22:34:07 +0000794 rc = sqlite3OsTruncate(pFd, ((i64)pWal->hdr.nPage*(i64)pgsz));
dan7c246102010-04-12 19:00:29 +0000795 if( rc!=SQLITE_OK ) goto out;
796
drh7ed91f22010-04-29 22:34:07 +0000797 /* Sync the database file. If successful, update the wal-index. */
danc5118782010-04-17 17:34:41 +0000798 if( sync_flags ){
799 rc = sqlite3OsSync(pFd, sync_flags);
800 if( rc!=SQLITE_OK ) goto out;
801 }
drh7ed91f22010-04-29 22:34:07 +0000802 pWal->hdr.iLastPg = 0;
803 pWal->hdr.iCheck1 = 2;
804 pWal->hdr.iCheck2 = 3;
805 walIndexWriteHdr(pWal, &pWal->hdr);
dan7c246102010-04-12 19:00:29 +0000806
807 /* TODO: If a crash occurs and the current log is copied into the
808 ** database there is no problem. However, if a crash occurs while
809 ** writing the next transaction into the start of the log, such that:
810 **
811 ** * The first transaction currently in the log is left intact, but
812 ** * The second (or subsequent) transaction is damaged,
813 **
814 ** then the database could become corrupt.
815 **
816 ** The easiest thing to do would be to write and sync a dummy header
817 ** into the log at this point. Unfortunately, that turns out to be
818 ** an unwelcome performance hit. Alternatives are...
819 */
820#if 0
drh7ed91f22010-04-29 22:34:07 +0000821 memset(zBuf, 0, WAL_FRAME_HDRSIZE);
822 rc = sqlite3OsWrite(pWal->pFd, zBuf, WAL_FRAME_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000823 if( rc!=SQLITE_OK ) goto out;
drh7ed91f22010-04-29 22:34:07 +0000824 rc = sqlite3OsSync(pWal->pFd, pWal->sync_flags);
dan7c246102010-04-12 19:00:29 +0000825#endif
826
827 out:
drh7ed91f22010-04-29 22:34:07 +0000828 walIteratorFree(pIter);
dan7c246102010-04-12 19:00:29 +0000829 return rc;
830}
831
832/*
833** Close a connection to a log file.
834*/
drhc438efd2010-04-26 00:19:45 +0000835int sqlite3WalClose(
drh7ed91f22010-04-29 22:34:07 +0000836 Wal *pWal, /* Wal to close */
dan7c246102010-04-12 19:00:29 +0000837 sqlite3_file *pFd, /* Database file */
danc5118782010-04-17 17:34:41 +0000838 int sync_flags, /* Flags to pass to OsSync() (or 0) */
danb6e099a2010-05-04 14:47:39 +0000839 int nBuf,
840 u8 *zBuf /* Buffer of at least nBuf bytes */
dan7c246102010-04-12 19:00:29 +0000841){
842 int rc = SQLITE_OK;
drh7ed91f22010-04-29 22:34:07 +0000843 if( pWal ){
dan30c86292010-04-30 16:24:46 +0000844 int isDelete = 0; /* True to unlink wal and wal-index files */
845
846 /* If an EXCLUSIVE lock can be obtained on the database file (using the
847 ** ordinary, rollback-mode locking methods, this guarantees that the
848 ** connection associated with this log file is the only connection to
849 ** the database. In this case checkpoint the database and unlink both
850 ** the wal and wal-index files.
851 **
852 ** The EXCLUSIVE lock is not released before returning.
853 */
854 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
855 if( rc==SQLITE_OK ){
danb7d53f52010-05-06 17:28:08 +0000856 rc = sqlite3WalCheckpoint(pWal, pFd, sync_flags, nBuf, zBuf, 0, 0);
dan30c86292010-04-30 16:24:46 +0000857 if( rc==SQLITE_OK ){
858 isDelete = 1;
859 }
860 walIndexUnmap(pWal);
861 }
862
dan1018e902010-05-05 15:33:05 +0000863 walIndexClose(pWal, isDelete);
drh7ed91f22010-04-29 22:34:07 +0000864 sqlite3OsClose(pWal->pFd);
dan30c86292010-04-30 16:24:46 +0000865 if( isDelete ){
drh2d536e12010-05-01 20:17:30 +0000866 sqlite3OsDelete(pWal->pVfs, pWal->zName, 0);
dan30c86292010-04-30 16:24:46 +0000867 }
drh7ed91f22010-04-29 22:34:07 +0000868 sqlite3_free(pWal);
dan7c246102010-04-12 19:00:29 +0000869 }
870 return rc;
871}
872
873/*
drh7ed91f22010-04-29 22:34:07 +0000874** Try to read the wal-index header. Attempt to verify the header
875** checksum. If the checksum can be verified, copy the wal-index
876** header into structure pWal->hdr. If the contents of pWal->hdr are
danb9bf16b2010-04-14 11:23:30 +0000877** modified by this and pChanged is not NULL, set *pChanged to 1.
878** Otherwise leave *pChanged unmodified.
879**
880** If the checksum cannot be verified return SQLITE_ERROR.
881*/
danc7991bd2010-05-05 19:04:59 +0000882int walIndexTryHdr(Wal *pWal, int *pisValid, int *pChanged){
danb9bf16b2010-04-14 11:23:30 +0000883 u32 aCksum[2] = {1, 1};
drh7ed91f22010-04-29 22:34:07 +0000884 u32 aHdr[WALINDEX_HDR_NFIELD+2];
danb9bf16b2010-04-14 11:23:30 +0000885
dan576bc322010-05-06 18:04:50 +0000886 assert( *pisValid==0 );
drh79e6c782010-04-30 02:13:26 +0000887 if( pWal->szWIndex==0 ){
dan576bc322010-05-06 18:04:50 +0000888 return SQLITE_OK;
drh79e6c782010-04-30 02:13:26 +0000889 }
890
drh7ed91f22010-04-29 22:34:07 +0000891 /* Read the header. The caller may or may not have locked the wal-index
dancd11fb22010-04-26 10:40:52 +0000892 ** file, meaning it is possible that an inconsistent snapshot is read
dana8614692010-05-06 14:42:34 +0000893 ** from the file. If this happens, return SQLITE_ERROR.
danb9bf16b2010-04-14 11:23:30 +0000894 */
drh7ed91f22010-04-29 22:34:07 +0000895 memcpy(aHdr, pWal->pWiData, sizeof(aHdr));
896 walChecksumBytes((u8*)aHdr, sizeof(u32)*WALINDEX_HDR_NFIELD, aCksum);
897 if( aCksum[0]!=aHdr[WALINDEX_HDR_NFIELD]
898 || aCksum[1]!=aHdr[WALINDEX_HDR_NFIELD+1]
danb9bf16b2010-04-14 11:23:30 +0000899 ){
danc7991bd2010-05-05 19:04:59 +0000900 return SQLITE_OK;
danb9bf16b2010-04-14 11:23:30 +0000901 }
danc7991bd2010-05-05 19:04:59 +0000902 *pisValid = 1;
danb9bf16b2010-04-14 11:23:30 +0000903
drh7ed91f22010-04-29 22:34:07 +0000904 if( memcmp(&pWal->hdr, aHdr, sizeof(WalIndexHdr)) ){
dana8614692010-05-06 14:42:34 +0000905 *pChanged = 1;
drh7ed91f22010-04-29 22:34:07 +0000906 memcpy(&pWal->hdr, aHdr, sizeof(WalIndexHdr));
danb9bf16b2010-04-14 11:23:30 +0000907 }
908 return SQLITE_OK;
909}
910
911/*
drh7ed91f22010-04-29 22:34:07 +0000912** Read the wal-index header from the wal-index file into structure
913** pWal->hdr. If attempting to verify the header checksum fails, try
danb9bf16b2010-04-14 11:23:30 +0000914** to recover the log before returning.
915**
drh7ed91f22010-04-29 22:34:07 +0000916** If the wal-index header is successfully read, return SQLITE_OK.
danb9bf16b2010-04-14 11:23:30 +0000917** Otherwise an SQLite error code.
918*/
drh7ed91f22010-04-29 22:34:07 +0000919static int walIndexReadHdr(Wal *pWal, int *pChanged){
danb9bf16b2010-04-14 11:23:30 +0000920 int rc;
danc7991bd2010-05-05 19:04:59 +0000921 int isValid = 0;
dan5273f582010-05-06 18:27:19 +0000922 int lockState;
danb9bf16b2010-04-14 11:23:30 +0000923
dan4c97b532010-04-30 09:52:17 +0000924 assert( pWal->lockState>=SQLITE_SHM_READ );
dana8614692010-05-06 14:42:34 +0000925 assert( pChanged );
danc7991bd2010-05-05 19:04:59 +0000926 rc = walIndexMap(pWal, -1);
927 if( rc!=SQLITE_OK ){
928 return rc;
929 }
drh7ed91f22010-04-29 22:34:07 +0000930
danb9bf16b2010-04-14 11:23:30 +0000931 /* First try to read the header without a lock. Verify the checksum
932 ** before returning. This will almost always work.
933 */
danc7991bd2010-05-05 19:04:59 +0000934 rc = walIndexTryHdr(pWal, &isValid, pChanged);
935 if( isValid || rc!=SQLITE_OK ){
936 return rc;
danb9bf16b2010-04-14 11:23:30 +0000937 }
938
drh7ed91f22010-04-29 22:34:07 +0000939 /* If the first attempt to read the header failed, lock the wal-index
danb9bf16b2010-04-14 11:23:30 +0000940 ** file and try again. If the header checksum verification fails this
941 ** time as well, run log recovery.
942 */
dan5273f582010-05-06 18:27:19 +0000943 lockState = pWal->lockState;
drh7ed91f22010-04-29 22:34:07 +0000944 if( SQLITE_OK==(rc = walSetLock(pWal, SQLITE_SHM_RECOVER)) ){
dana8614692010-05-06 14:42:34 +0000945 /* This call to walIndexTryHdr() may not return an error code, as the
946 ** wal-index is already mapped. It may find that the header is invalid,
947 ** but there is no chance of hitting an actual error. */
dan576bc322010-05-06 18:04:50 +0000948 assert( pWal->pWiData );
danc7991bd2010-05-05 19:04:59 +0000949 rc = walIndexTryHdr(pWal, &isValid, pChanged);
dana8614692010-05-06 14:42:34 +0000950 assert( rc==SQLITE_OK );
951 if( isValid==0 ){
952 *pChanged = 1;
drh7ed91f22010-04-29 22:34:07 +0000953 rc = walIndexRecover(pWal);
danb9bf16b2010-04-14 11:23:30 +0000954 }
dan5273f582010-05-06 18:27:19 +0000955 walSetLock(pWal, lockState);
danb9bf16b2010-04-14 11:23:30 +0000956 }
957
958 return rc;
959}
960
961/*
dan64d039e2010-04-13 19:27:31 +0000962** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +0000963**
964** If this call obtains a new read-lock and the database contents have been
drh7ed91f22010-04-29 22:34:07 +0000965** modified since the most recent call to WalCloseSnapshot() on this Wal
dan7c246102010-04-12 19:00:29 +0000966** connection, then *pChanged is set to 1 before returning. Otherwise, it
967** is left unmodified. This is used by the pager layer to determine whether
968** or not any cached pages may be safely reused.
969*/
drh7ed91f22010-04-29 22:34:07 +0000970int sqlite3WalOpenSnapshot(Wal *pWal, int *pChanged){
dan8d6ad1c2010-05-04 10:36:20 +0000971 int rc; /* Return code */
dan64d039e2010-04-13 19:27:31 +0000972
drh7ed91f22010-04-29 22:34:07 +0000973 rc = walSetLock(pWal, SQLITE_SHM_READ);
dan8d6ad1c2010-05-04 10:36:20 +0000974 assert( rc!=SQLITE_OK || pWal->lockState==SQLITE_SHM_READ );
dan64d039e2010-04-13 19:27:31 +0000975
dan8d6ad1c2010-05-04 10:36:20 +0000976 if( rc==SQLITE_OK ){
drh7ed91f22010-04-29 22:34:07 +0000977 rc = walIndexReadHdr(pWal, pChanged);
dan64d039e2010-04-13 19:27:31 +0000978 if( rc!=SQLITE_OK ){
979 /* An error occured while attempting log recovery. */
drh7ed91f22010-04-29 22:34:07 +0000980 sqlite3WalCloseSnapshot(pWal);
dan64d039e2010-04-13 19:27:31 +0000981 }
dan7c246102010-04-12 19:00:29 +0000982 }
danba515902010-04-30 09:32:06 +0000983
984 walIndexUnmap(pWal);
dan7c246102010-04-12 19:00:29 +0000985 return rc;
986}
987
988/*
989** Unlock the current snapshot.
990*/
drh7ed91f22010-04-29 22:34:07 +0000991void sqlite3WalCloseSnapshot(Wal *pWal){
dan8d6ad1c2010-05-04 10:36:20 +0000992 assert( pWal->lockState==SQLITE_SHM_READ
993 || pWal->lockState==SQLITE_SHM_UNLOCK
994 );
995 walSetLock(pWal, SQLITE_SHM_UNLOCK);
dan7c246102010-04-12 19:00:29 +0000996}
997
dan5e0ce872010-04-28 17:48:44 +0000998/*
dan7c246102010-04-12 19:00:29 +0000999** Read a page from the log, if it is present.
1000*/
danb6e099a2010-05-04 14:47:39 +00001001int sqlite3WalRead(
1002 Wal *pWal,
1003 Pgno pgno,
1004 int *pInWal,
1005 int nOut,
1006 u8 *pOut
1007){
danc7991bd2010-05-05 19:04:59 +00001008 int rc; /* Return code */
dan7c246102010-04-12 19:00:29 +00001009 u32 iRead = 0;
dancd11fb22010-04-26 10:40:52 +00001010 u32 *aData;
drh7ed91f22010-04-29 22:34:07 +00001011 int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00);
dan7c246102010-04-12 19:00:29 +00001012
dan1bc61712010-04-30 10:24:54 +00001013 assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
dan576bc322010-05-06 18:04:50 +00001014 rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
danc7991bd2010-05-05 19:04:59 +00001015 if( rc!=SQLITE_OK ){
1016 return rc;
1017 }
dancd11fb22010-04-26 10:40:52 +00001018
dan7c246102010-04-12 19:00:29 +00001019 /* Do a linear search of the unindexed block of page-numbers (if any)
drh7ed91f22010-04-29 22:34:07 +00001020 ** at the end of the wal-index. An alternative to this would be to
dan7c246102010-04-12 19:00:29 +00001021 ** build an index in private memory each time a read transaction is
1022 ** opened on a new snapshot.
1023 */
drh7ed91f22010-04-29 22:34:07 +00001024 aData = pWal->pWiData;
1025 if( pWal->hdr.iLastPg ){
1026 u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)];
1027 u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF);
dan7c246102010-04-12 19:00:29 +00001028 while( *pi!=pgno && pi!=piStop ) pi--;
1029 if( pi!=piStop ){
1030 iRead = (pi-piStop) + iFrame;
1031 }
1032 }
drh7ed91f22010-04-29 22:34:07 +00001033 assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
dan7c246102010-04-12 19:00:29 +00001034
1035 while( iRead==0 && iFrame>0 ){
1036 int iLow = 0;
1037 int iHigh = 255;
1038 u32 *aFrame;
1039 u8 *aIndex;
1040
1041 iFrame -= 256;
drh7ed91f22010-04-29 22:34:07 +00001042 aFrame = &aData[walIndexEntry(iFrame+1)];
dan7c246102010-04-12 19:00:29 +00001043 aIndex = (u8 *)&aFrame[256];
1044
1045 while( iLow<=iHigh ){
1046 int iTest = (iLow+iHigh)>>1;
1047 u32 iPg = aFrame[aIndex[iTest]];
1048
1049 if( iPg==pgno ){
1050 iRead = iFrame + 1 + aIndex[iTest];
1051 break;
1052 }
1053 else if( iPg<pgno ){
1054 iLow = iTest+1;
1055 }else{
1056 iHigh = iTest-1;
1057 }
1058 }
1059 }
drh7ed91f22010-04-29 22:34:07 +00001060 assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
1061 walIndexUnmap(pWal);
dancd11fb22010-04-26 10:40:52 +00001062
dan7c246102010-04-12 19:00:29 +00001063 /* If iRead is non-zero, then it is the log frame number that contains the
1064 ** required page. Read and return data from the log file.
1065 */
1066 if( iRead ){
drh7ed91f22010-04-29 22:34:07 +00001067 i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
1068 *pInWal = 1;
danb6e099a2010-05-04 14:47:39 +00001069 return sqlite3OsRead(pWal->pFd, pOut, nOut, iOffset);
dan7c246102010-04-12 19:00:29 +00001070 }
1071
drh7ed91f22010-04-29 22:34:07 +00001072 *pInWal = 0;
dan7c246102010-04-12 19:00:29 +00001073 return SQLITE_OK;
1074}
1075
1076
1077/*
1078** Set *pPgno to the size of the database file (or zero, if unknown).
1079*/
drh7ed91f22010-04-29 22:34:07 +00001080void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
1081 assert( pWal->lockState==SQLITE_SHM_READ
1082 || pWal->lockState==SQLITE_SHM_WRITE );
1083 *pPgno = pWal->hdr.nPage;
dan7c246102010-04-12 19:00:29 +00001084}
1085
1086/*
dan7c246102010-04-12 19:00:29 +00001087** This function returns SQLITE_OK if the caller may write to the database.
1088** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001089** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001090*/
drh7ed91f22010-04-29 22:34:07 +00001091int sqlite3WalWriteLock(Wal *pWal, int op){
1092 int rc;
dan7c246102010-04-12 19:00:29 +00001093 if( op ){
drh7ed91f22010-04-29 22:34:07 +00001094 assert( pWal->lockState == SQLITE_SHM_READ );
1095 rc = walSetLock(pWal, SQLITE_SHM_WRITE);
dan30c86292010-04-30 16:24:46 +00001096
1097 /* If this connection is not reading the most recent database snapshot,
1098 ** it is not possible to write to the database. In this case release
1099 ** the write locks and return SQLITE_BUSY.
1100 */
1101 if( rc==SQLITE_OK ){
dan576bc322010-05-06 18:04:50 +00001102 rc = walIndexMap(pWal, sizeof(WalIndexHdr));
dan30c86292010-04-30 16:24:46 +00001103 if( rc==SQLITE_OK
1104 && memcmp(&pWal->hdr, pWal->pWiData, sizeof(WalIndexHdr))
1105 ){
1106 rc = SQLITE_BUSY;
1107 }
1108 walIndexUnmap(pWal);
1109 if( rc!=SQLITE_OK ){
1110 walSetLock(pWal, SQLITE_SHM_READ);
1111 }
1112 }
drh7ed91f22010-04-29 22:34:07 +00001113 }else if( pWal->lockState==SQLITE_SHM_WRITE ){
1114 rc = walSetLock(pWal, SQLITE_SHM_READ);
dan7c246102010-04-12 19:00:29 +00001115 }
drh7ed91f22010-04-29 22:34:07 +00001116 return rc;
dan7c246102010-04-12 19:00:29 +00001117}
1118
dan74d6cd82010-04-24 18:44:05 +00001119/*
dan74d6cd82010-04-24 18:44:05 +00001120** If any data has been written (but not committed) to the log file, this
1121** function moves the write-pointer back to the start of the transaction.
1122**
1123** Additionally, the callback function is invoked for each frame written
1124** to the log since the start of the transaction. If the callback returns
1125** other than SQLITE_OK, it is not invoked again and the error code is
1126** returned to the caller.
1127**
1128** Otherwise, if the callback function does not return an error, this
1129** function returns SQLITE_OK.
1130*/
drh7ed91f22010-04-29 22:34:07 +00001131int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
dana8614692010-05-06 14:42:34 +00001132 int unused;
dand41a29a2010-05-06 15:56:28 +00001133 int rc;
drh7ed91f22010-04-29 22:34:07 +00001134 Pgno iMax = pWal->hdr.iLastPg;
dan74d6cd82010-04-24 18:44:05 +00001135 Pgno iFrame;
1136
dand41a29a2010-05-06 15:56:28 +00001137 assert( pWal->pWiData==0 );
dana8614692010-05-06 14:42:34 +00001138 rc = walIndexReadHdr(pWal, &unused);
dand41a29a2010-05-06 15:56:28 +00001139 for(iFrame=pWal->hdr.iLastPg+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
drhcd058ec2010-05-04 17:20:09 +00001140 assert( pWal->lockState==SQLITE_SHM_WRITE );
drh7ed91f22010-04-29 22:34:07 +00001141 rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
dan74d6cd82010-04-24 18:44:05 +00001142 }
drh7ed91f22010-04-29 22:34:07 +00001143 walIndexUnmap(pWal);
dan74d6cd82010-04-24 18:44:05 +00001144 return rc;
1145}
1146
drh7ed91f22010-04-29 22:34:07 +00001147/* Return an integer that records the current (uncommitted) write
1148** position in the WAL
1149*/
1150u32 sqlite3WalSavepoint(Wal *pWal){
1151 assert( pWal->lockState==SQLITE_SHM_WRITE );
1152 return pWal->hdr.iLastPg;
dan4cd78b42010-04-26 16:57:10 +00001153}
1154
drh7ed91f22010-04-29 22:34:07 +00001155/* Move the write position of the WAL back to iFrame. Called in
1156** response to a ROLLBACK TO command.
1157*/
1158int sqlite3WalSavepointUndo(Wal *pWal, u32 iFrame){
dan4cd78b42010-04-26 16:57:10 +00001159 int rc = SQLITE_OK;
1160 u8 aCksum[8];
drh7ed91f22010-04-29 22:34:07 +00001161 assert( pWal->lockState==SQLITE_SHM_WRITE );
dan4cd78b42010-04-26 16:57:10 +00001162
drh7ed91f22010-04-29 22:34:07 +00001163 pWal->hdr.iLastPg = iFrame;
dan4cd78b42010-04-26 16:57:10 +00001164 if( iFrame>0 ){
drh7ed91f22010-04-29 22:34:07 +00001165 i64 iOffset = walFrameOffset(iFrame, pWal->hdr.pgsz) + sizeof(u32)*2;
1166 rc = sqlite3OsRead(pWal->pFd, aCksum, sizeof(aCksum), iOffset);
1167 pWal->hdr.iCheck1 = sqlite3Get4byte(&aCksum[0]);
1168 pWal->hdr.iCheck2 = sqlite3Get4byte(&aCksum[4]);
dan4cd78b42010-04-26 16:57:10 +00001169 }
1170
1171 return rc;
1172}
1173
dan7c246102010-04-12 19:00:29 +00001174/*
dan4cd78b42010-04-26 16:57:10 +00001175** Write a set of frames to the log. The caller must hold the write-lock
1176** on the log file (obtained using sqlite3WalWriteLock()).
dan7c246102010-04-12 19:00:29 +00001177*/
drhc438efd2010-04-26 00:19:45 +00001178int sqlite3WalFrames(
drh7ed91f22010-04-29 22:34:07 +00001179 Wal *pWal, /* Wal handle to write to */
dan7c246102010-04-12 19:00:29 +00001180 int nPgsz, /* Database page-size in bytes */
1181 PgHdr *pList, /* List of dirty pages to write */
1182 Pgno nTruncate, /* Database size after this commit */
1183 int isCommit, /* True if this is a commit */
danc5118782010-04-17 17:34:41 +00001184 int sync_flags /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001185){
dan7c246102010-04-12 19:00:29 +00001186 int rc; /* Used to catch return codes */
1187 u32 iFrame; /* Next frame address */
drh7ed91f22010-04-29 22:34:07 +00001188 u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
dan7c246102010-04-12 19:00:29 +00001189 PgHdr *p; /* Iterator to run through pList with. */
dan97a31352010-04-16 13:59:31 +00001190 u32 aCksum[2]; /* Checksums */
dan7c246102010-04-12 19:00:29 +00001191 PgHdr *pLast; /* Last frame in list */
1192 int nLast = 0; /* Number of extra copies of last page */
1193
drh7ed91f22010-04-29 22:34:07 +00001194 assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
dan7c246102010-04-12 19:00:29 +00001195 assert( pList );
drh7ed91f22010-04-29 22:34:07 +00001196 assert( pWal->lockState==SQLITE_SHM_WRITE );
danba515902010-04-30 09:32:06 +00001197 assert( pWal->pWiData==0 );
dan7c246102010-04-12 19:00:29 +00001198
dan97a31352010-04-16 13:59:31 +00001199 /* If this is the first frame written into the log, write the log
1200 ** header to the start of the log file. See comments at the top of
1201 ** this file for a description of the log-header format.
1202 */
drh7ed91f22010-04-29 22:34:07 +00001203 assert( WAL_FRAME_HDRSIZE>=WAL_HDRSIZE );
1204 iFrame = pWal->hdr.iLastPg;
dan97a31352010-04-16 13:59:31 +00001205 if( iFrame==0 ){
1206 sqlite3Put4byte(aFrame, nPgsz);
1207 sqlite3_randomness(8, &aFrame[4]);
drh7ed91f22010-04-29 22:34:07 +00001208 pWal->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
1209 pWal->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
1210 rc = sqlite3OsWrite(pWal->pFd, aFrame, WAL_HDRSIZE, 0);
dan97a31352010-04-16 13:59:31 +00001211 if( rc!=SQLITE_OK ){
1212 return rc;
1213 }
1214 }
1215
drh7ed91f22010-04-29 22:34:07 +00001216 aCksum[0] = pWal->hdr.iCheck1;
1217 aCksum[1] = pWal->hdr.iCheck2;
dan7c246102010-04-12 19:00:29 +00001218
1219 /* Write the log file. */
dan7c246102010-04-12 19:00:29 +00001220 for(p=pList; p; p=p->pDirty){
1221 u32 nDbsize; /* Db-size field for frame header */
1222 i64 iOffset; /* Write offset in log file */
1223
drh7ed91f22010-04-29 22:34:07 +00001224 iOffset = walFrameOffset(++iFrame, nPgsz);
dan7c246102010-04-12 19:00:29 +00001225
1226 /* Populate and write the frame header */
1227 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
drh7ed91f22010-04-29 22:34:07 +00001228 walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1229 rc = sqlite3OsWrite(pWal->pFd, aFrame, sizeof(aFrame), iOffset);
dan7c246102010-04-12 19:00:29 +00001230 if( rc!=SQLITE_OK ){
1231 return rc;
1232 }
1233
1234 /* Write the page data */
drh7ed91f22010-04-29 22:34:07 +00001235 rc = sqlite3OsWrite(pWal->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
dan7c246102010-04-12 19:00:29 +00001236 if( rc!=SQLITE_OK ){
1237 return rc;
1238 }
1239 pLast = p;
1240 }
1241
1242 /* Sync the log file if the 'isSync' flag was specified. */
danc5118782010-04-17 17:34:41 +00001243 if( sync_flags ){
drh7ed91f22010-04-29 22:34:07 +00001244 i64 iSegment = sqlite3OsSectorSize(pWal->pFd);
1245 i64 iOffset = walFrameOffset(iFrame+1, nPgsz);
dan67032392010-04-17 15:42:43 +00001246
1247 assert( isCommit );
dan7c246102010-04-12 19:00:29 +00001248
1249 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1250 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1251 }
1252 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1253 while( iOffset<iSegment ){
drh7ed91f22010-04-29 22:34:07 +00001254 walEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1255 rc = sqlite3OsWrite(pWal->pFd, aFrame, sizeof(aFrame), iOffset);
dan7c246102010-04-12 19:00:29 +00001256 if( rc!=SQLITE_OK ){
1257 return rc;
1258 }
1259
drh7ed91f22010-04-29 22:34:07 +00001260 iOffset += WAL_FRAME_HDRSIZE;
1261 rc = sqlite3OsWrite(pWal->pFd, pLast->pData, nPgsz, iOffset);
dan7c246102010-04-12 19:00:29 +00001262 if( rc!=SQLITE_OK ){
1263 return rc;
1264 }
1265 nLast++;
1266 iOffset += nPgsz;
1267 }
dan7c246102010-04-12 19:00:29 +00001268
drh7ed91f22010-04-29 22:34:07 +00001269 rc = sqlite3OsSync(pWal->pFd, sync_flags);
dan7c246102010-04-12 19:00:29 +00001270 }
danba515902010-04-30 09:32:06 +00001271 assert( pWal->pWiData==0 );
dan7c246102010-04-12 19:00:29 +00001272
1273 /* Append data to the log summary. It is not necessary to lock the
drh7ed91f22010-04-29 22:34:07 +00001274 ** wal-index to do this as the RESERVED lock held on the db file
dan7c246102010-04-12 19:00:29 +00001275 ** guarantees that there are no other writers, and no data that may
1276 ** be in use by existing readers is being overwritten.
1277 */
drh7ed91f22010-04-29 22:34:07 +00001278 iFrame = pWal->hdr.iLastPg;
danc7991bd2010-05-05 19:04:59 +00001279 for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
dan7c246102010-04-12 19:00:29 +00001280 iFrame++;
danc7991bd2010-05-05 19:04:59 +00001281 rc = walIndexAppend(pWal, iFrame, p->pgno);
dan7c246102010-04-12 19:00:29 +00001282 }
danc7991bd2010-05-05 19:04:59 +00001283 while( nLast>0 && rc==SQLITE_OK ){
dan7c246102010-04-12 19:00:29 +00001284 iFrame++;
1285 nLast--;
danc7991bd2010-05-05 19:04:59 +00001286 rc = walIndexAppend(pWal, iFrame, pLast->pgno);
dan7c246102010-04-12 19:00:29 +00001287 }
1288
danc7991bd2010-05-05 19:04:59 +00001289 if( rc==SQLITE_OK ){
1290 /* Update the private copy of the header. */
1291 pWal->hdr.pgsz = nPgsz;
1292 pWal->hdr.iLastPg = iFrame;
1293 if( isCommit ){
1294 pWal->hdr.iChange++;
1295 pWal->hdr.nPage = nTruncate;
1296 }
1297 pWal->hdr.iCheck1 = aCksum[0];
1298 pWal->hdr.iCheck2 = aCksum[1];
dan7c246102010-04-12 19:00:29 +00001299
danc7991bd2010-05-05 19:04:59 +00001300 /* If this is a commit, update the wal-index header too. */
1301 if( isCommit ){
1302 walIndexWriteHdr(pWal, &pWal->hdr);
1303 pWal->iCallback = iFrame;
1304 }
dan7c246102010-04-12 19:00:29 +00001305 }
danc7991bd2010-05-05 19:04:59 +00001306
drh7ed91f22010-04-29 22:34:07 +00001307 walIndexUnmap(pWal);
dan8d22a172010-04-19 18:03:51 +00001308 return rc;
dan7c246102010-04-12 19:00:29 +00001309}
1310
1311/*
danb9bf16b2010-04-14 11:23:30 +00001312** Checkpoint the database:
1313**
drh7ed91f22010-04-29 22:34:07 +00001314** 1. Acquire a CHECKPOINT lock
1315** 2. Copy the contents of the log into the database file.
1316** 3. Zero the wal-index header (so new readers will ignore the log).
1317** 4. Drop the CHECKPOINT lock.
dan7c246102010-04-12 19:00:29 +00001318*/
drhc438efd2010-04-26 00:19:45 +00001319int sqlite3WalCheckpoint(
drh7ed91f22010-04-29 22:34:07 +00001320 Wal *pWal, /* Wal connection */
dan7c246102010-04-12 19:00:29 +00001321 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001322 int sync_flags, /* Flags to sync db file with (or 0) */
danb6e099a2010-05-04 14:47:39 +00001323 int nBuf, /* Size of temporary buffer */
dan64d039e2010-04-13 19:27:31 +00001324 u8 *zBuf, /* Temporary buffer to use */
1325 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1326 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001327){
danb9bf16b2010-04-14 11:23:30 +00001328 int rc; /* Return code */
dan31c03902010-04-29 14:51:33 +00001329 int isChanged = 0; /* True if a new wal-index header is loaded */
dan7c246102010-04-12 19:00:29 +00001330
drh7ed91f22010-04-29 22:34:07 +00001331 assert( pWal->lockState==SQLITE_SHM_UNLOCK );
dan5cf53532010-05-01 16:40:20 +00001332 assert( pWal->pWiData==0 );
dan39c79f52010-04-15 10:58:51 +00001333
drh7ed91f22010-04-29 22:34:07 +00001334 /* Get the CHECKPOINT lock */
dan64d039e2010-04-13 19:27:31 +00001335 do {
drh7ed91f22010-04-29 22:34:07 +00001336 rc = walSetLock(pWal, SQLITE_SHM_CHECKPOINT);
dan64d039e2010-04-13 19:27:31 +00001337 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001338 if( rc!=SQLITE_OK ){
drh7ed91f22010-04-29 22:34:07 +00001339 walSetLock(pWal, SQLITE_SHM_UNLOCK);
danb9bf16b2010-04-14 11:23:30 +00001340 return rc;
1341 }
dan64d039e2010-04-13 19:27:31 +00001342
danb9bf16b2010-04-14 11:23:30 +00001343 /* Copy data from the log to the database file. */
drh7ed91f22010-04-29 22:34:07 +00001344 rc = walIndexReadHdr(pWal, &isChanged);
danb9bf16b2010-04-14 11:23:30 +00001345 if( rc==SQLITE_OK ){
danb6e099a2010-05-04 14:47:39 +00001346 rc = walCheckpoint(pWal, pFd, sync_flags, nBuf, zBuf);
danb9bf16b2010-04-14 11:23:30 +00001347 }
dan31c03902010-04-29 14:51:33 +00001348 if( isChanged ){
1349 /* If a new wal-index header was loaded before the checkpoint was
drh7ed91f22010-04-29 22:34:07 +00001350 ** performed, then the pager-cache associated with log pWal is now
dan31c03902010-04-29 14:51:33 +00001351 ** out of date. So zero the cached wal-index header to ensure that
1352 ** next time the pager opens a snapshot on this database it knows that
1353 ** the cache needs to be reset.
1354 */
drh7ed91f22010-04-29 22:34:07 +00001355 memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
dan31c03902010-04-29 14:51:33 +00001356 }
danb9bf16b2010-04-14 11:23:30 +00001357
1358 /* Release the locks. */
dan87bfb512010-04-30 11:43:28 +00001359 walIndexUnmap(pWal);
drh7ed91f22010-04-29 22:34:07 +00001360 walSetLock(pWal, SQLITE_SHM_UNLOCK);
dan64d039e2010-04-13 19:27:31 +00001361 return rc;
dan7c246102010-04-12 19:00:29 +00001362}
1363
drh7ed91f22010-04-29 22:34:07 +00001364/* Return the value to pass to a sqlite3_wal_hook callback, the
1365** number of frames in the WAL at the point of the last commit since
1366** sqlite3WalCallback() was called. If no commits have occurred since
1367** the last call, then return 0.
1368*/
1369int sqlite3WalCallback(Wal *pWal){
dan8d22a172010-04-19 18:03:51 +00001370 u32 ret = 0;
drh7ed91f22010-04-29 22:34:07 +00001371 if( pWal ){
1372 ret = pWal->iCallback;
1373 pWal->iCallback = 0;
dan8d22a172010-04-19 18:03:51 +00001374 }
1375 return (int)ret;
1376}
dan5cf53532010-05-01 16:40:20 +00001377#endif /* #ifndef SQLITE_OMIT_WAL */