blob: 3b57184c8174f82ccbb38b8312fd0ada591d087a [file] [log] [blame]
dan7c246102010-04-12 19:00:29 +00001/*
drh7ed91f22010-04-29 22:34:07 +00002** 2010 February 1
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
13** This file contains the implementation of a write-ahead log file used in
dan7c246102010-04-12 19:00:29 +000014** "journal_mode=wal" mode.
15*/
drh7ed91f22010-04-29 22:34:07 +000016#include "wal.h"
dan7c246102010-04-12 19:00:29 +000017
dan4b64c1e2010-04-27 18:49:54 +000018
dan97a31352010-04-16 13:59:31 +000019/*
drh7ed91f22010-04-29 22:34:07 +000020** WRITE-AHEAD LOG (WAL) FILE FORMAT
dan97a31352010-04-16 13:59:31 +000021**
drh7ed91f22010-04-29 22:34:07 +000022** A wal file consists of a header followed by zero or more "frames".
23** The header is 12 bytes in size and consists of the following three
dan97a31352010-04-16 13:59:31 +000024** big-endian 32-bit unsigned integer values:
25**
dan3de777f2010-04-17 12:31:37 +000026** 0: Database page size,
27** 4: Randomly selected salt value 1,
28** 8: Randomly selected salt value 2.
dan97a31352010-04-16 13:59:31 +000029**
drh7ed91f22010-04-29 22:34:07 +000030** Immediately following the header are zero or more frames. Each
** frame itself consists of a 16-byte header followed by <page-size> bytes
32** of page data. The header is broken into 4 big-endian 32-bit unsigned
33** integer values, as follows:
34**
dan3de777f2010-04-17 12:31:37 +000035** 0: Page number.
36** 4: For commit records, the size of the database image in pages
dan97a31352010-04-16 13:59:31 +000037** after the commit. For all other records, zero.
dan3de777f2010-04-17 12:31:37 +000038** 8: Checksum value 1.
dan97a31352010-04-16 13:59:31 +000039** 12: Checksum value 2.
40*/
41
42/*
drh7ed91f22010-04-29 22:34:07 +000043** WAL-INDEX FILE FORMAT
dan97a31352010-04-16 13:59:31 +000044**
drh7ed91f22010-04-29 22:34:07 +000045** The wal-index file consists of a 32-byte header region, followed by an
46** 8-byte region that contains no useful data (used to apply byte-range locks
danff207012010-04-24 04:49:15 +000047** to), followed by the data region.
48**
49** The contents of both the header and data region are specified in terms
50** of 1, 2 and 4 byte unsigned integers. All integers are stored in
drh7ed91f22010-04-29 22:34:07 +000051** machine-endian order. The wal-index is not a persistent file and
** so it does not need to be portable across architectures.
danff207012010-04-24 04:49:15 +000053**
drh7ed91f22010-04-29 22:34:07 +000054** A wal-index file is essentially a shadow-pager map. It contains a
55** mapping from database page number to the set of locations in the wal
danff207012010-04-24 04:49:15 +000056** file that contain versions of the database page. When a database
drh7ed91f22010-04-29 22:34:07 +000057** client needs to read a page of data, it first queries the wal-index
danff207012010-04-24 04:49:15 +000058** file to determine if the required version of the page is stored in
drh7ed91f22010-04-29 22:34:07 +000059** the wal. If so, the page is read from the wal. If not, the page is
60** read from the database file.
danff207012010-04-24 04:49:15 +000061**
drh7ed91f22010-04-29 22:34:07 +000062** Whenever a transaction is appended to the wal or a checkpoint transfers
63** data from the wal into the database file, the wal-index is
danff207012010-04-24 04:49:15 +000064** updated accordingly.
65**
drh7ed91f22010-04-29 22:34:07 +000066** The fields in the wal-index file header are described in the comment
67** directly above the definition of struct WalIndexHdr (see below).
68** Immediately following the fields in the WalIndexHdr structure is
danff207012010-04-24 04:49:15 +000069** an 8 byte checksum based on the contents of the header. This field is
drh7ed91f22010-04-29 22:34:07 +000070** not the same as the iCheck1 and iCheck2 fields of the WalIndexHdr.
dan97a31352010-04-16 13:59:31 +000071*/
72
drh7ed91f22010-04-29 22:34:07 +000073/* Object declarations */
74typedef struct WalIndexHdr WalIndexHdr;
75typedef struct WalIterator WalIterator;
dan7c246102010-04-12 19:00:29 +000076
77
78/*
drh7ed91f22010-04-29 22:34:07 +000079** The following object stores a copy of the wal-index header.
dan7c246102010-04-12 19:00:29 +000080**
81** Member variables iCheck1 and iCheck2 contain the checksum for the
drh7ed91f22010-04-29 22:34:07 +000082** last frame written to the wal, or 2 and 3 respectively if the log
dan7c246102010-04-12 19:00:29 +000083** is currently empty.
84*/
drh7ed91f22010-04-29 22:34:07 +000085struct WalIndexHdr {
dan7c246102010-04-12 19:00:29 +000086 u32 iChange; /* Counter incremented each transaction */
87 u32 pgsz; /* Database page size in bytes */
88 u32 iLastPg; /* Address of last valid frame in log */
89 u32 nPage; /* Size of database in pages */
90 u32 iCheck1; /* Checkpoint value 1 */
91 u32 iCheck2; /* Checkpoint value 2 */
92};
93
/* Number of 32-bit (u32) fields in a serialized WalIndexHdr object. */
#define WALINDEX_HDR_NFIELD (sizeof(WalIndexHdr) / sizeof(u32))

/* A block of WALINDEX_LOCK_RESERVED bytes beginning at WALINDEX_LOCK_OFFSET
** is reserved for locks. Since some systems only feature mandatory
** file-locks, we do not read or write data from the region of the file on
** which locks are applied.
**
** WALINDEX_LOCK_OFFSET skips over the WalIndexHdr fields and the two u32
** header-checksum words that immediately follow them.
*/
#define WALINDEX_LOCK_OFFSET   ((sizeof(WalIndexHdr))+2*sizeof(u32))
#define WALINDEX_LOCK_RESERVED 8
dan7c246102010-04-12 19:00:29 +0000104
/* Size of header before each frame in wal */
#define WAL_FRAME_HDRSIZE 16

/* Size of write ahead log header */
#define WAL_HDRSIZE 12

/*
** Return the offset of frame iFrame in the write-ahead log file,
** assuming a database page size of pgsz bytes. The offset returned
** is to the start of the write-ahead log frame-header.
**
** Frames are numbered starting at 1. The multiplication is carried out
** in 64-bit (i64) arithmetic so that the result does not wrap around
** once the log grows past 4GiB.
*/
#define walFrameOffset(iFrame, pgsz) (                               \
  WAL_HDRSIZE + ((iFrame)-1)*(i64)((pgsz)+WAL_FRAME_HDRSIZE)         \
)
dan7c246102010-04-12 19:00:29 +0000119
120/*
drh7ed91f22010-04-29 22:34:07 +0000121** An open write-ahead log file is represented by an instance of the
122** following object.
dance4f05f2010-04-22 19:14:13 +0000123*/
drh7ed91f22010-04-29 22:34:07 +0000124struct Wal {
125 sqlite3_vfs *pVfs; /* The VFS used to create pFd */
126 sqlite3_file *pFd; /* File handle for WAL file */
127 u32 iCallback; /* Value to pass to log callback (or 0) */
128 sqlite3_shm *pWIndex; /* The open wal-index file */
drh5530b762010-04-30 14:39:50 +0000129 int szWIndex; /* Size of the wal-index that is mapped in mem */
drh7ed91f22010-04-29 22:34:07 +0000130 u32 *pWiData; /* Pointer to wal-index content in memory */
131 u8 lockState; /* SQLITE_SHM_xxxx constant showing lock state */
132 u8 readerType; /* SQLITE_SHM_READ or SQLITE_SHM_READ_FULL */
133 WalIndexHdr hdr; /* Wal-index for current snapshot */
dan7c246102010-04-12 19:00:29 +0000134};
135
dan64d039e2010-04-13 19:27:31 +0000136
dan7c246102010-04-12 19:00:29 +0000137/*
138** This structure is used to implement an iterator that iterates through
139** all frames in the log in database page order. Where two or more frames
140** correspond to the same database page, the iterator visits only the
141** frame most recently written to the log.
142**
143** The internals of this structure are only accessed by:
144**
drh7ed91f22010-04-29 22:34:07 +0000145** walIteratorInit() - Create a new iterator,
146** walIteratorNext() - Step an iterator,
147** walIteratorFree() - Free an iterator.
dan7c246102010-04-12 19:00:29 +0000148**
drh7ed91f22010-04-29 22:34:07 +0000149** This functionality is used by the checkpoint code (see walCheckpoint()).
dan7c246102010-04-12 19:00:29 +0000150*/
drh7ed91f22010-04-29 22:34:07 +0000151struct WalIterator {
152 int nSegment; /* Size of WalIterator.aSegment[] array */
dan7c246102010-04-12 19:00:29 +0000153 int nFinal; /* Elements in segment nSegment-1 */
drh7ed91f22010-04-29 22:34:07 +0000154 struct WalSegment {
dan7c246102010-04-12 19:00:29 +0000155 int iNext; /* Next aIndex index */
156 u8 *aIndex; /* Pointer to index array */
157 u32 *aDbPage; /* Pointer to db page array */
158 } aSegment[1];
159};
160
dan64d039e2010-04-13 19:27:31 +0000161
dan7c246102010-04-12 19:00:29 +0000162/*
163** Generate an 8 byte checksum based on the data in array aByte[] and the
164** initial values of aCksum[0] and aCksum[1]. The checksum is written into
165** aCksum[] before returning.
dan56d95912010-04-24 19:07:29 +0000166**
167** The range of bytes to checksum is treated as an array of 32-bit
168** little-endian unsigned integers. For each integer X in the array, from
169** start to finish, do the following:
170**
171** aCksum[0] += X;
172** aCksum[1] += aCksum[0];
173**
174** For the calculation above, use 64-bit unsigned accumulators. Before
175** returning, truncate the values to 32-bits as follows:
176**
177** aCksum[0] = (u32)(aCksum[0] + (aCksum[0]>>24));
178** aCksum[1] = (u32)(aCksum[1] + (aCksum[1]>>24));
dan7c246102010-04-12 19:00:29 +0000179*/
drh7ed91f22010-04-29 22:34:07 +0000180static void walChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
dan39c79f52010-04-15 10:58:51 +0000181 u64 sum1 = aCksum[0];
182 u64 sum2 = aCksum[1];
183 u32 *a32 = (u32 *)aByte;
184 u32 *aEnd = (u32 *)&aByte[nByte];
dan7c246102010-04-12 19:00:29 +0000185
dan7c246102010-04-12 19:00:29 +0000186 assert( (nByte&0x00000003)==0 );
187
dance4f05f2010-04-22 19:14:13 +0000188 if( SQLITE_LITTLEENDIAN ){
189#ifdef SQLITE_DEBUG
190 u8 *a = (u8 *)a32;
191 assert( *a32==(a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24)) );
192#endif
193 do {
194 sum1 += *a32;
195 sum2 += sum1;
196 } while( ++a32<aEnd );
197 }else{
198 do {
199 u8 *a = (u8*)a32;
200 sum1 += a[0] + (a[1]<<8) + (a[2]<<16) + (a[3]<<24);
201 sum2 += sum1;
202 } while( ++a32<aEnd );
203 }
dan7c246102010-04-12 19:00:29 +0000204
dan39c79f52010-04-15 10:58:51 +0000205 aCksum[0] = sum1 + (sum1>>24);
206 aCksum[1] = sum2 + (sum2>>24);
dan7c246102010-04-12 19:00:29 +0000207}
208
209/*
drh7ed91f22010-04-29 22:34:07 +0000210** Attempt to change the lock status.
dan7c246102010-04-12 19:00:29 +0000211**
drh7ed91f22010-04-29 22:34:07 +0000212** When changing the lock status to SQLITE_SHM_READ, store the
213** type of reader lock (either SQLITE_SHM_READ or SQLITE_SHM_READ_FULL)
214** in pWal->readerType.
dan7c246102010-04-12 19:00:29 +0000215*/
drh7ed91f22010-04-29 22:34:07 +0000216static int walSetLock(Wal *pWal, int desiredStatus){
217 int rc, got;
218 if( pWal->lockState==desiredStatus ) return SQLITE_OK;
219 rc = pWal->pVfs->xShmLock(pWal->pWIndex, desiredStatus, &got);
drh49156b22010-04-30 16:12:04 +0000220 pWal->lockState = got;
221 if( got==SQLITE_SHM_READ_FULL || got==SQLITE_SHM_READ ){
222 pWal->readerType = got;
223 pWal->lockState = SQLITE_SHM_READ;
dan7c246102010-04-12 19:00:29 +0000224 }
225 return rc;
226}
227
drh7ed91f22010-04-29 22:34:07 +0000228/*
229** Update the header of the wal-index file.
230*/
231static void walIndexWriteHdr(Wal *pWal, WalIndexHdr *pHdr){
232 u32 *aHdr = pWal->pWiData; /* Write header here */
233 u32 *aCksum = &aHdr[WALINDEX_HDR_NFIELD]; /* Write header cksum here */
danff207012010-04-24 04:49:15 +0000234
drh7ed91f22010-04-29 22:34:07 +0000235 assert( WALINDEX_HDR_NFIELD==sizeof(WalIndexHdr)/4 );
236 assert( aHdr!=0 );
237 memcpy(aHdr, pHdr, sizeof(WalIndexHdr));
danff207012010-04-24 04:49:15 +0000238 aCksum[0] = aCksum[1] = 1;
drh7ed91f22010-04-29 22:34:07 +0000239 walChecksumBytes((u8 *)aHdr, sizeof(WalIndexHdr), aCksum);
dan7c246102010-04-12 19:00:29 +0000240}
241
242/*
243** This function encodes a single frame header and writes it to a buffer
drh7ed91f22010-04-29 22:34:07 +0000244** supplied by the caller. A frame-header is made up of a series of
dan7c246102010-04-12 19:00:29 +0000245** 4-byte big-endian integers, as follows:
246**
247** 0: Database page size in bytes.
248** 4: Page number.
249** 8: New database size (for commit frames, otherwise zero).
250** 12: Frame checksum 1.
251** 16: Frame checksum 2.
252*/
drh7ed91f22010-04-29 22:34:07 +0000253static void walEncodeFrame(
dan7c246102010-04-12 19:00:29 +0000254 u32 *aCksum, /* IN/OUT: Checksum values */
255 u32 iPage, /* Database page number for frame */
256 u32 nTruncate, /* New db size (or 0 for non-commit frames) */
257 int nData, /* Database page size (size of aData[]) */
258 u8 *aData, /* Pointer to page data (for checksum) */
259 u8 *aFrame /* OUT: Write encoded frame here */
260){
drh7ed91f22010-04-29 22:34:07 +0000261 assert( WAL_FRAME_HDRSIZE==16 );
dan7c246102010-04-12 19:00:29 +0000262
dan97a31352010-04-16 13:59:31 +0000263 sqlite3Put4byte(&aFrame[0], iPage);
264 sqlite3Put4byte(&aFrame[4], nTruncate);
dan7c246102010-04-12 19:00:29 +0000265
drh7ed91f22010-04-29 22:34:07 +0000266 walChecksumBytes(aFrame, 8, aCksum);
267 walChecksumBytes(aData, nData, aCksum);
dan7c246102010-04-12 19:00:29 +0000268
dan97a31352010-04-16 13:59:31 +0000269 sqlite3Put4byte(&aFrame[8], aCksum[0]);
270 sqlite3Put4byte(&aFrame[12], aCksum[1]);
dan7c246102010-04-12 19:00:29 +0000271}
272
273/*
274** Return 1 and populate *piPage, *pnTruncate and aCksum if the
275** frame checksum looks Ok. Otherwise return 0.
276*/
drh7ed91f22010-04-29 22:34:07 +0000277static int walDecodeFrame(
dan7c246102010-04-12 19:00:29 +0000278 u32 *aCksum, /* IN/OUT: Checksum values */
279 u32 *piPage, /* OUT: Database page number for frame */
280 u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
281 int nData, /* Database page size (size of aData[]) */
282 u8 *aData, /* Pointer to page data (for checksum) */
283 u8 *aFrame /* Frame data */
284){
drh7ed91f22010-04-29 22:34:07 +0000285 assert( WAL_FRAME_HDRSIZE==16 );
dan4a4b01d2010-04-16 11:30:18 +0000286
drh7ed91f22010-04-29 22:34:07 +0000287 walChecksumBytes(aFrame, 8, aCksum);
288 walChecksumBytes(aData, nData, aCksum);
dan7c246102010-04-12 19:00:29 +0000289
dan97a31352010-04-16 13:59:31 +0000290 if( aCksum[0]!=sqlite3Get4byte(&aFrame[8])
291 || aCksum[1]!=sqlite3Get4byte(&aFrame[12])
dan7c246102010-04-12 19:00:29 +0000292 ){
293 /* Checksum failed. */
294 return 0;
295 }
296
dan97a31352010-04-16 13:59:31 +0000297 *piPage = sqlite3Get4byte(&aFrame[0]);
298 *pnTruncate = sqlite3Get4byte(&aFrame[4]);
dan7c246102010-04-12 19:00:29 +0000299 return 1;
300}
301
drh7ed91f22010-04-29 22:34:07 +0000302static void walMergesort8(
303 Pgno *aContent, /* Pages in wal */
dan7c246102010-04-12 19:00:29 +0000304 u8 *aBuffer, /* Buffer of at least *pnList items to use */
305 u8 *aList, /* IN/OUT: List to sort */
306 int *pnList /* IN/OUT: Number of elements in aList[] */
307){
308 int nList = *pnList;
309 if( nList>1 ){
310 int nLeft = nList / 2; /* Elements in left list */
311 int nRight = nList - nLeft; /* Elements in right list */
312 u8 *aLeft = aList; /* Left list */
313 u8 *aRight = &aList[nLeft]; /* Right list */
314 int iLeft = 0; /* Current index in aLeft */
315 int iRight = 0; /* Current index in aright */
316 int iOut = 0; /* Current index in output buffer */
317
318 /* TODO: Change to non-recursive version. */
drh7ed91f22010-04-29 22:34:07 +0000319 walMergesort8(aContent, aBuffer, aLeft, &nLeft);
320 walMergesort8(aContent, aBuffer, aRight, &nRight);
dan7c246102010-04-12 19:00:29 +0000321
322 while( iRight<nRight || iLeft<nLeft ){
323 u8 logpage;
324 Pgno dbpage;
325
326 if( (iLeft<nLeft)
327 && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
328 ){
329 logpage = aLeft[iLeft++];
330 }else{
331 logpage = aRight[iRight++];
332 }
333 dbpage = aContent[logpage];
334
335 aBuffer[iOut++] = logpage;
336 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
337
338 assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
339 assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
340 }
341 memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
342 *pnList = iOut;
343 }
344
345#ifdef SQLITE_DEBUG
346 {
347 int i;
348 for(i=1; i<*pnList; i++){
349 assert( aContent[aList[i]] > aContent[aList[i-1]] );
350 }
351 }
352#endif
353}
354
355
356/*
drh7ed91f22010-04-29 22:34:07 +0000357** Return the index in the WalIndex.aData array that corresponds to
358** frame iFrame. The wal-index file consists of a header, followed by
dan7c246102010-04-12 19:00:29 +0000359** alternating "map" and "index" blocks.
360*/
drh7ed91f22010-04-29 22:34:07 +0000361static int walIndexEntry(u32 iFrame){
danff207012010-04-24 04:49:15 +0000362 return (
drh7ed91f22010-04-29 22:34:07 +0000363 (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
danff207012010-04-24 04:49:15 +0000364 + (((iFrame-1)>>8)<<6) /* Indexes that occur before iFrame */
365 + iFrame-1 /* Db page numbers that occur before iFrame */
366 );
dan7c246102010-04-12 19:00:29 +0000367}
368
drh7ed91f22010-04-29 22:34:07 +0000369/*
drh5530b762010-04-30 14:39:50 +0000370** Release our reference to the wal-index memory map, if we are holding
371** it.
drh7ed91f22010-04-29 22:34:07 +0000372*/
373static void walIndexUnmap(Wal *pWal){
374 if( pWal->pWiData ){
375 pWal->pVfs->xShmRelease(pWal->pWIndex);
376 pWal->pWiData = 0;
377 }
378}
dan7c246102010-04-12 19:00:29 +0000379
380/*
drh5530b762010-04-30 14:39:50 +0000381** Map the wal-index file into memory if it isn't already.
382**
383** The reqSize parameter is the minimum required size of the mapping.
384** A value of -1 means "don't care". The reqSize parameter is ignored
385** if the mapping is already held.
drh7ed91f22010-04-29 22:34:07 +0000386*/
drh5530b762010-04-30 14:39:50 +0000387static int walIndexMap(Wal *pWal, int reqSize){
388 int rc = SQLITE_OK;
389 if( pWal->pWiData==0 ){
390 rc = pWal->pVfs->xShmGet(pWal->pWIndex, reqSize, &pWal->szWIndex,
391 (void**)(char*)&pWal->pWiData);
392 if( rc==SQLITE_OK && pWal->pWiData==0 ){
393 /* Make sure pWal->pWiData is not NULL while we are holding the
394 ** lock on the mapping. */
395 assert( pWal->szWIndex==0 );
396 pWal->pWiData = &pWal->iCallback;
397 }
drh79e6c782010-04-30 02:13:26 +0000398 }
399 return rc;
400}
401
402/*
drh5530b762010-04-30 14:39:50 +0000403** Remap the wal-index so that the mapping covers the full size
404** of the underlying file.
405**
406** If enlargeTo is non-negative, then increase the size of the underlying
407** storage to be at least as big as enlargeTo before remapping.
drh79e6c782010-04-30 02:13:26 +0000408*/
drh5530b762010-04-30 14:39:50 +0000409static int walIndexRemap(Wal *pWal, int enlargeTo){
410 int rc;
411 int sz;
412 rc = pWal->pVfs->xShmSize(pWal->pWIndex, enlargeTo, &sz);
413 if( rc==SQLITE_OK && sz>pWal->szWIndex ){
414 walIndexUnmap(pWal);
415 rc = walIndexMap(pWal, sz);
416 }
drh7ed91f22010-04-29 22:34:07 +0000417 return rc;
418}
419
420/*
421** Increment by which to increase the wal-index file size.
422*/
423#define WALINDEX_MMAP_INCREMENT (64*1024)
424
425/*
426** Set an entry in the wal-index map to map log frame iFrame to db
427** page iPage. Values are always appended to the wal-index (i.e. the
dan7c246102010-04-12 19:00:29 +0000428** value of iFrame is always exactly one more than the value passed to
429** the previous call), but that restriction is not enforced or asserted
430** here.
431*/
drh7ed91f22010-04-29 22:34:07 +0000432static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
433 u32 iSlot = walIndexEntry(iFrame);
434
drh5530b762010-04-30 14:39:50 +0000435 walIndexMap(pWal, -1);
drh7ed91f22010-04-29 22:34:07 +0000436 while( (iSlot+128)>=pWal->szWIndex ){
dan31f98fc2010-04-27 05:42:32 +0000437 int rc;
drh7ed91f22010-04-29 22:34:07 +0000438 int nByte = pWal->szWIndex*4 + WALINDEX_MMAP_INCREMENT;
dance4f05f2010-04-22 19:14:13 +0000439
drh5530b762010-04-30 14:39:50 +0000440 /* Enlarge the storage, then remap it. */
drh7ed91f22010-04-29 22:34:07 +0000441 rc = walIndexRemap(pWal, nByte);
dan31f98fc2010-04-27 05:42:32 +0000442 if( rc!=SQLITE_OK ){
443 return rc;
444 }
dance4f05f2010-04-22 19:14:13 +0000445 }
446
drh7ed91f22010-04-29 22:34:07 +0000447 /* Set the wal-index entry itself */
448 pWal->pWiData[iSlot] = iPage;
dan7c246102010-04-12 19:00:29 +0000449
450 /* If the frame number is a multiple of 256 (frames are numbered starting
451 ** at 1), build an index of the most recently added 256 frames.
452 */
453 if( (iFrame&0x000000FF)==0 ){
454 int i; /* Iterator used while initializing aIndex */
455 u32 *aFrame; /* Pointer to array of 256 frames */
456 int nIndex; /* Number of entries in index */
457 u8 *aIndex; /* 256 bytes to build index in */
458 u8 *aTmp; /* Scratch space to use while sorting */
459
drh7ed91f22010-04-29 22:34:07 +0000460 aFrame = &pWal->pWiData[iSlot-255];
461 aIndex = (u8 *)&pWal->pWiData[iSlot+1];
dan7c246102010-04-12 19:00:29 +0000462 aTmp = &aIndex[256];
463
464 nIndex = 256;
465 for(i=0; i<256; i++) aIndex[i] = (u8)i;
drh7ed91f22010-04-29 22:34:07 +0000466 walMergesort8(aFrame, aTmp, aIndex, &nIndex);
dan7c246102010-04-12 19:00:29 +0000467 memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
468 }
dan31f98fc2010-04-27 05:42:32 +0000469
470 return SQLITE_OK;
dan7c246102010-04-12 19:00:29 +0000471}
472
473
474/*
drh7ed91f22010-04-29 22:34:07 +0000475** Recover the wal-index by reading the write-ahead log file.
476** The caller must hold RECOVER lock on the wal-index file.
dan7c246102010-04-12 19:00:29 +0000477*/
drh7ed91f22010-04-29 22:34:07 +0000478static int walIndexRecover(Wal *pWal){
dan7c246102010-04-12 19:00:29 +0000479 int rc; /* Return Code */
480 i64 nSize; /* Size of log file */
drh7ed91f22010-04-29 22:34:07 +0000481 WalIndexHdr hdr; /* Recovered wal-index header */
dan7c246102010-04-12 19:00:29 +0000482
drh7ed91f22010-04-29 22:34:07 +0000483 assert( pWal->lockState==SQLITE_SHM_RECOVER );
dan7c246102010-04-12 19:00:29 +0000484 memset(&hdr, 0, sizeof(hdr));
485
drh7ed91f22010-04-29 22:34:07 +0000486 rc = sqlite3OsFileSize(pWal->pFd, &nSize);
dan7c246102010-04-12 19:00:29 +0000487 if( rc!=SQLITE_OK ){
488 return rc;
489 }
490
drh7ed91f22010-04-29 22:34:07 +0000491 if( nSize>WAL_FRAME_HDRSIZE ){
492 u8 aBuf[WAL_FRAME_HDRSIZE]; /* Buffer to load first frame header into */
dan7c246102010-04-12 19:00:29 +0000493 u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
494 int nFrame; /* Number of bytes at aFrame */
495 u8 *aData; /* Pointer to data part of aFrame buffer */
496 int iFrame; /* Index of last frame read */
497 i64 iOffset; /* Next offset to read from log file */
498 int nPgsz; /* Page size according to the log */
dan97a31352010-04-16 13:59:31 +0000499 u32 aCksum[2]; /* Running checksum */
dan7c246102010-04-12 19:00:29 +0000500
501 /* Read in the first frame header in the file (to determine the
502 ** database page size).
503 */
drh7ed91f22010-04-29 22:34:07 +0000504 rc = sqlite3OsRead(pWal->pFd, aBuf, WAL_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000505 if( rc!=SQLITE_OK ){
506 return rc;
507 }
508
509 /* If the database page size is not a power of two, or is greater than
510 ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data.
511 */
512 nPgsz = sqlite3Get4byte(&aBuf[0]);
dance4f05f2010-04-22 19:14:13 +0000513 if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE || nPgsz<512 ){
dan7c246102010-04-12 19:00:29 +0000514 goto finished;
515 }
dan97a31352010-04-16 13:59:31 +0000516 aCksum[0] = sqlite3Get4byte(&aBuf[4]);
517 aCksum[1] = sqlite3Get4byte(&aBuf[8]);
dan7c246102010-04-12 19:00:29 +0000518
519 /* Malloc a buffer to read frames into. */
drh7ed91f22010-04-29 22:34:07 +0000520 nFrame = nPgsz + WAL_FRAME_HDRSIZE;
dan7c246102010-04-12 19:00:29 +0000521 aFrame = (u8 *)sqlite3_malloc(nFrame);
522 if( !aFrame ){
523 return SQLITE_NOMEM;
524 }
drh7ed91f22010-04-29 22:34:07 +0000525 aData = &aFrame[WAL_FRAME_HDRSIZE];
dan7c246102010-04-12 19:00:29 +0000526
527 /* Read all frames from the log file. */
528 iFrame = 0;
drh7ed91f22010-04-29 22:34:07 +0000529 for(iOffset=WAL_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
dan7c246102010-04-12 19:00:29 +0000530 u32 pgno; /* Database page number for frame */
531 u32 nTruncate; /* dbsize field from frame header */
532 int isValid; /* True if this frame is valid */
533
534 /* Read and decode the next log frame. */
drh7ed91f22010-04-29 22:34:07 +0000535 rc = sqlite3OsRead(pWal->pFd, aFrame, nFrame, iOffset);
dan7c246102010-04-12 19:00:29 +0000536 if( rc!=SQLITE_OK ) break;
drh7ed91f22010-04-29 22:34:07 +0000537 isValid = walDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
dan7c246102010-04-12 19:00:29 +0000538 if( !isValid ) break;
drh7ed91f22010-04-29 22:34:07 +0000539 walIndexAppend(pWal, ++iFrame, pgno);
dan7c246102010-04-12 19:00:29 +0000540
541 /* If nTruncate is non-zero, this is a commit record. */
542 if( nTruncate ){
543 hdr.iCheck1 = aCksum[0];
544 hdr.iCheck2 = aCksum[1];
545 hdr.iLastPg = iFrame;
546 hdr.nPage = nTruncate;
547 hdr.pgsz = nPgsz;
548 }
549 }
550
551 sqlite3_free(aFrame);
552 }else{
553 hdr.iCheck1 = 2;
554 hdr.iCheck2 = 3;
555 }
556
557finished:
drh7ed91f22010-04-29 22:34:07 +0000558 walIndexWriteHdr(pWal, &hdr);
dan7c246102010-04-12 19:00:29 +0000559 return rc;
560}
561
562/*
563** Open a connection to the log file associated with database zDb. The
564** database file does not actually have to exist. zDb is used only to
565** figure out the name of the log file to open. If the log file does not
566** exist it is created by this call.
dan3de777f2010-04-17 12:31:37 +0000567**
568** A SHARED lock should be held on the database file when this function
569** is called. The purpose of this SHARED lock is to prevent any other
drh7ed91f22010-04-29 22:34:07 +0000570** client from unlinking the log or wal-index file. If another process
dan3de777f2010-04-17 12:31:37 +0000571** were to do this just after this client opened one of these files, the
572** system would be badly broken.
dan7c246102010-04-12 19:00:29 +0000573*/
drhc438efd2010-04-26 00:19:45 +0000574int sqlite3WalOpen(
drh7ed91f22010-04-29 22:34:07 +0000575 sqlite3_vfs *pVfs, /* vfs module to open wal and wal-index */
dan7c246102010-04-12 19:00:29 +0000576 const char *zDb, /* Name of database file */
drh7ed91f22010-04-29 22:34:07 +0000577 Wal **ppWal /* OUT: Allocated Wal handle */
dan7c246102010-04-12 19:00:29 +0000578){
danb9bf16b2010-04-14 11:23:30 +0000579 int rc = SQLITE_OK; /* Return Code */
drh7ed91f22010-04-29 22:34:07 +0000580 Wal *pRet; /* Object to allocate and return */
dan7c246102010-04-12 19:00:29 +0000581 int flags; /* Flags passed to OsOpen() */
582 char *zWal = 0; /* Path to WAL file */
583 int nWal; /* Length of zWal in bytes */
584
dan7c246102010-04-12 19:00:29 +0000585 assert( zDb );
dan87bfb512010-04-30 11:43:28 +0000586 if( pVfs->xShmOpen==0 ) return SQLITE_CANTOPEN_BKPT;
dan7c246102010-04-12 19:00:29 +0000587
drh7ed91f22010-04-29 22:34:07 +0000588 /* Allocate an instance of struct Wal to return. */
589 *ppWal = 0;
590 nWal = strlen(zDb);
591 pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile + nWal+11);
592 if( !pRet ) goto wal_open_out;
dan7c246102010-04-12 19:00:29 +0000593 pRet->pVfs = pVfs;
594 pRet->pFd = (sqlite3_file *)&pRet[1];
drh7ed91f22010-04-29 22:34:07 +0000595 zWal = pVfs->szOsFile + (char*)pRet->pFd;
drh79e6c782010-04-30 02:13:26 +0000596 sqlite3_snprintf(nWal+11, zWal, "%s-wal-index", zDb);
drh7ed91f22010-04-29 22:34:07 +0000597 rc = pVfs->xShmOpen(pVfs, zWal, &pRet->pWIndex);
598 if( rc ) goto wal_open_out;
dan7c246102010-04-12 19:00:29 +0000599
drh7ed91f22010-04-29 22:34:07 +0000600 /* Open file handle on the write-ahead log file. */
drh79e6c782010-04-30 02:13:26 +0000601 zWal[nWal+4] = 0;
dan67032392010-04-17 15:42:43 +0000602 flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
drh7ed91f22010-04-29 22:34:07 +0000603 rc = sqlite3OsOpen(pVfs, zWal, pRet->pFd, flags, &flags);
dan7c246102010-04-12 19:00:29 +0000604
drh7ed91f22010-04-29 22:34:07 +0000605wal_open_out:
dan7c246102010-04-12 19:00:29 +0000606 if( rc!=SQLITE_OK ){
dan7c246102010-04-12 19:00:29 +0000607 if( pRet ){
drh7ed91f22010-04-29 22:34:07 +0000608 pVfs->xShmClose(pRet->pWIndex);
dan7c246102010-04-12 19:00:29 +0000609 sqlite3OsClose(pRet->pFd);
610 sqlite3_free(pRet);
611 }
dan7c246102010-04-12 19:00:29 +0000612 }
drh7ed91f22010-04-29 22:34:07 +0000613 *ppWal = pRet;
dan7c246102010-04-12 19:00:29 +0000614 return rc;
615}
616
drh7ed91f22010-04-29 22:34:07 +0000617static int walIteratorNext(
618 WalIterator *p, /* Iterator */
619 u32 *piPage, /* OUT: Next db page to write */
620 u32 *piFrame /* OUT: Wal frame to read from */
dan7c246102010-04-12 19:00:29 +0000621){
622 u32 iMin = *piPage;
623 u32 iRet = 0xFFFFFFFF;
624 int i;
625 int nBlock = p->nFinal;
626
627 for(i=p->nSegment-1; i>=0; i--){
drh7ed91f22010-04-29 22:34:07 +0000628 struct WalSegment *pSegment = &p->aSegment[i];
dan7c246102010-04-12 19:00:29 +0000629 while( pSegment->iNext<nBlock ){
630 u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
631 if( iPg>iMin ){
632 if( iPg<iRet ){
633 iRet = iPg;
634 *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
635 }
636 break;
637 }
638 pSegment->iNext++;
639 }
640
641 nBlock = 256;
642 }
643
644 *piPage = iRet;
645 return (iRet==0xFFFFFFFF);
646}
647
drh7ed91f22010-04-29 22:34:07 +0000648static WalIterator *walIteratorInit(Wal *pWal){
649 u32 *aData; /* Content of the wal-index file */
650 WalIterator *p; /* Return value */
dan7c246102010-04-12 19:00:29 +0000651 int nSegment; /* Number of segments to merge */
652 u32 iLast; /* Last frame in log */
653 int nByte; /* Number of bytes to allocate */
654 int i; /* Iterator variable */
655 int nFinal; /* Number of unindexed entries */
drh7ed91f22010-04-29 22:34:07 +0000656 struct WalSegment *pFinal; /* Final (unindexed) segment */
dan7c246102010-04-12 19:00:29 +0000657 u8 *aTmp; /* Temp space used by merge-sort */
658
drh5530b762010-04-30 14:39:50 +0000659 walIndexMap(pWal, -1);
drh7ed91f22010-04-29 22:34:07 +0000660 aData = pWal->pWiData;
661 iLast = pWal->hdr.iLastPg;
dan7c246102010-04-12 19:00:29 +0000662 nSegment = (iLast >> 8) + 1;
663 nFinal = (iLast & 0x000000FF);
664
drh7ed91f22010-04-29 22:34:07 +0000665 nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512;
666 p = (WalIterator *)sqlite3_malloc(nByte);
dan7c246102010-04-12 19:00:29 +0000667 if( p ){
668 memset(p, 0, nByte);
669 p->nSegment = nSegment;
670 p->nFinal = nFinal;
671 }
672
673 for(i=0; i<nSegment-1; i++){
drh7ed91f22010-04-29 22:34:07 +0000674 p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
675 p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256];
dan7c246102010-04-12 19:00:29 +0000676 }
677 pFinal = &p->aSegment[nSegment-1];
678
drh7ed91f22010-04-29 22:34:07 +0000679 pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)];
dan7c246102010-04-12 19:00:29 +0000680 pFinal->aIndex = (u8 *)&pFinal[1];
681 aTmp = &pFinal->aIndex[256];
682 for(i=0; i<nFinal; i++){
683 pFinal->aIndex[i] = i;
684 }
drh7ed91f22010-04-29 22:34:07 +0000685 walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
dan7c246102010-04-12 19:00:29 +0000686 p->nFinal = nFinal;
687
688 return p;
689}
690
691/*
drh7ed91f22010-04-29 22:34:07 +0000692** Free a log iterator allocated by walIteratorInit().
dan7c246102010-04-12 19:00:29 +0000693*/
drh7ed91f22010-04-29 22:34:07 +0000694static void walIteratorFree(WalIterator *p){
dan7c246102010-04-12 19:00:29 +0000695 sqlite3_free(p);
696}
697
698/*
699** Checkpoint the contents of the log file.
700*/
drh7ed91f22010-04-29 22:34:07 +0000701static int walCheckpoint(
702 Wal *pWal, /* Wal connection */
dan7c246102010-04-12 19:00:29 +0000703 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +0000704 int sync_flags, /* Flags for OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +0000705 u8 *zBuf /* Temporary buffer to use */
706){
707 int rc; /* Return code */
drh7ed91f22010-04-29 22:34:07 +0000708 int pgsz = pWal->hdr.pgsz; /* Database page-size */
709 WalIterator *pIter = 0; /* Wal iterator context */
dan7c246102010-04-12 19:00:29 +0000710 u32 iDbpage = 0; /* Next database page to write */
drh7ed91f22010-04-29 22:34:07 +0000711 u32 iFrame = 0; /* Wal frame containing data for iDbpage */
dan7c246102010-04-12 19:00:29 +0000712
drh7ed91f22010-04-29 22:34:07 +0000713 if( pWal->hdr.iLastPg==0 ){
danbb2e9c92010-04-15 13:33:18 +0000714 return SQLITE_OK;
715 }
716
dan7c246102010-04-12 19:00:29 +0000717 /* Allocate the iterator */
drh7ed91f22010-04-29 22:34:07 +0000718 pIter = walIteratorInit(pWal);
dan7c246102010-04-12 19:00:29 +0000719 if( !pIter ) return SQLITE_NOMEM;
720
721 /* Sync the log file to disk */
danc5118782010-04-17 17:34:41 +0000722 if( sync_flags ){
drh7ed91f22010-04-29 22:34:07 +0000723 rc = sqlite3OsSync(pWal->pFd, sync_flags);
danc5118782010-04-17 17:34:41 +0000724 if( rc!=SQLITE_OK ) goto out;
725 }
dan7c246102010-04-12 19:00:29 +0000726
727 /* Iterate through the contents of the log, copying data to the db file. */
drh7ed91f22010-04-29 22:34:07 +0000728 while( 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
729 rc = sqlite3OsRead(pWal->pFd, zBuf, pgsz,
730 walFrameOffset(iFrame, pgsz) + WAL_FRAME_HDRSIZE
dan7c246102010-04-12 19:00:29 +0000731 );
732 if( rc!=SQLITE_OK ) goto out;
733 rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
734 if( rc!=SQLITE_OK ) goto out;
735 }
736
737 /* Truncate the database file */
drh7ed91f22010-04-29 22:34:07 +0000738 rc = sqlite3OsTruncate(pFd, ((i64)pWal->hdr.nPage*(i64)pgsz));
dan7c246102010-04-12 19:00:29 +0000739 if( rc!=SQLITE_OK ) goto out;
740
drh7ed91f22010-04-29 22:34:07 +0000741 /* Sync the database file. If successful, update the wal-index. */
danc5118782010-04-17 17:34:41 +0000742 if( sync_flags ){
743 rc = sqlite3OsSync(pFd, sync_flags);
744 if( rc!=SQLITE_OK ) goto out;
745 }
drh7ed91f22010-04-29 22:34:07 +0000746 pWal->hdr.iLastPg = 0;
747 pWal->hdr.iCheck1 = 2;
748 pWal->hdr.iCheck2 = 3;
749 walIndexWriteHdr(pWal, &pWal->hdr);
dan7c246102010-04-12 19:00:29 +0000750
751 /* TODO: If a crash occurs and the current log is copied into the
752 ** database there is no problem. However, if a crash occurs while
753 ** writing the next transaction into the start of the log, such that:
754 **
755 ** * The first transaction currently in the log is left intact, but
756 ** * The second (or subsequent) transaction is damaged,
757 **
758 ** then the database could become corrupt.
759 **
760 ** The easiest thing to do would be to write and sync a dummy header
761 ** into the log at this point. Unfortunately, that turns out to be
762 ** an unwelcome performance hit. Alternatives are...
763 */
764#if 0
drh7ed91f22010-04-29 22:34:07 +0000765 memset(zBuf, 0, WAL_FRAME_HDRSIZE);
766 rc = sqlite3OsWrite(pWal->pFd, zBuf, WAL_FRAME_HDRSIZE, 0);
dan7c246102010-04-12 19:00:29 +0000767 if( rc!=SQLITE_OK ) goto out;
drh7ed91f22010-04-29 22:34:07 +0000768 rc = sqlite3OsSync(pWal->pFd, pWal->sync_flags);
dan7c246102010-04-12 19:00:29 +0000769#endif
770
771 out:
drh7ed91f22010-04-29 22:34:07 +0000772 walIteratorFree(pIter);
dan7c246102010-04-12 19:00:29 +0000773 return rc;
774}
775
776/*
777** Close a connection to a log file.
778*/
drhc438efd2010-04-26 00:19:45 +0000779int sqlite3WalClose(
drh7ed91f22010-04-29 22:34:07 +0000780 Wal *pWal, /* Wal to close */
dan7c246102010-04-12 19:00:29 +0000781 sqlite3_file *pFd, /* Database file */
danc5118782010-04-17 17:34:41 +0000782 int sync_flags, /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +0000783 u8 *zBuf /* Buffer of at least page-size bytes */
784){
785 int rc = SQLITE_OK;
drh7ed91f22010-04-29 22:34:07 +0000786 if( pWal ){
dan30c86292010-04-30 16:24:46 +0000787 int isDelete = 0; /* True to unlink wal and wal-index files */
788
789 /* If an EXCLUSIVE lock can be obtained on the database file (using the
790 ** ordinary, rollback-mode locking methods, this guarantees that the
791 ** connection associated with this log file is the only connection to
792 ** the database. In this case checkpoint the database and unlink both
793 ** the wal and wal-index files.
794 **
795 ** The EXCLUSIVE lock is not released before returning.
796 */
797 rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
798 if( rc==SQLITE_OK ){
799 rc = walCheckpoint(pWal, pFd, sync_flags, zBuf);
800 if( rc==SQLITE_OK ){
801 isDelete = 1;
802 }
803 walIndexUnmap(pWal);
804 }
805
drh7ed91f22010-04-29 22:34:07 +0000806 pWal->pVfs->xShmClose(pWal->pWIndex);
807 sqlite3OsClose(pWal->pFd);
dan30c86292010-04-30 16:24:46 +0000808 if( isDelete ){
809 int nWal;
810 char *zWal = &((char *)pWal->pFd)[pWal->pVfs->szOsFile];
811 sqlite3OsDelete(pWal->pVfs, zWal, 0);
812 nWal = sqlite3Strlen30(zWal);
813 memcpy(&zWal[nWal], "-index", 7);
814 pWal->pVfs->xShmDelete(pWal->pVfs, zWal);
815 }
drh7ed91f22010-04-29 22:34:07 +0000816 sqlite3_free(pWal);
dan7c246102010-04-12 19:00:29 +0000817 }
818 return rc;
819}
820
821/*
drh7ed91f22010-04-29 22:34:07 +0000822** Try to read the wal-index header. Attempt to verify the header
823** checksum. If the checksum can be verified, copy the wal-index
824** header into structure pWal->hdr. If the contents of pWal->hdr are
danb9bf16b2010-04-14 11:23:30 +0000825** modified by this and pChanged is not NULL, set *pChanged to 1.
826** Otherwise leave *pChanged unmodified.
827**
828** If the checksum cannot be verified return SQLITE_ERROR.
829*/
drh7ed91f22010-04-29 22:34:07 +0000830int walIndexTryHdr(Wal *pWal, int *pChanged){
danb9bf16b2010-04-14 11:23:30 +0000831 u32 aCksum[2] = {1, 1};
drh7ed91f22010-04-29 22:34:07 +0000832 u32 aHdr[WALINDEX_HDR_NFIELD+2];
danb9bf16b2010-04-14 11:23:30 +0000833
drh79e6c782010-04-30 02:13:26 +0000834 if( pWal->szWIndex==0 ){
drh5530b762010-04-30 14:39:50 +0000835 int rc;
836 rc = walIndexRemap(pWal, WALINDEX_MMAP_INCREMENT);
drh79e6c782010-04-30 02:13:26 +0000837 if( rc ) return rc;
838 }
839
drh7ed91f22010-04-29 22:34:07 +0000840 /* Read the header. The caller may or may not have locked the wal-index
dancd11fb22010-04-26 10:40:52 +0000841 ** file, meaning it is possible that an inconsistent snapshot is read
842 ** from the file. If this happens, return SQLITE_ERROR. The caller will
843 ** retry. Or, if the caller has already locked the file and the header
844 ** still looks inconsistent, it will run recovery.
drh79e6c782010-04-30 02:13:26 +0000845 **
846 ** FIX-ME: It is no longer possible to have not locked the wal-index.
danb9bf16b2010-04-14 11:23:30 +0000847 */
drh7ed91f22010-04-29 22:34:07 +0000848 memcpy(aHdr, pWal->pWiData, sizeof(aHdr));
849 walChecksumBytes((u8*)aHdr, sizeof(u32)*WALINDEX_HDR_NFIELD, aCksum);
850 if( aCksum[0]!=aHdr[WALINDEX_HDR_NFIELD]
851 || aCksum[1]!=aHdr[WALINDEX_HDR_NFIELD+1]
danb9bf16b2010-04-14 11:23:30 +0000852 ){
853 return SQLITE_ERROR;
854 }
855
drh7ed91f22010-04-29 22:34:07 +0000856 if( memcmp(&pWal->hdr, aHdr, sizeof(WalIndexHdr)) ){
danb9bf16b2010-04-14 11:23:30 +0000857 if( pChanged ){
858 *pChanged = 1;
859 }
drh7ed91f22010-04-29 22:34:07 +0000860 memcpy(&pWal->hdr, aHdr, sizeof(WalIndexHdr));
danb9bf16b2010-04-14 11:23:30 +0000861 }
862 return SQLITE_OK;
863}
864
865/*
drh7ed91f22010-04-29 22:34:07 +0000866** Read the wal-index header from the wal-index file into structure
867** pWal->hdr. If attempting to verify the header checksum fails, try
danb9bf16b2010-04-14 11:23:30 +0000868** to recover the log before returning.
869**
drh7ed91f22010-04-29 22:34:07 +0000870** If the wal-index header is successfully read, return SQLITE_OK.
danb9bf16b2010-04-14 11:23:30 +0000871** Otherwise an SQLite error code.
872*/
drh7ed91f22010-04-29 22:34:07 +0000873static int walIndexReadHdr(Wal *pWal, int *pChanged){
danb9bf16b2010-04-14 11:23:30 +0000874 int rc;
875
dan4c97b532010-04-30 09:52:17 +0000876 assert( pWal->lockState>=SQLITE_SHM_READ );
drh5530b762010-04-30 14:39:50 +0000877 walIndexMap(pWal, -1);
drh7ed91f22010-04-29 22:34:07 +0000878
danb9bf16b2010-04-14 11:23:30 +0000879 /* First try to read the header without a lock. Verify the checksum
880 ** before returning. This will almost always work.
881 */
drh7ed91f22010-04-29 22:34:07 +0000882 if( SQLITE_OK==walIndexTryHdr(pWal, pChanged) ){
danb9bf16b2010-04-14 11:23:30 +0000883 return SQLITE_OK;
884 }
885
drh7ed91f22010-04-29 22:34:07 +0000886 /* If the first attempt to read the header failed, lock the wal-index
danb9bf16b2010-04-14 11:23:30 +0000887 ** file and try again. If the header checksum verification fails this
888 ** time as well, run log recovery.
889 */
drh7ed91f22010-04-29 22:34:07 +0000890 if( SQLITE_OK==(rc = walSetLock(pWal, SQLITE_SHM_RECOVER)) ){
891 if( SQLITE_OK!=walIndexTryHdr(pWal, pChanged) ){
danb9bf16b2010-04-14 11:23:30 +0000892 if( pChanged ){
893 *pChanged = 1;
894 }
drh7ed91f22010-04-29 22:34:07 +0000895 rc = walIndexRecover(pWal);
danb9bf16b2010-04-14 11:23:30 +0000896 if( rc==SQLITE_OK ){
drh7ed91f22010-04-29 22:34:07 +0000897 rc = walIndexTryHdr(pWal, 0);
danb9bf16b2010-04-14 11:23:30 +0000898 }
899 }
drh7ed91f22010-04-29 22:34:07 +0000900 walSetLock(pWal, SQLITE_SHM_READ);
danb9bf16b2010-04-14 11:23:30 +0000901 }
902
903 return rc;
904}
905
906/*
dan64d039e2010-04-13 19:27:31 +0000907** Lock a snapshot.
dan7c246102010-04-12 19:00:29 +0000908**
909** If this call obtains a new read-lock and the database contents have been
drh7ed91f22010-04-29 22:34:07 +0000910** modified since the most recent call to WalCloseSnapshot() on this Wal
dan7c246102010-04-12 19:00:29 +0000911** connection, then *pChanged is set to 1 before returning. Otherwise, it
912** is left unmodified. This is used by the pager layer to determine whether
913** or not any cached pages may be safely reused.
914*/
drh7ed91f22010-04-29 22:34:07 +0000915int sqlite3WalOpenSnapshot(Wal *pWal, int *pChanged){
916 int rc;
dan64d039e2010-04-13 19:27:31 +0000917
drh7ed91f22010-04-29 22:34:07 +0000918 rc = walSetLock(pWal, SQLITE_SHM_READ);
919 if( rc==SQLITE_OK ){
920 pWal->lockState = SQLITE_SHM_READ;
dan64d039e2010-04-13 19:27:31 +0000921
drh7ed91f22010-04-29 22:34:07 +0000922 rc = walIndexReadHdr(pWal, pChanged);
dan64d039e2010-04-13 19:27:31 +0000923 if( rc!=SQLITE_OK ){
924 /* An error occured while attempting log recovery. */
drh7ed91f22010-04-29 22:34:07 +0000925 sqlite3WalCloseSnapshot(pWal);
dan31f98fc2010-04-27 05:42:32 +0000926 }else{
927 /* Check if the mapping needs to grow. */
drh5530b762010-04-30 14:39:50 +0000928 if( pWal->hdr.iLastPg
929 && walIndexEntry(pWal->hdr.iLastPg)>=pWal->szWIndex
930 ){
931 walIndexRemap(pWal, -1);
dan31f98fc2010-04-27 05:42:32 +0000932 }
dan64d039e2010-04-13 19:27:31 +0000933 }
dan7c246102010-04-12 19:00:29 +0000934 }
danba515902010-04-30 09:32:06 +0000935
936 walIndexUnmap(pWal);
dan7c246102010-04-12 19:00:29 +0000937 return rc;
938}
939
940/*
941** Unlock the current snapshot.
942*/
drh7ed91f22010-04-29 22:34:07 +0000943void sqlite3WalCloseSnapshot(Wal *pWal){
944 if( pWal->lockState!=SQLITE_SHM_UNLOCK ){
945 assert( pWal->lockState==SQLITE_SHM_READ );
946 walSetLock(pWal, SQLITE_SHM_UNLOCK);
dan64d039e2010-04-13 19:27:31 +0000947 }
dan7c246102010-04-12 19:00:29 +0000948}
949
dan5e0ce872010-04-28 17:48:44 +0000950/*
dan7c246102010-04-12 19:00:29 +0000951** Read a page from the log, if it is present.
952*/
drh7ed91f22010-04-29 22:34:07 +0000953int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, u8 *pOut){
dan7c246102010-04-12 19:00:29 +0000954 u32 iRead = 0;
dancd11fb22010-04-26 10:40:52 +0000955 u32 *aData;
drh7ed91f22010-04-29 22:34:07 +0000956 int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00);
dan7c246102010-04-12 19:00:29 +0000957
dan1bc61712010-04-30 10:24:54 +0000958 assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
drh5530b762010-04-30 14:39:50 +0000959 walIndexMap(pWal, -1);
dancd11fb22010-04-26 10:40:52 +0000960
dan7c246102010-04-12 19:00:29 +0000961 /* Do a linear search of the unindexed block of page-numbers (if any)
drh7ed91f22010-04-29 22:34:07 +0000962 ** at the end of the wal-index. An alternative to this would be to
dan7c246102010-04-12 19:00:29 +0000963 ** build an index in private memory each time a read transaction is
964 ** opened on a new snapshot.
965 */
drh7ed91f22010-04-29 22:34:07 +0000966 aData = pWal->pWiData;
967 if( pWal->hdr.iLastPg ){
968 u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)];
969 u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF);
dan7c246102010-04-12 19:00:29 +0000970 while( *pi!=pgno && pi!=piStop ) pi--;
971 if( pi!=piStop ){
972 iRead = (pi-piStop) + iFrame;
973 }
974 }
drh7ed91f22010-04-29 22:34:07 +0000975 assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
dan7c246102010-04-12 19:00:29 +0000976
977 while( iRead==0 && iFrame>0 ){
978 int iLow = 0;
979 int iHigh = 255;
980 u32 *aFrame;
981 u8 *aIndex;
982
983 iFrame -= 256;
drh7ed91f22010-04-29 22:34:07 +0000984 aFrame = &aData[walIndexEntry(iFrame+1)];
dan7c246102010-04-12 19:00:29 +0000985 aIndex = (u8 *)&aFrame[256];
986
987 while( iLow<=iHigh ){
988 int iTest = (iLow+iHigh)>>1;
989 u32 iPg = aFrame[aIndex[iTest]];
990
991 if( iPg==pgno ){
992 iRead = iFrame + 1 + aIndex[iTest];
993 break;
994 }
995 else if( iPg<pgno ){
996 iLow = iTest+1;
997 }else{
998 iHigh = iTest-1;
999 }
1000 }
1001 }
drh7ed91f22010-04-29 22:34:07 +00001002 assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
1003 walIndexUnmap(pWal);
dancd11fb22010-04-26 10:40:52 +00001004
dan7c246102010-04-12 19:00:29 +00001005 /* If iRead is non-zero, then it is the log frame number that contains the
1006 ** required page. Read and return data from the log file.
1007 */
1008 if( iRead ){
drh7ed91f22010-04-29 22:34:07 +00001009 i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
1010 *pInWal = 1;
1011 return sqlite3OsRead(pWal->pFd, pOut, pWal->hdr.pgsz, iOffset);
dan7c246102010-04-12 19:00:29 +00001012 }
1013
drh7ed91f22010-04-29 22:34:07 +00001014 *pInWal = 0;
dan7c246102010-04-12 19:00:29 +00001015 return SQLITE_OK;
1016}
1017
1018
1019/*
1020** Set *pPgno to the size of the database file (or zero, if unknown).
1021*/
drh7ed91f22010-04-29 22:34:07 +00001022void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
1023 assert( pWal->lockState==SQLITE_SHM_READ
1024 || pWal->lockState==SQLITE_SHM_WRITE );
1025 *pPgno = pWal->hdr.nPage;
dan7c246102010-04-12 19:00:29 +00001026}
1027
1028/*
dan7c246102010-04-12 19:00:29 +00001029** This function returns SQLITE_OK if the caller may write to the database.
1030** Otherwise, if the caller is operating on a snapshot that has already
dan49320f82010-04-14 18:50:08 +00001031** been overwritten by another writer, SQLITE_BUSY is returned.
dan7c246102010-04-12 19:00:29 +00001032*/
drh7ed91f22010-04-29 22:34:07 +00001033int sqlite3WalWriteLock(Wal *pWal, int op){
1034 int rc;
dan7c246102010-04-12 19:00:29 +00001035 if( op ){
drh7ed91f22010-04-29 22:34:07 +00001036 assert( pWal->lockState == SQLITE_SHM_READ );
1037 rc = walSetLock(pWal, SQLITE_SHM_WRITE);
dan30c86292010-04-30 16:24:46 +00001038
1039 /* If this connection is not reading the most recent database snapshot,
1040 ** it is not possible to write to the database. In this case release
1041 ** the write locks and return SQLITE_BUSY.
1042 */
1043 if( rc==SQLITE_OK ){
1044 rc = walIndexMap(pWal, -1);
1045 if( rc==SQLITE_OK
1046 && memcmp(&pWal->hdr, pWal->pWiData, sizeof(WalIndexHdr))
1047 ){
1048 rc = SQLITE_BUSY;
1049 }
1050 walIndexUnmap(pWal);
1051 if( rc!=SQLITE_OK ){
1052 walSetLock(pWal, SQLITE_SHM_READ);
1053 }
1054 }
drh7ed91f22010-04-29 22:34:07 +00001055 }else if( pWal->lockState==SQLITE_SHM_WRITE ){
1056 rc = walSetLock(pWal, SQLITE_SHM_READ);
dan7c246102010-04-12 19:00:29 +00001057 }
drh7ed91f22010-04-29 22:34:07 +00001058 return rc;
dan7c246102010-04-12 19:00:29 +00001059}
1060
dan74d6cd82010-04-24 18:44:05 +00001061/*
drh7ed91f22010-04-29 22:34:07 +00001062** The Wal object passed to this function must be holding the write-lock.
dan74d6cd82010-04-24 18:44:05 +00001063**
1064** If any data has been written (but not committed) to the log file, this
1065** function moves the write-pointer back to the start of the transaction.
1066**
1067** Additionally, the callback function is invoked for each frame written
1068** to the log since the start of the transaction. If the callback returns
1069** other than SQLITE_OK, it is not invoked again and the error code is
1070** returned to the caller.
1071**
1072** Otherwise, if the callback function does not return an error, this
1073** function returns SQLITE_OK.
1074*/
drh7ed91f22010-04-29 22:34:07 +00001075int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
dan74d6cd82010-04-24 18:44:05 +00001076 int rc = SQLITE_OK;
drh7ed91f22010-04-29 22:34:07 +00001077 Pgno iMax = pWal->hdr.iLastPg;
dan74d6cd82010-04-24 18:44:05 +00001078 Pgno iFrame;
1079
drh7ed91f22010-04-29 22:34:07 +00001080 assert( pWal->lockState==SQLITE_SHM_WRITE );
1081 walIndexReadHdr(pWal, 0);
1082 for(iFrame=pWal->hdr.iLastPg+1; iFrame<=iMax && rc==SQLITE_OK; iFrame++){
1083 rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
dan74d6cd82010-04-24 18:44:05 +00001084 }
drh7ed91f22010-04-29 22:34:07 +00001085 walIndexUnmap(pWal);
dan74d6cd82010-04-24 18:44:05 +00001086 return rc;
1087}
1088
drh7ed91f22010-04-29 22:34:07 +00001089/* Return an integer that records the current (uncommitted) write
1090** position in the WAL
1091*/
1092u32 sqlite3WalSavepoint(Wal *pWal){
1093 assert( pWal->lockState==SQLITE_SHM_WRITE );
1094 return pWal->hdr.iLastPg;
dan4cd78b42010-04-26 16:57:10 +00001095}
1096
drh7ed91f22010-04-29 22:34:07 +00001097/* Move the write position of the WAL back to iFrame. Called in
1098** response to a ROLLBACK TO command.
1099*/
1100int sqlite3WalSavepointUndo(Wal *pWal, u32 iFrame){
dan4cd78b42010-04-26 16:57:10 +00001101 int rc = SQLITE_OK;
1102 u8 aCksum[8];
drh7ed91f22010-04-29 22:34:07 +00001103 assert( pWal->lockState==SQLITE_SHM_WRITE );
dan4cd78b42010-04-26 16:57:10 +00001104
drh7ed91f22010-04-29 22:34:07 +00001105 pWal->hdr.iLastPg = iFrame;
dan4cd78b42010-04-26 16:57:10 +00001106 if( iFrame>0 ){
drh7ed91f22010-04-29 22:34:07 +00001107 i64 iOffset = walFrameOffset(iFrame, pWal->hdr.pgsz) + sizeof(u32)*2;
1108 rc = sqlite3OsRead(pWal->pFd, aCksum, sizeof(aCksum), iOffset);
1109 pWal->hdr.iCheck1 = sqlite3Get4byte(&aCksum[0]);
1110 pWal->hdr.iCheck2 = sqlite3Get4byte(&aCksum[4]);
dan4cd78b42010-04-26 16:57:10 +00001111 }
1112
1113 return rc;
1114}
1115
dan7c246102010-04-12 19:00:29 +00001116/*
dan4cd78b42010-04-26 16:57:10 +00001117** Write a set of frames to the log. The caller must hold the write-lock
1118** on the log file (obtained using sqlite3WalWriteLock()).
dan7c246102010-04-12 19:00:29 +00001119*/
drhc438efd2010-04-26 00:19:45 +00001120int sqlite3WalFrames(
drh7ed91f22010-04-29 22:34:07 +00001121 Wal *pWal, /* Wal handle to write to */
dan7c246102010-04-12 19:00:29 +00001122 int nPgsz, /* Database page-size in bytes */
1123 PgHdr *pList, /* List of dirty pages to write */
1124 Pgno nTruncate, /* Database size after this commit */
1125 int isCommit, /* True if this is a commit */
danc5118782010-04-17 17:34:41 +00001126 int sync_flags /* Flags to pass to OsSync() (or 0) */
dan7c246102010-04-12 19:00:29 +00001127){
dan7c246102010-04-12 19:00:29 +00001128 int rc; /* Used to catch return codes */
1129 u32 iFrame; /* Next frame address */
drh7ed91f22010-04-29 22:34:07 +00001130 u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
dan7c246102010-04-12 19:00:29 +00001131 PgHdr *p; /* Iterator to run through pList with. */
dan97a31352010-04-16 13:59:31 +00001132 u32 aCksum[2]; /* Checksums */
dan7c246102010-04-12 19:00:29 +00001133 PgHdr *pLast; /* Last frame in list */
1134 int nLast = 0; /* Number of extra copies of last page */
1135
drh7ed91f22010-04-29 22:34:07 +00001136 assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
dan7c246102010-04-12 19:00:29 +00001137 assert( pList );
drh7ed91f22010-04-29 22:34:07 +00001138 assert( pWal->lockState==SQLITE_SHM_WRITE );
danba515902010-04-30 09:32:06 +00001139 assert( pWal->pWiData==0 );
dan7c246102010-04-12 19:00:29 +00001140
dan97a31352010-04-16 13:59:31 +00001141 /* If this is the first frame written into the log, write the log
1142 ** header to the start of the log file. See comments at the top of
1143 ** this file for a description of the log-header format.
1144 */
drh7ed91f22010-04-29 22:34:07 +00001145 assert( WAL_FRAME_HDRSIZE>=WAL_HDRSIZE );
1146 iFrame = pWal->hdr.iLastPg;
dan97a31352010-04-16 13:59:31 +00001147 if( iFrame==0 ){
1148 sqlite3Put4byte(aFrame, nPgsz);
1149 sqlite3_randomness(8, &aFrame[4]);
drh7ed91f22010-04-29 22:34:07 +00001150 pWal->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
1151 pWal->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
1152 rc = sqlite3OsWrite(pWal->pFd, aFrame, WAL_HDRSIZE, 0);
dan97a31352010-04-16 13:59:31 +00001153 if( rc!=SQLITE_OK ){
1154 return rc;
1155 }
1156 }
1157
drh7ed91f22010-04-29 22:34:07 +00001158 aCksum[0] = pWal->hdr.iCheck1;
1159 aCksum[1] = pWal->hdr.iCheck2;
dan7c246102010-04-12 19:00:29 +00001160
1161 /* Write the log file. */
dan7c246102010-04-12 19:00:29 +00001162 for(p=pList; p; p=p->pDirty){
1163 u32 nDbsize; /* Db-size field for frame header */
1164 i64 iOffset; /* Write offset in log file */
1165
drh7ed91f22010-04-29 22:34:07 +00001166 iOffset = walFrameOffset(++iFrame, nPgsz);
dan7c246102010-04-12 19:00:29 +00001167
1168 /* Populate and write the frame header */
1169 nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
drh7ed91f22010-04-29 22:34:07 +00001170 walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
1171 rc = sqlite3OsWrite(pWal->pFd, aFrame, sizeof(aFrame), iOffset);
dan7c246102010-04-12 19:00:29 +00001172 if( rc!=SQLITE_OK ){
1173 return rc;
1174 }
1175
1176 /* Write the page data */
drh7ed91f22010-04-29 22:34:07 +00001177 rc = sqlite3OsWrite(pWal->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
dan7c246102010-04-12 19:00:29 +00001178 if( rc!=SQLITE_OK ){
1179 return rc;
1180 }
1181 pLast = p;
1182 }
1183
1184 /* Sync the log file if the 'isSync' flag was specified. */
danc5118782010-04-17 17:34:41 +00001185 if( sync_flags ){
drh7ed91f22010-04-29 22:34:07 +00001186 i64 iSegment = sqlite3OsSectorSize(pWal->pFd);
1187 i64 iOffset = walFrameOffset(iFrame+1, nPgsz);
dan67032392010-04-17 15:42:43 +00001188
1189 assert( isCommit );
dan7c246102010-04-12 19:00:29 +00001190
1191 if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
1192 iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
1193 }
1194 iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
1195 while( iOffset<iSegment ){
drh7ed91f22010-04-29 22:34:07 +00001196 walEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
1197 rc = sqlite3OsWrite(pWal->pFd, aFrame, sizeof(aFrame), iOffset);
dan7c246102010-04-12 19:00:29 +00001198 if( rc!=SQLITE_OK ){
1199 return rc;
1200 }
1201
drh7ed91f22010-04-29 22:34:07 +00001202 iOffset += WAL_FRAME_HDRSIZE;
1203 rc = sqlite3OsWrite(pWal->pFd, pLast->pData, nPgsz, iOffset);
dan7c246102010-04-12 19:00:29 +00001204 if( rc!=SQLITE_OK ){
1205 return rc;
1206 }
1207 nLast++;
1208 iOffset += nPgsz;
1209 }
dan7c246102010-04-12 19:00:29 +00001210
drh7ed91f22010-04-29 22:34:07 +00001211 rc = sqlite3OsSync(pWal->pFd, sync_flags);
dan7c246102010-04-12 19:00:29 +00001212 if( rc!=SQLITE_OK ){
1213 return rc;
1214 }
1215 }
danba515902010-04-30 09:32:06 +00001216 assert( pWal->pWiData==0 );
dan7c246102010-04-12 19:00:29 +00001217
1218 /* Append data to the log summary. It is not necessary to lock the
drh7ed91f22010-04-29 22:34:07 +00001219 ** wal-index to do this as the RESERVED lock held on the db file
dan7c246102010-04-12 19:00:29 +00001220 ** guarantees that there are no other writers, and no data that may
1221 ** be in use by existing readers is being overwritten.
1222 */
drh7ed91f22010-04-29 22:34:07 +00001223 iFrame = pWal->hdr.iLastPg;
dan7c246102010-04-12 19:00:29 +00001224 for(p=pList; p; p=p->pDirty){
1225 iFrame++;
drh7ed91f22010-04-29 22:34:07 +00001226 walIndexAppend(pWal, iFrame, p->pgno);
dan7c246102010-04-12 19:00:29 +00001227 }
1228 while( nLast>0 ){
1229 iFrame++;
1230 nLast--;
drh7ed91f22010-04-29 22:34:07 +00001231 walIndexAppend(pWal, iFrame, pLast->pgno);
dan7c246102010-04-12 19:00:29 +00001232 }
1233
1234 /* Update the private copy of the header. */
drh7ed91f22010-04-29 22:34:07 +00001235 pWal->hdr.pgsz = nPgsz;
1236 pWal->hdr.iLastPg = iFrame;
dan7c246102010-04-12 19:00:29 +00001237 if( isCommit ){
drh7ed91f22010-04-29 22:34:07 +00001238 pWal->hdr.iChange++;
1239 pWal->hdr.nPage = nTruncate;
dan7c246102010-04-12 19:00:29 +00001240 }
drh7ed91f22010-04-29 22:34:07 +00001241 pWal->hdr.iCheck1 = aCksum[0];
1242 pWal->hdr.iCheck2 = aCksum[1];
dan7c246102010-04-12 19:00:29 +00001243
drh7ed91f22010-04-29 22:34:07 +00001244 /* If this is a commit, update the wal-index header too. */
1245 if( isCommit ){
1246 walIndexWriteHdr(pWal, &pWal->hdr);
1247 pWal->iCallback = iFrame;
dan7c246102010-04-12 19:00:29 +00001248 }
drh7ed91f22010-04-29 22:34:07 +00001249 walIndexUnmap(pWal);
dan7c246102010-04-12 19:00:29 +00001250
dan8d22a172010-04-19 18:03:51 +00001251 return rc;
dan7c246102010-04-12 19:00:29 +00001252}
1253
1254/*
danb9bf16b2010-04-14 11:23:30 +00001255** Checkpoint the database:
1256**
drh7ed91f22010-04-29 22:34:07 +00001257** 1. Acquire a CHECKPOINT lock
1258** 2. Copy the contents of the log into the database file.
1259** 3. Zero the wal-index header (so new readers will ignore the log).
1260** 4. Drop the CHECKPOINT lock.
dan7c246102010-04-12 19:00:29 +00001261*/
drhc438efd2010-04-26 00:19:45 +00001262int sqlite3WalCheckpoint(
drh7ed91f22010-04-29 22:34:07 +00001263 Wal *pWal, /* Wal connection */
dan7c246102010-04-12 19:00:29 +00001264 sqlite3_file *pFd, /* File descriptor open on db file */
danc5118782010-04-17 17:34:41 +00001265 int sync_flags, /* Flags to sync db file with (or 0) */
dan64d039e2010-04-13 19:27:31 +00001266 u8 *zBuf, /* Temporary buffer to use */
1267 int (*xBusyHandler)(void *), /* Pointer to busy-handler function */
1268 void *pBusyHandlerArg /* Argument to pass to xBusyHandler */
dan7c246102010-04-12 19:00:29 +00001269){
danb9bf16b2010-04-14 11:23:30 +00001270 int rc; /* Return code */
dan31c03902010-04-29 14:51:33 +00001271 int isChanged = 0; /* True if a new wal-index header is loaded */
dan7c246102010-04-12 19:00:29 +00001272
drh7ed91f22010-04-29 22:34:07 +00001273 assert( pWal->lockState==SQLITE_SHM_UNLOCK );
dan39c79f52010-04-15 10:58:51 +00001274
drh7ed91f22010-04-29 22:34:07 +00001275 /* Get the CHECKPOINT lock */
dan64d039e2010-04-13 19:27:31 +00001276 do {
drh7ed91f22010-04-29 22:34:07 +00001277 rc = walSetLock(pWal, SQLITE_SHM_CHECKPOINT);
dan64d039e2010-04-13 19:27:31 +00001278 }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
danb9bf16b2010-04-14 11:23:30 +00001279 if( rc!=SQLITE_OK ){
drh7ed91f22010-04-29 22:34:07 +00001280 walSetLock(pWal, SQLITE_SHM_UNLOCK);
danb9bf16b2010-04-14 11:23:30 +00001281 return rc;
1282 }
dan64d039e2010-04-13 19:27:31 +00001283
danb9bf16b2010-04-14 11:23:30 +00001284 /* Copy data from the log to the database file. */
drh7ed91f22010-04-29 22:34:07 +00001285 rc = walIndexReadHdr(pWal, &isChanged);
danb9bf16b2010-04-14 11:23:30 +00001286 if( rc==SQLITE_OK ){
drh7ed91f22010-04-29 22:34:07 +00001287 rc = walCheckpoint(pWal, pFd, sync_flags, zBuf);
danb9bf16b2010-04-14 11:23:30 +00001288 }
dan31c03902010-04-29 14:51:33 +00001289 if( isChanged ){
1290 /* If a new wal-index header was loaded before the checkpoint was
drh7ed91f22010-04-29 22:34:07 +00001291 ** performed, then the pager-cache associated with log pWal is now
dan31c03902010-04-29 14:51:33 +00001292 ** out of date. So zero the cached wal-index header to ensure that
1293 ** next time the pager opens a snapshot on this database it knows that
1294 ** the cache needs to be reset.
1295 */
drh7ed91f22010-04-29 22:34:07 +00001296 memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
dan31c03902010-04-29 14:51:33 +00001297 }
danb9bf16b2010-04-14 11:23:30 +00001298
1299 /* Release the locks. */
dan87bfb512010-04-30 11:43:28 +00001300 walIndexUnmap(pWal);
drh7ed91f22010-04-29 22:34:07 +00001301 walSetLock(pWal, SQLITE_SHM_UNLOCK);
dan64d039e2010-04-13 19:27:31 +00001302 return rc;
dan7c246102010-04-12 19:00:29 +00001303}
1304
drh7ed91f22010-04-29 22:34:07 +00001305/* Return the value to pass to a sqlite3_wal_hook callback, the
1306** number of frames in the WAL at the point of the last commit since
1307** sqlite3WalCallback() was called. If no commits have occurred since
1308** the last call, then return 0.
1309*/
1310int sqlite3WalCallback(Wal *pWal){
dan8d22a172010-04-19 18:03:51 +00001311 u32 ret = 0;
drh7ed91f22010-04-29 22:34:07 +00001312 if( pWal ){
1313 ret = pWal->iCallback;
1314 pWal->iCallback = 0;
dan8d22a172010-04-19 18:03:51 +00001315 }
1316 return (int)ret;
1317}