/* src/wal.c - chromium.googlesource.com/chromium/deps/sqlite - Gitiles */


 /*
 ** This file contains the implementation of a log file used in
 ** "journal_mode=wal" mode.
 */

 /*
 ** LOG FILE FORMAT
 **
 ** A log file consists of a header followed by zero or more log frames.
 ** The log header is 12 bytes in size and consists of the following three
 ** big-endian 32-bit unsigned integer values:
 **
 **     0: Database page size,
 **     4: Randomly selected salt value 1,
 **     8: Randomly selected salt value 2.
 **
 ** Immediately following the log header are zero or more log frames. Each
 ** frame itself consists of a 16-byte header followed by <page-size> bytes
 ** of page data. The header is broken into 4 big-endian 32-bit unsigned
 ** integer values, as follows:
 **
 **     0: Page number.
 **     4: For commit records, the size of the database image in pages
 **        after the commit. For all other records, zero.
 **     8: Checksum value 1.
 **    12: Checksum value 2.
 */

 /*
 ** LOG SUMMARY FILE FORMAT
 **
 ** The log-summary file consists of a header region, followed by a
 ** region that contains no useful data (used to apply byte-range locks
 ** to), followed by the data region.
 **
 ** The contents of both the header and data region are specified in terms
 ** of 1, 2 and 4 byte unsigned integers. All integers are stored in
 ** machine-endian order.
 **
 ** A log-summary file is essentially a shadow-pager map. It contains a
 ** mapping from database page number to the set of locations in the log
 ** file that contain versions of the database page. When a database
 ** client needs to read a page of data, it first queries the log-summary
 ** file to determine if the required version of the page is stored in
 ** the log. If so, it is read from the log file. If not, it is read from
 ** the database file.
 **
 ** Whenever a transaction is appended to the log or a checkpoint transfers
 ** data from the log file into the database file, the log-summary is
 ** updated accordingly.
 **
 ** The fields in the log-summary file header are described in the comment
 ** directly above the definition of struct LogSummaryHdr (see below).
 ** Immediately following the fields in the LogSummaryHdr structure is
 ** an 8 byte checksum based on the contents of the header. This field is
 ** not the same as the iCheck1 and iCheck2 fields of the LogSummaryHdr.
 */

 #include "wal.h"

 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/mman.h>

 typedef struct LogSummaryHdr LogSummaryHdr;
 typedef struct LogSummary LogSummary;
 typedef struct LogIterator LogIterator;
 typedef struct LogLock LogLock;


 /*
 ** The following structure may be used to store the same data that
 ** is stored in the log-summary header.
 **
 ** Member variables iCheck1 and iCheck2 contain the checksum for the
 ** last frame written to the log, or 2 and 3 respectively if the log
 ** is currently empty.
 */
 struct LogSummaryHdr {
   /* This structure is copied verbatim (memcpy) to the start of the
   ** log-summary mapping by logSummaryWriteHdr(), and LOGSUMMARY_HDR_NFIELD
   ** assumes it is made up of u32 fields only. */
   u32 iChange;                    /* Counter incremented each transaction */
   u32 pgsz;                       /* Database page size in bytes */
   u32 iLastPg;                    /* Frame number of last valid frame in log */
   u32 nPage;                      /* Size of database in pages */
   u32 iCheck1;                    /* Checksum value 1 of last frame in log */
   u32 iCheck2;                    /* Checksum value 2 of last frame in log */
 };

 /* Number of 32-bit fields in a serialized LogSummaryHdr object. */
 #define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32))

 /* A block of 16 bytes beginning at LOGSUMMARY_LOCK_OFFSET is reserved
 ** for locks. Since some systems only feature mandatory file-locks, we
 ** do not read or write data from the region of the file on which locks
 ** are applied.
 **
 ** The offset is the size of the header plus two u32 values, i.e. the
 ** lock region starts directly after the 8 byte header checksum that
 ** follows the LogSummaryHdr fields.
 */
 #define LOGSUMMARY_LOCK_OFFSET   ((sizeof(LogSummaryHdr))+2*sizeof(u32))
 #define LOGSUMMARY_LOCK_RESERVED 16

 /* Size of header before each frame in log file */
 #define LOG_FRAME_HDRSIZE 16

 /* Size of log header */
 #define LOG_HDRSIZE 12

 /*
 ** Return the offset of frame iFrame in the log file, assuming a database
 ** page size of pgsz bytes. The offset returned is to the start of the
 ** log frame-header. Frames are numbered starting from 1.
 **
 ** The multiplication is carried out using a 64-bit signed integer (i64)
 ** so that the result does not wrap around when the frame lies more than
 ** 4GiB into the log file. The value of the expression has type i64.
 */
 #define logFrameOffset(iFrame, pgsz) (                               \
   LOG_HDRSIZE + ((iFrame)-1)*(i64)((pgsz)+LOG_FRAME_HDRSIZE)        \
 )

 /*
 ** If using mmap() to access a shared (or otherwise) log-summary file, then
 ** the mapping size is incremented in units of the following size (bytes).
 **
 ** A 64 KB log-summary mapping corresponds to a log file containing over
 ** 13000 frames, so the mapping size does not need to be increased often.
 ** (Each group of 256 frames uses 256 four-byte page-number slots plus
 ** a 64 slot index - see logSummaryEntry().)
 */
 #define LOGSUMMARY_MMAP_INCREMENT (64*1024)

 /*
 ** There is one instance of this structure for each log-summary object
 ** that this process has a connection to. They are stored in a linked
 ** list starting at pLogSummary (global variable).
 **
 ** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used
 **       directly in this implementation because the VFS does not support
 **       the required blocking file-locks.
 */
 struct LogSummary {
   sqlite3_mutex *mutex;           /* Mutex used to protect this object */
   int nRef;                       /* Number of pointers to this structure */
   int fd;                         /* File descriptor open on log-summary */
   char *zPath;                    /* Path to associated WAL file */
   LogLock *pLock;                 /* Linked list of locks on this object */
   LogSummary *pNext;              /* Next in global list */

   /* The summary file name is derived from zPath by appending "-summary"
   ** (see logSummaryUnmap()). fd is set to -1 once it has been closed. */

   int nData;                      /* Size of aData mapping in u32 elements */
   u32 *aData;                     /* mmap() of log-summary file (or 0) */
 };

 /*
 ** This module uses three different types of file-locks. All are taken
 ** on the log-summary file. The three types of locks are as follows:
 **
 ** MUTEX:  The MUTEX lock is used as a robust inter-process mutex. It
 **         is held while the log-summary header is modified, and
 **         sometimes when it is read. It is also held while a new client
 **         obtains the DMH lock (see below), and while log recovery is
 **         being run.
 **
 ** DMH:    The DMH (Dead Mans Hand mechanism) lock is used to ensure
 **         that log-recovery is always run following a system restart.
 **         When it first opens a log-summary file, a process takes a
 **         SHARED lock on the DMH region. This lock is not released until
 **         the log-summary file is closed.
 **
 **         The process then attempts to upgrade to an EXCLUSIVE lock. If
 **         successful, then the contents of the log-summary file are deemed
 **         suspect and the log-summary header zeroed. This forces the
 **         first process that reads the log-summary file to run log
 **         recovery. After zeroing the log-summary header, the process
 **         downgrades to a SHARED lock on the DMH region.
 **
 **         If the attempt to obtain the EXCLUSIVE lock fails, then the
 **         process concludes that some other process is already using the
 **         log-summary file, and it can therefore be trusted.
 **
 **         The procedure described in the previous three paragraphs (taking
 **         a SHARED lock and then upgrading to an EXCLUSIVE lock to check
 **         if the process is the only one to have an open connection to the
 **         log file) is protected by holding the MUTEX lock. This avoids the
 **         race condition wherein the first two clients connect almost
 **         simultaneously following a system restart and each prevents
 **         the other from obtaining the EXCLUSIVE lock.
 **
 **
 ** REGION: There are 4 different region locks, regions A, B, C and D.
 **         Various EXCLUSIVE and SHARED locks on these regions are obtained
 **         when a client reads, writes or checkpoints the database.
 **
 **    To obtain a reader lock:
 **
 **         1. Attempt a SHARED lock on regions A and B.
 **         2. If step 1 is successful, drop the lock on region B. Or, if
 **            it is unsuccessful, attempt a SHARED lock on region D.
 **         3. Repeat the above until the lock attempt in step 1 or 2 is
 **            successful.
 **
 **         The reader lock is released when the read transaction is finished.
 **
 **    To obtain a writer lock:
 **
 **         1. Take (wait for) an EXCLUSIVE lock on regions C and D.
 **
 **         The locks are released after the write transaction is finished
 **         and, if any frames were committed to the log, the log-summary
 **         file updated.
 **
 **    To obtain a checkpointer lock:
 **
 **         1. Take (wait for) an EXCLUSIVE lock on regions B and C.
 **         2. Take (wait for) an EXCLUSIVE lock on region A.
 **
 **         Step 1 waits until any existing writer has finished. And forces
 **         all new readers to become "region D" readers.
 **
 **         Step 2 causes the checkpointer to wait until all existing region A
 **         readers have finished their transactions. Once the exclusive lock
 **         on region A has been obtained, only "region D" readers exist.
 **         These readers are operating on the snapshot at the head of the
 **         log. As such, the log can be safely copied into the database file
 **         without interfering with the readers.
 **
 **         Once the checkpoint has finished and the log-summary header
 **         updated (to indicate the log contents can now be ignored), all
 **         locks are released.
 **
 **         However, there may still exist region D readers using data in
 **         the body of the log file, so the log file itself cannot be
 **         truncated or overwritten until all region D readers have finished.
 **         That requirement is satisfied, because writers (the clients that
 **         write to the log file) require an exclusive lock on region D.
 **         Which they cannot get until all region D readers have finished.
 */
 /* Byte offsets within the log-summary file of the three lock types. The
 ** MUTEX and DMH locks use one byte each. LOG_LOCK_REGION is the offset of
 ** the first of the bytes used for the region locks. */
 #define LOG_LOCK_MUTEX  (LOGSUMMARY_LOCK_OFFSET)
 #define LOG_LOCK_DMH    (LOG_LOCK_MUTEX+1)
 #define LOG_LOCK_REGION (LOG_LOCK_DMH+1)

 /*
 ** The four lockable regions associated with each log-summary. A connection
 ** may take either a SHARED or EXCLUSIVE lock on each. An ORed combination
 ** of the following bitmasks is passed as the second argument to the
 ** logLockRegion() function.
 */
 /* The byte locked for each region is given relative to LOG_LOCK_REGION
 ** (see the aMap[] table in logLockRegion()). Regions that are locked
 ** together occupy adjacent bytes. */
 #define LOG_REGION_A 0x01         /* Byte LOG_LOCK_REGION+3 */
 #define LOG_REGION_B 0x02         /* Byte LOG_LOCK_REGION+2 */
 #define LOG_REGION_C 0x04         /* Byte LOG_LOCK_REGION+1 */
 #define LOG_REGION_D 0x08         /* Byte LOG_LOCK_REGION+0 */

 /*
 ** Values for the third parameter to logLockRegion().
 */
 /* These values are used directly as array indexes by logLockFd(). Only
 ** LOG_WRLOCKW is issued as a blocking request (F_SETLKW); the other
 ** three are issued with F_SETLK. */
 #define LOG_UNLOCK  0             /* Unlock a range of bytes */
 #define LOG_RDLOCK  1             /* Put a SHARED lock on a range of bytes */
 #define LOG_WRLOCK  2             /* Put an EXCLUSIVE lock on a byte-range */
 #define LOG_WRLOCKW 3             /* Block on EXCLUSIVE lock on a byte-range */

 /*
 ** A single instance of this structure is allocated as part of each
 ** connection to a database log. All structures associated with the
 ** same log file are linked together into a list using LogLock.pNext
 ** starting at LogSummary.pLock.
 **
 ** The mLock field of the structure describes the locks (if any)
 ** currently held by the connection. If a SHARED lock is held on
 ** any of the four locking regions, then the associated LOG_REGION_X
 ** bit (see above) is set. If an EXCLUSIVE lock is held on the region,
 ** then the (LOG_REGION_X << 8) bit is set.
 */
 struct LogLock {
   LogLock *pNext;                 /* Next lock on the same log */
   u32 mLock;                      /* Mask of locks. Bits 0-7: regions locked
                                   ** SHARED. Bits 8-15: regions locked
                                   ** EXCLUSIVE (LOG_REGION_X << 8). */
 };

 /*
 ** State held for a single connection to a database log. The embedded
 ** LogLock object (Log.lock) records the region locks held by this
 ** connection. The log-summary object (Log.pSummary) is shared with the
 ** other connections in this process to the same log.
 */
 struct Log {
   LogSummary *pSummary;           /* Log file summary data */
   sqlite3_vfs *pVfs;              /* The VFS used to create pFd */
   sqlite3_file *pFd;              /* File handle for log file */
   int isLocked;                   /* Non-zero if a snapshot is held open */
   int isWriteLocked;              /* True if this is the writer connection */
   u32 iCallback;                  /* Value to pass to log callback (or 0) */
   LogSummaryHdr hdr;              /* Log summary header for current snapshot */
   LogLock lock;                   /* Lock held by this connection (if any) */
 };


 /*
 ** This structure is used to implement an iterator that iterates through
 ** all frames in the log in database page order. Where two or more frames
 ** correspond to the same database page, the iterator visits only the
 ** frame most recently written to the log.
 **
 ** The internals of this structure are only accessed by:
 **
 **   logIteratorInit() - Create a new iterator,
 **   logIteratorNext() - Step an iterator,
 **   logIteratorFree() - Free an iterator.
 **
 ** This functionality is used by the checkpoint code (see logCheckpoint()).
 */
 struct LogIterator {
   int nSegment;                   /* Size of LogIterator.aSegment[] array */
   int nFinal;                     /* Elements in segment nSegment-1 */
   struct LogSegment {
     int iNext;                    /* Next aIndex index */
     u8 *aIndex;                   /* Pointer to index array */
     u32 *aDbPage;                 /* Pointer to db page array */
   } aSegment[1];                  /* Declared with one element; nSegment is
                                   ** the real size (NOTE(review): presumably
                                   ** over-allocated by logIteratorInit() -
                                   ** confirm against that function) */
 };


 /*
 ** List of all LogSummary objects created by this process. Protected by
 ** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex
 ** here instead of borrowing the LRU mutex.
 **
 ** The list is linked together using LogSummary.pNext. pLogSummary is
 ** the head of the list and is initially 0 (empty list).
 */
 #define LOG_SUMMARY_MUTEX SQLITE_MUTEX_STATIC_LRU
 static LogSummary *pLogSummary = 0;

 /*
 ** Generate an 8 byte checksum based on the data in array aByte[] and the
 ** initial values of aCksum[0] and aCksum[1]. The checksum is written into
 ** aCksum[] before returning.
 **
 ** The range of bytes to checksum is treated as an array of 32-bit
 ** little-endian unsigned integers. For each integer X in the array, from
 ** start to finish, do the following:
 **
 **   aCksum[0] += X;
 **   aCksum[1] += aCksum[0];
 **
 ** For the calculation above, use 64-bit unsigned accumulators. Before
 ** returning, truncate the values to 32-bits as follows:
 **
 **   aCksum[0] = (u32)(aCksum[0] + (aCksum[0]>>24));
 **   aCksum[1] = (u32)(aCksum[1] + (aCksum[1]>>24));
 **
 ** Parameter nByte must be a multiple of 4. If nByte is zero, no bytes
 ** are read from aByte[] and only the truncation step is applied.
 */
 static void logChecksumBytes(u8 *aByte, int nByte, u32 *aCksum){
   u64 sum1 = aCksum[0];
   u64 sum2 = aCksum[1];
   u32 *a32 = (u32 *)aByte;
   u32 *aEnd = (u32 *)&aByte[nByte];

   assert( (nByte&0x00000003)==0 );

   if( SQLITE_LITTLEENDIAN ){
 #ifdef SQLITE_DEBUG
     if( a32<aEnd ){
       u8 *a = (u8 *)a32;
       assert( *a32==(
           (u32)a[0] | ((u32)a[1]<<8) | ((u32)a[2]<<16) | ((u32)a[3]<<24)
       ));
     }
 #endif
     while( a32<aEnd ){
       sum1 += *a32++;
       sum2 += sum1;
     }
   }else{
     while( a32<aEnd ){
       u8 *a = (u8 *)a32++;
       /* Each byte is cast to u32 before it is shifted. Otherwise the u8
       ** values are promoted to (signed) int, in which case shifting a
       ** byte of 0x80 or greater left by 24 bits overflows, and the
       ** negative result is sign-extended when it is added to the 64-bit
       ** accumulator - producing a checksum that differs from the one
       ** calculated on a little-endian host. */
       sum1 += (u32)a[0] | ((u32)a[1]<<8) | ((u32)a[2]<<16) | ((u32)a[3]<<24);
       sum2 += sum1;
     }
   }

   aCksum[0] = (u32)(sum1 + (sum1>>24));
   aCksum[1] = (u32)(sum2 + (sum2>>24));
 }

 /*
 ** Normalize the nul-terminated path-name zPath in place. The following
 ** transformations are applied while scanning from left to right:
 **
 **   * trailing '/' characters are discarded (a path is never shortened
 **     to less than one byte this way),
 **   * runs of '/' characters are collapsed to a single '/',
 **   * each "/./" element is removed,
 **   * each "/../" element is removed along with the path element that
 **     precedes it in the output.
 **
 ** For example:
 **
 **   "/home/user/plans/good/../evil/./world_domination.txt"
 **
 ** becomes:
 **
 **   "/home/user/plans/evil/world_domination.txt"
 **
 ** A "." or ".." element is only recognized if it is followed by a '/'
 ** that is still part of the path once trailing slashes are discarded.
 */
 static void logNormalizePath(char *zPath){
   int nLen = strlen(zPath);       /* Number of input bytes to scan */
   int iIn = 0;                    /* Offset of next byte to read */
   int iOut = 0;                   /* Offset of next byte to write */

   while( nLen>1 && zPath[nLen-1]=='/' ) nLen--;

   while( iIn<nLen ){
     if( zPath[iIn]=='/' ){
       char cNext = zPath[iIn+1];

       /* First '/' of a "//" pair. Drop it. */
       if( cNext=='/' ){
         iIn++;
         continue;
       }

       if( cNext=='.' ){
         /* A "/./" element. Skip the "/." and resume at the next '/'. */
         if( iIn+2<nLen && zPath[iIn+2]=='/' ){
           iIn += 2;
           continue;
         }

         /* A "/../" element. Back the output up over the previous path
         ** element and its leading '/', then resume at the next '/'. */
         if( iIn+3<nLen && zPath[iIn+2]=='.' && zPath[iIn+3]=='/' ){
           while( iOut>0 && zPath[iOut-1]!='/' ) iOut--;
           if( iOut>0 ) iOut--;
           iIn += 3;
           continue;
         }
       }
     }
     zPath[iOut++] = zPath[iIn++];
   }
   zPath[iOut] = 0;
 }

 /*
 ** Unmap the log-summary mapping (if there is one) and close the
 ** file-descriptor open on the log-summary file (if there is one).
 **
 ** If the isUnlink argument is non-zero and the file was mapped when this
 ** function was called, the log-summary file - "<zPath>-summary" - is
 ** also unlinked from the file-system.
 **
 ** SQLITE_OK is returned unless the name of the file to unlink cannot be
 ** allocated, in which case SQLITE_NOMEM is returned and the file is
 ** left in place. The mapping and file-descriptor are released either way.
 */
 static int logSummaryUnmap(LogSummary *pSummary, int isUnlink){
   int rc = SQLITE_OK;
   if( pSummary->aData ){
     assert( pSummary->fd>0 );
     /* LogSummary.nData is the size of the mapping in u32 elements (see
     ** logSummaryMap() and logSummaryAppend()), whereas munmap() requires
     ** a size in bytes. */
     munmap(pSummary->aData, pSummary->nData*sizeof(pSummary->aData[0]));
     pSummary->aData = 0;
     if( isUnlink ){
       char *zFile = sqlite3_mprintf("%s-summary", pSummary->zPath);
       if( zFile ){
         unlink(zFile);
         sqlite3_free(zFile);
       }else{
         /* Do not pass a NULL pointer to unlink(). */
         rc = SQLITE_NOMEM;
       }
     }
   }
   if( pSummary->fd>0 ){
     close(pSummary->fd);
     pSummary->fd = -1;
   }
   return rc;
 }

 /*
 ** Copy the header *pHdr to the start of the log-summary mapping, then
 ** write the two word header checksum immediately after it. The checksum
 ** is calculated over the header fields by logChecksumBytes(), starting
 ** with both checksum words set to 1.
 **
 ** The log-summary file must already be mapped (pSummary->aData!=0).
 */
 static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){
   u32 *aOut = pSummary->aData;                   /* Start of mapping */
   u32 *aSum = aOut + LOGSUMMARY_HDR_NFIELD;      /* Checksum follows header */

   assert( LOGSUMMARY_HDR_NFIELD==sizeof(LogSummaryHdr)/4 );

   memcpy(aOut, pHdr, sizeof(LogSummaryHdr));
   aSum[0] = 1;
   aSum[1] = 1;
   logChecksumBytes((u8 *)aOut, sizeof(LogSummaryHdr), aSum);
 }

 /*
 ** This function encodes a single frame header and writes it to a buffer
 ** supplied by the caller. A log frame-header is LOG_FRAME_HDRSIZE (16)
 ** bytes in size and is made up of four 4-byte big-endian integers, as
 ** follows:
 **
 **     0: Page number.
 **     4: New database size (for commit frames, otherwise zero).
 **     8: Frame checksum 1.
 **    12: Frame checksum 2.
 **
 ** The checksum is calculated by continuing the running checksum passed
 ** in via aCksum[] over the first 8 bytes of the frame header, followed by
 ** the nData bytes of page data. The updated checksum is both stored in
 ** the frame header and left in aCksum[] for the caller.
 */
 static void logEncodeFrame(
   u32 *aCksum,                    /* IN/OUT: Checksum values */
   u32 iPage,                      /* Database page number for frame */
   u32 nTruncate,                  /* New db size (or 0 for non-commit frames) */
   int nData,                      /* Database page size (size of aData[]) */
   u8 *aData,                      /* Pointer to page data (for checksum) */
   u8 *aFrame                      /* OUT: Write encoded frame here */
 ){
   u8 *aPgnoField = aFrame;        /* Page number field of header */
   u8 *aSizeField = aFrame + 4;    /* Database size field of header */
   u8 *aCksumField = aFrame + 8;   /* The two checksum fields of header */

   assert( LOG_FRAME_HDRSIZE==16 );

   /* The checksummed part of the header must be in place first. */
   sqlite3Put4byte(aPgnoField, iPage);
   sqlite3Put4byte(aSizeField, nTruncate);

   logChecksumBytes(aFrame, 8, aCksum);
   logChecksumBytes(aData, nData, aCksum);

   sqlite3Put4byte(aCksumField, aCksum[0]);
   sqlite3Put4byte(aCksumField + 4, aCksum[1]);
 }

 /*
 ** Verify the checksum of a single log frame and, if it is Ok, decode
 ** the frame header.
 **
 ** The running checksum in aCksum[] is continued over the first 8 bytes
 ** of the frame header and the nData bytes of page data. This happens
 ** whether or not the frame turns out to be valid. If the result matches
 ** the two checksum values stored in the frame header, *piPage and
 ** *pnTruncate are populated and 1 is returned. Otherwise, 0 is returned
 ** and the two output variables are left untouched.
 */
 static int logDecodeFrame(
   u32 *aCksum,                    /* IN/OUT: Checksum values */
   u32 *piPage,                    /* OUT: Database page number for frame */
   u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
   int nData,                      /* Database page size (size of aData[]) */
   u8 *aData,                      /* Pointer to page data (for checksum) */
   u8 *aFrame                      /* Frame data */
 ){
   int isMatch;                    /* True if stored checksum matches */

   assert( LOG_FRAME_HDRSIZE==16 );

   logChecksumBytes(aFrame, 8, aCksum);
   logChecksumBytes(aData, nData, aCksum);

   isMatch = ( aCksum[0]==sqlite3Get4byte(&aFrame[8])
            && aCksum[1]==sqlite3Get4byte(&aFrame[12]) );
   if( isMatch ){
     *piPage = sqlite3Get4byte(&aFrame[0]);
     *pnTruncate = sqlite3Get4byte(&aFrame[4]);
   }
   return isMatch;
 }

 /*
 ** Array aList[] contains *pnList one byte indexes into array aContent[].
 ** This function sorts the indexes so that the aContent[] values they
 ** refer to are in strictly ascending order. Where two or more entries
 ** of aList[] refer to the same aContent[] value, only the entry that
 ** appeared latest in the input list is retained. *pnList is set to the
 ** number of entries remaining in aList[] before returning.
 **
 ** Buffer aBuffer[] is used as scratch space by the merge step. It must
 ** be large enough for the initial value of *pnList entries.
 */
 static void logMergesort8(
   Pgno *aContent,                 /* Pages in log */
   u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
   u8 *aList,                      /* IN/OUT: List to sort */
   int *pnList                     /* IN/OUT: Number of elements in aList[] */
 ){
   int nIn = *pnList;
   if( nIn>1 ){
     int nLo = nIn/2;              /* Entries in the low (first) half */
     int nHi = nIn - nLo;          /* Entries in the high (second) half */
     u8 *aLo = aList;              /* Low half */
     u8 *aHi = aList + nLo;        /* High half */
     int iLo = 0;                  /* Next entry to consume from aLo[] */
     int iHi = 0;                  /* Next entry to consume from aHi[] */
     int nOut = 0;                 /* Entries written to aBuffer[] so far */

     /* TODO: Change to non-recursive version. */
     logMergesort8(aContent, aBuffer, aLo, &nLo);
     logMergesort8(aContent, aBuffer, aHi, &nHi);

     /* Merge the two sorted, duplicate free, halves into aBuffer[]. */
     while( iLo<nLo || iHi<nHi ){
       int useLo;                  /* True to consume from aLo[] */
       u8 iEntry;                  /* Entry moved to the output */
       Pgno pgno;                  /* Value of aContent[iEntry] */

       if( iLo>=nLo ){
         useLo = 0;
       }else if( iHi>=nHi ){
         useLo = 1;
       }else{
         /* If the values are equal the entry from aHi[] is used. */
         useLo = ( aContent[aLo[iLo]]<aContent[aHi[iHi]] );
       }
       iEntry = useLo ? aLo[iLo++] : aHi[iHi++];
       pgno = aContent[iEntry];
       aBuffer[nOut++] = iEntry;

       /* Discard the entry in aLo[] that duplicates the one just output */
       if( iLo<nLo && aContent[aLo[iLo]]==pgno ) iLo++;

       assert( iLo>=nLo || aContent[aLo[iLo]]>pgno );
       assert( iHi>=nHi || aContent[aHi[iHi]]>pgno );
     }
     memcpy(aList, aBuffer, nOut*sizeof(aList[0]));
     *pnList = nOut;
   }

 #ifdef SQLITE_DEBUG
   {
     int k;
     for(k=1; k<*pnList; k++){
       assert( aContent[aList[k-1]] < aContent[aList[k]] );
     }
   }
 #endif
 }


 /*
 ** Memory map the summary file opened with pSummary->fd and store the
 ** mapping in pSummary->aData. If the summary file is smaller than
 ** nByte bytes in size when this function is called, ftruncate() is
 ** used to expand it to nByte bytes before it is mapped. If it is already
 ** nByte bytes or larger, the whole of the existing file is mapped.
 **
 ** pSummary->nData is set to the size of the new mapping in units of
 ** 4 bytes. The file must not already be mapped (pSummary->aData==0).
 **
 ** It is assumed that an exclusive lock is held on the summary file
 ** by the caller (to protect the ftruncate()).
 **
 ** Return SQLITE_OK if successful, or SQLITE_IOERR if the fstat(),
 ** ftruncate() or mmap() system call fails.
 */
 static int logSummaryMap(LogSummary *pSummary, int nByte){
   struct stat fileInfo;           /* Used to query the current file size */
   void *pNew;                     /* New mapping */
   int fd = pSummary->fd;
   int nMap = nByte;               /* Number of bytes to map */

   assert( pSummary->aData==0 );

   if( fstat(fd, &fileInfo)!=0 ){
     return SQLITE_IOERR;
   }
   if( fileInfo.st_size>=nByte ){
     /* The file is already large enough. Map all of it. */
     nMap = fileInfo.st_size;
   }else if( ftruncate(fd, nByte)!=0 ){
     return SQLITE_IOERR;
   }

   pNew = mmap(0, nMap, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
   if( pNew==MAP_FAILED ) return SQLITE_IOERR;

   pSummary->nData = nMap/4;
   pSummary->aData = (u32 *)pNew;
   return SQLITE_OK;
 }

 /*
 ** Return the index in the LogSummary.aData array that corresponds to
 ** frame iFrame. The log-summary file consists of a header (including
 ** the region reserved for locks), followed by alternating "map" and
 ** "index" blocks. Each map block holds the db page numbers for 256
 ** frames and is followed by an index block that is 64 array entries
 ** (256 bytes) in size.
 */
 static int logSummaryEntry(u32 iFrame){
   /* Array entries used by the header and lock region */
   size_t iEntry = (LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED)/sizeof(u32);

   /* Array entries used by index blocks that occur before iFrame */
   iEntry += ((iFrame-1)>>8)<<6;

   /* Db page numbers that occur before iFrame */
   iEntry += iFrame;
   iEntry -= 1;

   return iEntry;
 }


 /*
 ** Set an entry in the log-summary map to map log frame iFrame to db
 ** page iPage. Values are always appended to the log-summary (i.e. the
 ** value of iFrame is always exactly one more than the value passed to
 ** the previous call), but that restriction is not enforced or asserted
 ** here.
 **
 ** If the current mapping is too small to hold the new entry plus the 128
 ** array entries that may be required following it for an index block and
 ** sort scratch space, the mapping is first replaced by one that is
 ** LOGSUMMARY_MMAP_INCREMENT bytes larger.
 **
 ** NOTE(review): the value returned by logSummaryMap() is not checked. If
 ** remapping fails, pSummary->aData is left set to 0 and then dereferenced
 ** below. This function returns void, so the error cannot be reported to
 ** the caller without changing the interface.
 */
 static void logSummaryAppend(LogSummary *pSummary, u32 iFrame, u32 iPage){
   u32 iEntry = logSummaryEntry(iFrame);   /* aData[] slot for iFrame */
   u32 *aPgno;                     /* Page numbers of the last 256 frames */
   u8 *aSorted;                    /* 256 byte index to populate */
   u8 *aScratch;                   /* 256 bytes of space for the sort */
   int nSorted;                    /* Number of distinct pages in index */
   int k;                          /* Used to initialize aSorted[] */

   /* Grow the mapping if required. */
   if( (iEntry+128)>=pSummary->nData ){
     int nNew = pSummary->nData*4 + LOGSUMMARY_MMAP_INCREMENT;

     sqlite3_mutex_enter(pSummary->mutex);
     munmap(pSummary->aData, pSummary->nData*4);
     pSummary->aData = 0;
     logSummaryMap(pSummary, nNew);
     sqlite3_mutex_leave(pSummary->mutex);
   }

   /* Set the log-summary entry itself */
   pSummary->aData[iEntry] = iPage;

   /* Unless the frame number is a multiple of 256 (frames are numbered
   ** starting at 1), there is nothing more to do.
   */
   if( (iFrame&0x000000FF)!=0 ) return;

   /* Build an index of the most recently added 256 frames. The index is
   ** stored immediately following the 256 page numbers. The space after
   ** the index is borrowed as temporary space for the sort.
   */
   aPgno = &pSummary->aData[iEntry-255];
   aSorted = (u8 *)&pSummary->aData[iEntry+1];
   aScratch = aSorted + 256;

   for(k=0; k<256; k++){
     aSorted[k] = (u8)k;
   }
   nSorted = 256;
   logMergesort8(aPgno, aScratch, aSorted, &nSorted);

   /* Pad the unused tail of the index with copies of the final entry. */
   memset(&aSorted[nSorted], aSorted[nSorted-1], 256-nSorted);
 }


 /*
 ** Recover the log-summary by reading the log file. The caller must hold
 ** an exclusive lock on the log-summary file.
 **
 ** Each frame in the log file is read and its checksum verified, in
 ** order, stopping at the first frame that fails verification or cannot
 ** be read. An entry is added to the log-summary for each valid frame,
 ** but the recovered header only advances (iLastPg, nPage, iCheck1/2 and
 ** pgsz) when a commit frame is encountered. The recovered header is
 ** then written into the log-summary.
 **
 ** SQLITE_OK is returned if successful. If the size of the log file or
 ** the log header cannot be read, or if the frame buffer cannot be
 ** allocated, an error code is returned immediately and the log-summary
 ** header is not written. If an error occurs while reading a frame, the
 ** header recovered so far is written and the error code returned.
 */
 static int logSummaryRecover(LogSummary *pSummary, sqlite3_file *pFd){
   int rc;                         /* Return Code */
   i64 nSize;                      /* Size of log file */
   LogSummaryHdr hdr;              /* Recovered log-summary header */

   memset(&hdr, 0, sizeof(hdr));

   rc = sqlite3OsFileSize(pFd, &nSize);
   if( rc!=SQLITE_OK ){
     return rc;
   }

   /* NOTE(review): the file size is compared against the size of a frame
   ** header (16 bytes), not the log header (12 bytes) - confirm that this
   ** is intentional. */
   if( nSize>LOG_FRAME_HDRSIZE ){
     u8 aBuf[LOG_FRAME_HDRSIZE];   /* Buffer to load the log header into */
     u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
     int nFrame;                   /* Number of bytes at aFrame */
     u8 *aData;                    /* Pointer to data part of aFrame buffer */
     int iFrame;                   /* Index of last frame read */
     i64 iOffset;                  /* Next offset to read from log file */
     int nPgsz;                    /* Page size according to the log */
     u32 aCksum[2];                /* Running checksum */

     /* Read in the LOG_HDRSIZE byte log header from the start of the file
     ** (to determine the database page size and checksum salt values).
     */
     rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0);
     if( rc!=SQLITE_OK ){
       return rc;
     }

     /* If the database page size is not a power of two, is less than 512,
     ** or is greater than SQLITE_MAX_PAGE_SIZE, conclude that the log file
     ** contains no valid data. In this case the all-zero header is written.
     */
     nPgsz = sqlite3Get4byte(&aBuf[0]);
     if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE || nPgsz<512 ){
       goto finished;
     }
     /* The two salt values from the log header seed the running checksum */
     aCksum[0] = sqlite3Get4byte(&aBuf[4]);
     aCksum[1] = sqlite3Get4byte(&aBuf[8]);

     /* Malloc a buffer to read frames into. */
     nFrame = nPgsz + LOG_FRAME_HDRSIZE;
     aFrame = (u8 *)sqlite3_malloc(nFrame);
     if( !aFrame ){
       return SQLITE_NOMEM;
     }
     aData = &aFrame[LOG_FRAME_HDRSIZE];

     /* Read all frames from the log file. Any partial frame at the end of
     ** the file is ignored. */
     iFrame = 0;
     for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
       u32 pgno;                   /* Database page number for frame */
       u32 nTruncate;              /* dbsize field from frame header */
       int isValid;                /* True if this frame is valid */

       /* Read and decode the next log frame. */
       rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
       if( rc!=SQLITE_OK ) break;
       isValid = logDecodeFrame(aCksum, &pgno, &nTruncate, nPgsz, aData, aFrame);
       if( !isValid ) break;
       logSummaryAppend(pSummary, ++iFrame, pgno);

       /* If nTruncate is non-zero, this is a commit record. Frames that
       ** follow the last commit record are not included in the header. */
       if( nTruncate ){
         hdr.iCheck1 = aCksum[0];
         hdr.iCheck2 = aCksum[1];
         hdr.iLastPg = iFrame;
         hdr.nPage = nTruncate;
         hdr.pgsz = nPgsz;
       }
     }

     sqlite3_free(aFrame);
   }else{
     /* The log is empty. Use the initial checksum values. */
     hdr.iCheck1 = 2;
     hdr.iCheck2 = 3;
   }

 finished:
   logSummaryWriteHdr(pSummary, &hdr);
   return rc;
 }

 /*
 ** Place, modify or remove a lock on the log-summary file associated
 ** with pSummary.
 **
 ** The locked byte-range should be inside the region dedicated to
 ** locking. This region of the log-summary file is never read or written.
 **
 ** Parameter op must be one of LOG_UNLOCK, LOG_RDLOCK, LOG_WRLOCK or
 ** LOG_WRLOCKW. Only LOG_WRLOCKW issues a blocking lock request.
 **
 ** Return SQLITE_OK if the fcntl() call succeeds, or SQLITE_BUSY if it
 ** fails for any reason.
 */
 static int logLockFd(
   LogSummary *pSummary,           /* The log-summary object to lock */
   int iStart,                     /* First byte to lock */
   int nByte,                      /* Number of bytes to lock */
   int op                          /* LOG_UNLOCK, RDLOCK, WRLOCK or WRLOCKW */
 ){
   /* Lock type and fcntl() command to use, indexed by the op parameter */
   static const struct LockFdOp {
     int eType;                    /* Value for flock.l_type */
     int eCmd;                     /* Second argument for fcntl() */
   } aLockOp[] = {
     { F_UNLCK, F_SETLK  },        /* LOG_UNLOCK */
     { F_RDLCK, F_SETLK  },        /* LOG_RDLOCK */
     { F_WRLCK, F_SETLK  },        /* LOG_WRLOCK */
     { F_WRLCK, F_SETLKW }         /* LOG_WRLOCKW */
   };
   struct flock f;                 /* Locking operation */

   assert( op>=0 && op<ArraySize(aLockOp) );
   assert( nByte>0 );
   assert( iStart>=LOGSUMMARY_LOCK_OFFSET
        && iStart+nByte<=LOGSUMMARY_LOCK_OFFSET+LOGSUMMARY_LOCK_RESERVED
   );
 #if defined(SQLITE_DEBUG) && defined(SQLITE_OS_UNIX)
   if( pSummary->aData ) memset(&((u8*)pSummary->aData)[iStart], op, nByte);
 #endif

   memset(&f, 0, sizeof(f));
   f.l_whence = SEEK_SET;
   f.l_start = iStart;
   f.l_len = nByte;
   f.l_type = aLockOp[op].eType;

   if( fcntl(pSummary->fd, aLockOp[op].eCmd, &f)!=0 ){
     return SQLITE_BUSY;
   }
   return SQLITE_OK;
 }

 static int logLockRegion(Log *pLog, u32 mRegion, int op){
   LogSummary *pSummary = pLog->pSummary;
   LogLock *p;                     /* Used to iterate through in-process locks */
   u32 mOther;                     /* Locks held by other connections */
   u32 mNew;                       /* New mask for pLog */

   assert(
        /* Writer lock operations */
           (op==LOG_WRLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_C|LOG_REGION_D))

        /* Normal reader lock operations */
        || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B))

        /* Region D reader lock operations */
        || (op==LOG_RDLOCK && mRegion==(LOG_REGION_D))
        || (op==LOG_RDLOCK && mRegion==(LOG_REGION_A))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_D))

        /* Checkpointer lock operations */
        || (op==LOG_WRLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
        || (op==LOG_WRLOCK && mRegion==(LOG_REGION_A))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_B|LOG_REGION_C))
        || (op==LOG_UNLOCK && mRegion==(LOG_REGION_A|LOG_REGION_B|LOG_REGION_C))
   );

   /* Assert that a connection never tries to go from an EXCLUSIVE to a
   ** SHARED lock on a region. Moving from SHARED to EXCLUSIVE sometimes
   ** happens though (when a region D reader upgrades to a writer).
   */
   assert( op!=LOG_RDLOCK || 0==(pLog->lock.mLock & (mRegion<<8)) );

   sqlite3_mutex_enter(pSummary->mutex);

   /* Calculate a mask of logs held by all connections in this process apart
   ** from this one. The least significant byte of the mask contains a mask
   ** of the SHARED logs held. The next least significant byte of the mask
   ** indicates the EXCLUSIVE locks held. For example, to test if some other
   ** connection is holding a SHARED lock on region A, or an EXCLUSIVE lock
   ** on region C, do:
   **
   **   hasSharedOnA    = (mOther & (LOG_REGION_A<<0));
   **   hasExclusiveOnC = (mOther & (LOG_REGION_C<<8));
   **
   ** In all masks, if the bit in the EXCLUSIVE byte mask is set, so is the
   ** corresponding bit in the SHARED mask.
   */
   mOther = 0;
   for(p=pSummary->pLock; p; p=p->pNext){
     assert( (p->mLock & (p->mLock<<8))==(p->mLock&0x0000FF00) );
     if( p!=&pLog->lock ){
       mOther |= p->mLock;
     }
   }

   /* If this call is to lock a region (not to unlock one), test if locks held
   ** by any other connection in this process prevent the new locks from
   ** begin granted. If so, exit the summary mutex and return SQLITE_BUSY.
   */
   if( op && (mOther & (mRegion << (op==LOG_RDLOCK ? 8 : 0))) ){
     sqlite3_mutex_leave(pSummary->mutex);
     return SQLITE_BUSY;
   }

   /* Figure out the new log mask for this connection. */
   switch( op ){
     case LOG_UNLOCK:
       mNew = (pLog->lock.mLock & ~(mRegion|(mRegion<<8)));
       break;
     case LOG_RDLOCK:
       mNew = (pLog->lock.mLock | mRegion);
       break;
     default:
       assert( op==LOG_WRLOCK );
       mNew = (pLog->lock.mLock | (mRegion<<8) | mRegion);
       break;
   }

   /* Now modify the locks held on the log-summary file descriptor. This
   ** file descriptor is shared by all log connections in this process.
   ** Therefore:
   **
   **   + If one or more log connections in this process hold a SHARED lock
   **     on a region, the file-descriptor should hold a SHARED lock on
   **     the file region.
   **
   **   + If a log connection in this process holds an EXCLUSIVE lock on a
   **     region, the file-descriptor should also hold an EXCLUSIVE lock on
   **     the region in question.
   **
   ** If this is an LOG_UNLOCK operation, only regions for which no other
   ** connection holds a lock should actually be unlocked. And if this
   ** is a LOG_RDLOCK operation and other connections already hold all
   ** the required SHARED locks, then no system call is required.
   */
   if( op==LOG_UNLOCK ){
     mRegion = (mRegion & ~mOther);
   }
   if( (op==LOG_WRLOCK)
    || (op==LOG_UNLOCK && mRegion)
    || (op==LOG_RDLOCK && (mOther&mRegion)!=mRegion)
   ){
     struct LockMap {
       int iStart;                 /* Byte offset to start locking operation */
       int iLen;                   /* Length field for locking operation */
     } aMap[] = {
       /* 0000 */ {0, 0},                    /* 0001 */ {3+LOG_LOCK_REGION, 1},
       /* 0010 */ {2+LOG_LOCK_REGION, 1},    /* 0011 */ {2+LOG_LOCK_REGION, 2},
       /* 0100 */ {1+LOG_LOCK_REGION, 1},    /* 0101 */ {0, 0},
       /* 0110 */ {1+LOG_LOCK_REGION, 2},    /* 0111 */ {1+LOG_LOCK_REGION, 3},
       /* 1000 */ {0+LOG_LOCK_REGION, 1},    /* 1001 */ {0, 0},
       /* 1010 */ {0, 0},                    /* 1011 */ {0, 0},
       /* 1100 */ {0+LOG_LOCK_REGION, 2},    /* 1101 */ {0, 0},
       /* 1110 */ {0, 0},                    /* 1111 */ {0, 0}
     };
     int rc;                       /* Return code of logLockFd() */

     assert( mRegion<ArraySize(aMap) && aMap[mRegion].iStart!=0 );

     rc = logLockFd(pSummary, aMap[mRegion].iStart, aMap[mRegion].iLen, op);
     if( rc!=0 ){
       sqlite3_mutex_leave(pSummary->mutex);
       return rc;
     }
   }

   pLog->lock.mLock = mNew;
   sqlite3_mutex_leave(pSummary->mutex);
   return SQLITE_OK;
 }

 /*
 ** Lock the DMH (dead-mans-hand) region of the log-summary file, either
 ** with an EXCLUSIVE (LOG_WRLOCK) or a SHARED (LOG_RDLOCK) lock. This
 ** function is never called with LOG_UNLOCK - the only way the DMH region
 ** is ever completely unlocked is by closing the file descriptor.
 **
 ** The caller must hold pSummary->mutex. The value returned is whatever
 ** logLockFd() returns for the request.
 */
 static int logLockDMH(LogSummary *pSummary, int eLock){
   assert( sqlite3_mutex_held(pSummary->mutex) );
   assert( eLock==LOG_RDLOCK || eLock==LOG_WRLOCK );
   /* The lock covers a single byte starting at offset LOG_LOCK_DMH. */
   return logLockFd(pSummary, LOG_LOCK_DMH, 1, eLock);
 }

 /*
 ** Lock (or unlock) the MUTEX region of the log-summary file. The region
 ** is a single byte at offset LOG_LOCK_MUTEX and is only ever locked with
 ** an EXCLUSIVE, blocking lock (LOG_WRLOCKW) or released (LOG_UNLOCK).
 **
 ** The caller must hold pSummary->mutex.
 **
 ** The result of logLockFd() is returned to the caller rather than being
 ** discarded. Callers that acquire the lock (logSummaryInit() and
 ** logEnterMutex()) already test the return value, so a failed lock
 ** attempt must not be reported as SQLITE_OK - doing so would let them
 ** continue without holding the cross-process mutex.
 */
 static int logLockMutex(LogSummary *pSummary, int eLock){
   assert( sqlite3_mutex_held(pSummary->mutex) );
   assert( eLock==LOG_WRLOCKW || eLock==LOG_UNLOCK );
   return logLockFd(pSummary, LOG_LOCK_MUTEX, 1, eLock);
 }

 /*
 ** Connect the LogSummary object pSummary to its log-summary file. The
 ** file is named "<pSummary->zPath>-summary"; it is opened (and created
 ** if required) and then mapped into memory.
 **
 ** On entry the object must be unconnected (fd<0, aData==0), must have at
 ** least one reference, and the caller must hold pSummary->mutex.
 **
 ** Parameter pFd is not used by the current implementation.
 **
 ** Returns SQLITE_OK on success, SQLITE_NOMEM if the file name cannot be
 ** allocated, SQLITE_IOERR if the file cannot be opened or the SHARED
 ** dead-mans-hand lock cannot be obtained, or the error code returned by
 ** logLockMutex() or logSummaryMap().
 */
 static int logSummaryInit(
   LogSummary *pSummary,           /* Log summary object to initialize */
   sqlite3_file *pFd               /* File descriptor open on log file */
 ){
   char *zSummary;                 /* Name of the log-summary file */
   int rc;                         /* Return Code */

   assert( pSummary->fd<0 );
   assert( pSummary->aData==0 );
   assert( pSummary->nRef>0 );
   assert( pSummary->zPath );

   /* Build the summary file name and open a descriptor on it. The name is
   ** only needed for the open() call, so it is released straight away.
   */
   zSummary = sqlite3_mprintf("%s-summary", pSummary->zPath);
   if( zSummary==0 ) return SQLITE_NOMEM;
   pSummary->fd = open(zSummary, O_RDWR|O_CREAT, S_IWUSR|S_IRUSR);
   sqlite3_free(zSummary);
   if( pSummary->fd<0 ) return SQLITE_IOERR;

   /* Take the exclusive MUTEX lock on the summary file before mapping it.
   **
   ** TODO: The mapping is not yet growable. Until it is, a very large
   ** mapping is requested up front. Pages should not be allocated until
   ** they are first touched, so a large mapping should cost no more than
   ** a small one.
   */
   assert( sqlite3_mutex_held(pSummary->mutex) );
   rc = logLockMutex(pSummary, LOG_WRLOCKW);
   if( rc!=SQLITE_OK ) return rc;

   rc = logSummaryMap(pSummary, LOGSUMMARY_MMAP_INCREMENT);
   if( rc==SQLITE_OK ){
     /* If an EXCLUSIVE lock can be taken on the dead-mans-hand region,
     ** the existing contents of the log-summary (if any) cannot be
     ** trusted, so zero the header (the header fields plus the two
     ** trailing checksum words).
     */
     if( logLockDMH(pSummary, LOG_WRLOCK)==SQLITE_OK ){
       memset(pSummary->aData, 0, (LOGSUMMARY_HDR_NFIELD+2)*sizeof(u32) );
     }

     /* Either way, finish holding a SHARED dead-mans-hand lock. */
     if( logLockDMH(pSummary, LOG_RDLOCK)!=SQLITE_OK ){
       rc = SQLITE_IOERR;
     }
   }

   logLockMutex(pSummary, LOG_UNLOCK);
   return rc;
 }

 /*
 ** Open a connection to the log file associated with database zDb. The
 ** database file does not actually have to exist. zDb is used only to
 ** figure out the name of the log file to open. If the log file does not
 ** exist it is created by this call.
 **
 ** A SHARED lock should be held on the database file when this function
 ** is called. The purpose of this SHARED lock is to prevent any other
 ** client from unlinking the log or log-summary file. If another process
 ** were to do this just after this client opened one of these files, the
 ** system would be badly broken.
 **
 ** If successful, SQLITE_OK is returned and *ppLog is set to point to the
 ** new Log handle. Otherwise an SQLite error code is returned (SQLITE_NOMEM
 ** if an allocation fails, or the code returned by sqlite3OsOpen() or
 ** logSummaryInit()), *ppLog is set to NULL, and the reference taken on
 ** the shared LogSummary object is dropped again.
 */
 int sqlite3WalOpen(
   sqlite3_vfs *pVfs,              /* vfs module to open log file with */
   const char *zDb,                /* Name of database file */
   Log **ppLog                     /* OUT: Allocated Log handle */
 ){
   int rc = SQLITE_OK;             /* Return Code */
   Log *pRet;                      /* Object to allocate and return */
   LogSummary *pSummary = 0;       /* Summary object */
   sqlite3_mutex *mutex = 0;       /* LOG_SUMMARY_MUTEX (guards pLogSummary) */
   int flags;                      /* Flags passed to OsOpen() */
   char *zWal = 0;                 /* Path to WAL file */
   int nWal;                       /* Length of zWal in bytes */

   assert( zDb );

   /* Allocate an instance of struct Log to return. The sqlite3_file used
   ** for the log file lives in the same allocation, directly after the
   ** Log structure.
   */
   *ppLog = 0;
   pRet = (Log *)sqlite3MallocZero(sizeof(Log) + pVfs->szOsFile);
   if( !pRet ){
     rc = SQLITE_NOMEM;
     goto out;
   }
   pRet->pVfs = pVfs;
   pRet->pFd = (sqlite3_file *)&pRet[1];

   /* Normalize the path name. */
   zWal = sqlite3_mprintf("%s-wal", zDb);
   if( !zWal ){
     rc = SQLITE_NOMEM;
     goto out;
   }
   logNormalizePath(zWal);
   flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
   nWal = sqlite3Strlen30(zWal);

   /* Enter the mutex that protects the linked-list of LogSummary structures */
   if( sqlite3GlobalConfig.bCoreMutex ){
     mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
   }
   sqlite3_mutex_enter(mutex);

   /* Search for an existing log summary object in the linked list. If one
   ** cannot be found, allocate and initialize a new object. The path is
   ** stored in the same allocation, directly after the structure.
   */
   for(pSummary=pLogSummary; pSummary; pSummary=pSummary->pNext){
     int nPath = sqlite3Strlen30(pSummary->zPath);
     if( nWal==nPath && 0==memcmp(pSummary->zPath, zWal, nPath) ) break;
   }
   if( !pSummary ){
     int nByte = sizeof(LogSummary) + nWal + 1;
     pSummary = (LogSummary *)sqlite3MallocZero(nByte);
     if( pSummary ){
       if( sqlite3GlobalConfig.bCoreMutex ){
         pSummary->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_RECURSIVE);
       }
       pSummary->zPath = (char *)&pSummary[1];
       pSummary->fd = -1;
       memcpy(pSummary->zPath, zWal, nWal);
       pSummary->pNext = pLogSummary;
       pLogSummary = pSummary;
     }else{
       rc = SQLITE_NOMEM;
     }
   }
   if( pSummary ){
     pSummary->nRef++;
     pRet->pSummary = pSummary;
   }

   /* Exit the mutex protecting the linked-list of LogSummary objects. */
   sqlite3_mutex_leave(mutex);
   if( rc!=SQLITE_OK ) goto out;

   /* Open file handle on the log file. */
   rc = sqlite3OsOpen(pVfs, pSummary->zPath, pRet->pFd, flags, &flags);
   if( rc!=SQLITE_OK ) goto out;

   /* Object pSummary is shared between all connections to the database made
   ** by this process. So at this point it may or may not be connected to
   ** the log-summary. If it is not, connect it.
   **
   ** The new connection's lock object is added to the summary's list only
   ** if everything succeeded. Otherwise the list would be left pointing
   ** into the Log object that is freed below.
   */
   sqlite3_mutex_enter(pSummary->mutex);
   if( pSummary->fd<0 ){
     rc = logSummaryInit(pSummary, pRet->pFd);
   }
   if( rc==SQLITE_OK ){
     pRet->lock.pNext = pSummary->pLock;
     pSummary->pLock = &pRet->lock;
   }
   sqlite3_mutex_leave(pSummary->mutex);

  out:
   sqlite3_free(zWal);
   if( rc!=SQLITE_OK ){
     if( pSummary ){
       /* Drop the reference taken above. The summary object is shared, so
       ** it may only be discarded if this was the last reference, and it
       ** must be removed from the global list before it is freed.
       **
       ** NOTE(review): the per-summary mutex, and any file descriptor or
       ** mapping left behind by a failed logSummaryInit(), are not
       ** released here (nor were they before). Confirm how these should
       ** be cleaned up.
       */
       sqlite3_mutex_enter(mutex);
       pSummary->nRef--;
       if( pSummary->nRef==0 ){
         LogSummary **pp;
         for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
         *pp = pSummary->pNext;
         sqlite3_free(pSummary);
       }
       sqlite3_mutex_leave(mutex);
     }
     if( pRet ){
       sqlite3OsClose(pRet->pFd);
       sqlite3_free(pRet);
       pRet = 0;                   /* Never hand back a freed pointer */
     }
   }
   *ppLog = pRet;
   return rc;
 }

 /*
 ** Advance iterator p. On entry *piPage holds the database page number
 ** most recently returned (0 before the first call). On return *piPage is
 ** set to the smallest page number greater than that value found in any
 ** segment, and *piFrame to the log frame holding it.
 **
 ** Segments are visited from the highest index down and a candidate only
 ** replaces the current best if its page number is strictly smaller. So
 ** if the same page number is found in more than one segment, the frame
 ** from the highest-numbered segment is the one reported.
 **
 ** Returns 0 if a page was found. Returns 1 once the iterator is
 ** exhausted, in which case *piPage is set to 0xFFFFFFFF and *piFrame is
 ** left unchanged.
 */
 static int logIteratorNext(
   LogIterator *p,               /* Iterator */
   u32 *piPage,                    /* OUT: Next db page to write */
   u32 *piFrame                    /* OUT: Log frame to read from */
 ){
   const u32 iPrev = *piPage;      /* Page returned by the previous call */
   u32 iBest = 0xFFFFFFFF;         /* Smallest page > iPrev seen so far */
   int iSeg;                       /* Index of the segment being examined */

   for(iSeg=p->nSegment-1; iSeg>=0; iSeg--){
     struct LogSegment *pSeg = &p->aSegment[iSeg];

     /* Only the last segment may be partially filled (p->nFinal entries).
     ** All others hold 256 entries.
     */
     int nEntry = (iSeg==p->nSegment-1) ? p->nFinal : 256;

     /* Skip over entries for pages that have already been returned. */
     while( pSeg->iNext<nEntry
         && pSeg->aDbPage[pSeg->aIndex[pSeg->iNext]]<=iPrev
     ){
       pSeg->iNext++;
     }

     /* If the segment is not exhausted, its current entry is a candidate. */
     if( pSeg->iNext<nEntry ){
       u32 iPg = pSeg->aDbPage[pSeg->aIndex[pSeg->iNext]];
       if( iPg<iBest ){
         iBest = iPg;
         *piFrame = iSeg*256 + 1 + pSeg->aIndex[pSeg->iNext];
       }
     }
   }

   *piPage = iBest;
   return (iBest==0xFFFFFFFF);
 }

 /*
 ** Allocate and return an iterator over the frames currently recorded in
 ** pLog->hdr (frames 1 to hdr.iLastPg). The iterator is returned in a
 ** single allocation that must be released with logIteratorFree().
 **
 ** The log-summary data is treated as segments of 256 frames. For every
 ** complete segment the iterator points straight into the log-summary for
 ** both the page-number array and the index that follows it. The final
 ** segment has no index in the log-summary, so one is built here in
 ** memory belonging to the iterator.
 **
 ** Returns NULL if the allocation fails. Previously a failed allocation
 ** was dereferenced unconditionally.
 */
 static LogIterator *logIteratorInit(Log *pLog){
   u32 *aData = pLog->pSummary->aData;
   LogIterator *p;                 /* Return value */
   int nSegment;                   /* Number of segments to merge */
   u32 iLast;                      /* Last frame in log */
   int nByte;                      /* Number of bytes to allocate */
   int i;                          /* Iterator variable */
   int nFinal;                     /* Number of unindexed entries */
   struct LogSegment *pFinal;      /* Final (unindexed) segment */
   u8 *aTmp;                       /* Temp space used by merge-sort */

   iLast = pLog->hdr.iLastPg;
   nSegment = (iLast >> 8) + 1;
   nFinal = (iLast & 0x000000FF);

   /* Space for the iterator, the extra segment descriptors, and 512 bytes
   ** of scratch: 256 for the final segment's index and 256 for the
   ** merge-sort temp buffer.
   */
   nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
   p = (LogIterator *)sqlite3_malloc(nByte);
   if( !p ){
     return 0;
   }
   memset(p, 0, nByte);
   p->nSegment = nSegment;
   p->nFinal = nFinal;

   /* Complete segments: page numbers and index both live in the summary. */
   for(i=0; i<nSegment-1; i++){
     p->aSegment[i].aDbPage = &aData[logSummaryEntry(i*256+1)];
     p->aSegment[i].aIndex = (u8 *)&aData[logSummaryEntry(i*256+1)+256];
   }
   pFinal = &p->aSegment[nSegment-1];

   /* Final segment: page numbers come from the summary, but the index is
   ** stored directly after the last segment descriptor. It starts as the
   ** identity permutation and is then sorted.
   */
   pFinal->aDbPage = &aData[logSummaryEntry((nSegment-1)*256+1)];
   pFinal->aIndex = (u8 *)&pFinal[1];
   aTmp = &pFinal->aIndex[256];
   for(i=0; i<nFinal; i++){
     pFinal->aIndex[i] = i;
   }

   /* logMergesort8() is passed a pointer to nFinal, so the entry count is
   ** stored again afterwards in case it was changed.
   */
   logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
   p->nFinal = nFinal;

   return p;
 }

 /*
 ** Free a log iterator allocated by logIteratorInit(). The pointer is
 ** passed straight to sqlite3_free().
 */
 static void logIteratorFree(LogIterator *p){
   sqlite3_free(p);
 }

 /*
 ** Checkpoint the contents of the log file: copy each page recorded in
 ** the log into database file pFd, truncate the database file to
 ** pLog->hdr.nPage pages, and then reset the log-summary header so that
 ** the log is treated as empty.
 **
 ** If sync_flags is non-zero, the log file is synced before any data is
 ** copied and the database file is synced after it has been truncated.
 ** zBuf is used as a page-sized staging buffer.
 **
 ** Returns SQLITE_OK on success (including when the log is empty),
 ** SQLITE_NOMEM if the iterator cannot be allocated, or the error code
 ** returned by the failing VFS call.
 */
 static int logCheckpoint(
   Log *pLog,                      /* Log connection */
   sqlite3_file *pFd,              /* File descriptor open on db file */
   int sync_flags,                 /* Flags for OsSync() (or 0) */
   u8 *zBuf                        /* Temporary buffer to use */
 ){
   int rc;                         /* Return code */
   int pgsz = pLog->hdr.pgsz;      /* Database page-size */
   LogIterator *pIter = 0;         /* Log iterator context */
   u32 iDbpage = 0;                /* Next database page to write */
   u32 iFrame = 0;                 /* Log frame containing data for iDbpage */

   if( pLog->hdr.iLastPg==0 ){
     return SQLITE_OK;
   }

   /* Allocate the iterator */
   pIter = logIteratorInit(pLog);
   if( !pIter ) return SQLITE_NOMEM;

   /* Sync the log file to disk */
   if( sync_flags ){
     rc = sqlite3OsSync(pLog->pFd, sync_flags);
     if( rc!=SQLITE_OK ) goto out;
   }

   /* Iterate through the contents of the log, copying data to the db file.
   **
   ** The destination offset is computed using 64-bit arithmetic. A 32-bit
   ** product of page number and page size wraps for any page located at
   ** or beyond the 4GiB mark, which would write the page to the wrong
   ** place in the database file.
   */
   while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
     rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz,
         logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE
     );
     if( rc!=SQLITE_OK ) goto out;
     rc = sqlite3OsWrite(pFd, zBuf, pgsz, (i64)(iDbpage-1)*(i64)pgsz);
     if( rc!=SQLITE_OK ) goto out;
   }

   /* Truncate the database file */
   rc = sqlite3OsTruncate(pFd, ((i64)pLog->hdr.nPage*(i64)pgsz));
   if( rc!=SQLITE_OK ) goto out;

   /* Sync the database file. If successful, update the log-summary. */
   if( sync_flags ){
     rc = sqlite3OsSync(pFd, sync_flags);
     if( rc!=SQLITE_OK ) goto out;
   }

   /* Mark the log as empty, reset the two checksum fields to fixed values
   ** and publish the new header.
   */
   pLog->hdr.iLastPg = 0;
   pLog->hdr.iCheck1 = 2;
   pLog->hdr.iCheck2 = 3;
   logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);

   /* TODO: If a crash occurs and the current log is copied into the
   ** database there is no problem. However, if a crash occurs while
   ** writing the next transaction into the start of the log, such that:
   **
   **   * The first transaction currently in the log is left intact, but
   **   * The second (or subsequent) transaction is damaged,
   **
   ** then the database could become corrupt.
   **
   ** The easiest thing to do would be to write and sync a dummy header
   ** into the log at this point. Unfortunately, that turns out to be
   ** an unwelcome performance hit. Alternatives are...
   */
 #if 0
   memset(zBuf, 0, LOG_FRAME_HDRSIZE);
   rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
   if( rc!=SQLITE_OK ) goto out;
   rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
 #endif

  out:
   logIteratorFree(pIter);
   return rc;
 }

 /*
 ** Close a connection to a log file. Passing a NULL pLog is a no-op.
 **
 ** The connection's lock object is removed from the shared LogSummary and
 ** the reference count on the summary is decremented. If this was the
 ** last reference held by this process, the summary is removed from the
 ** global list and freed. If, in addition, an EXCLUSIVE lock can be
 ** obtained on database file pFd, the log is checkpointed into the
 ** database first.
 **
 ** Returns SQLITE_OK, or the error code from taking the lock, running the
 ** checkpoint or deleting the log file. Failing to obtain the EXCLUSIVE
 ** lock because it is busy is not an error. (An inner declaration of rc
 ** used to shadow the return value, so these errors were never reported.)
 */
 int sqlite3WalClose(
   Log *pLog,                      /* Log to close */
   sqlite3_file *pFd,              /* Database file */
   int sync_flags,                 /* Flags to pass to OsSync() (or 0) */
   u8 *zBuf                        /* Buffer of at least page-size bytes */
 ){
   int rc = SQLITE_OK;
   if( pLog ){
     LogLock **ppL;
     LogSummary *pSummary = pLog->pSummary;
     sqlite3_mutex *mutex = 0;

     /* Unlink this connection's lock object from the summary's list. */
     sqlite3_mutex_enter(pSummary->mutex);
     for(ppL=&pSummary->pLock; *ppL!=&pLog->lock; ppL=&(*ppL)->pNext);
     *ppL = pLog->lock.pNext;
     sqlite3_mutex_leave(pSummary->mutex);

     if( sqlite3GlobalConfig.bCoreMutex ){
       mutex = sqlite3_mutex_alloc(LOG_SUMMARY_MUTEX);
     }
     sqlite3_mutex_enter(mutex);

     /* Decrement the reference count on the log summary. If this is the last
     ** reference to the log summary object in this process, the object will
     ** be freed. If this is also the last connection to the database, then
     ** checkpoint the database as well.
     */
     pSummary->nRef--;
     if( pSummary->nRef==0 ){
       LogSummary **pp;
       for(pp=&pLogSummary; *pp!=pSummary; pp=&(*pp)->pNext);
       *pp = (*pp)->pNext;

       sqlite3_mutex_leave(mutex);

       rc = sqlite3OsLock(pFd, SQLITE_LOCK_EXCLUSIVE);
       if( rc==SQLITE_OK ){

         /* This is the last connection to the database (including other
         ** processes). Do three things:
         **
         **   1. Checkpoint the db.
         **   2. Delete the file named by pSummary->zPath. This is the
         **      log file - sqlite3WalOpen() stores the "-wal" path there.
         **   3. Disconnect from the log-summary, passing 1 as the second
         **      argument of logSummaryUnmap() (presumably a request to
         **      also truncate the file - confirm against that function).
         */
         rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
         if( rc==SQLITE_OK ){
           rc = sqlite3OsDelete(pLog->pVfs, pSummary->zPath, 0);
         }

         logSummaryUnmap(pSummary, 1);
       }else{
         /* Some other connection is still using the database. That is not
         ** an error - just disconnect from the log-summary.
         */
         if( rc==SQLITE_BUSY ){
           rc = SQLITE_OK;
         }
         logSummaryUnmap(pSummary, 0);
       }

       sqlite3_mutex_free(pSummary->mutex);
       sqlite3_free(pSummary);
     }else{
       sqlite3_mutex_leave(mutex);
     }

     /* Close the connection to the log file and free the Log handle. */
     sqlite3OsClose(pLog->pFd);
     sqlite3_free(pLog);
   }
   return rc;
 }

 /*
 ** Enter and leave the log-summary mutex. In this context, entering the
 ** log-summary mutex means:
 **
 **   1. Obtaining mutex pLog->pSummary->mutex, and
 **   2. Taking an exclusive lock on the log-summary file.
 **
 ** i.e. this mutex locks out other processes as well as other threads
 ** hosted in this address space.
 **
 ** logEnterMutex() returns SQLITE_OK if both steps succeed. If the file
 ** lock cannot be taken, the in-process mutex is released again and the
 ** error code from logLockMutex() is returned, so the caller must call
 ** logLeaveMutex() only after a successful return.
 */
 static int logEnterMutex(Log *pLog){
   LogSummary *pShared = pLog->pSummary;
   int rc;

   sqlite3_mutex_enter(pShared->mutex);
   rc = logLockMutex(pShared, LOG_WRLOCKW);
   if( rc==SQLITE_OK ){
     return SQLITE_OK;
   }

   /* Step 2 failed: undo step 1 before reporting the error. */
   sqlite3_mutex_leave(pShared->mutex);
   return rc;
 }
 /*
 ** Leave the log-summary mutex: release the lock on the MUTEX region of
 ** the log-summary file first, then leave pLog->pSummary->mutex. The
 ** result of the unlock operation is not checked.
 */
 static void logLeaveMutex(Log *pLog){
   LogSummary *pSummary = pLog->pSummary;
   logLockMutex(pSummary, LOG_UNLOCK);
   sqlite3_mutex_leave(pSummary->mutex);
 }

 /*
 ** Try to read the log-summary header. This function takes no lock of its
 ** own: it copies the header out of the mapped log-summary and verifies
 ** the checksum stored in the two words that follow the header fields.
 **
 ** If the checksum matches and the header differs from pLog->hdr, it is
 ** copied into pLog->hdr and, if pChanged is not NULL, *pChanged is set
 ** to 1. Otherwise *pChanged is left unmodified.
 **
 ** Returns SQLITE_OK if the checksum was verified, or SQLITE_ERROR if it
 ** was not (in which case pLog->hdr is not modified).
 */
 int logSummaryTryHdr(Log *pLog, int *pChanged){
   u32 aSnapshot[LOGSUMMARY_HDR_NFIELD+2];   /* Header fields + 2 checksums */
   u32 aSum[2] = {1, 1};                     /* Checksum computed locally */
   int isValid;                              /* True if checksums match */

   /* Work on a private copy so that the values checksummed are the same
   ** values that are later copied into pLog->hdr.
   */
   memcpy(aSnapshot, pLog->pSummary->aData, sizeof(aSnapshot));
   logChecksumBytes((u8*)aSnapshot, sizeof(u32)*LOGSUMMARY_HDR_NFIELD, aSum);
   isValid = ( aSum[0]==aSnapshot[LOGSUMMARY_HDR_NFIELD]
            && aSum[1]==aSnapshot[LOGSUMMARY_HDR_NFIELD+1] );
   if( !isValid ){
     return SQLITE_ERROR;
   }

   if( memcmp(&pLog->hdr, aSnapshot, sizeof(LogSummaryHdr))!=0 ){
     if( pChanged ){
       *pChanged = 1;
     }
     memcpy(&pLog->hdr, aSnapshot, sizeof(LogSummaryHdr));
   }
   return SQLITE_OK;
 }

 /*
 ** Read the log-summary header from the log-summary file into structure
 ** pLog->hdr. If the header checksum cannot be verified, even while
 ** holding the log-summary mutex, run log recovery and read it again.
 **
 ** If pChanged is not NULL, *pChanged is set to 1 if pLog->hdr was
 ** modified or if recovery was attempted.
 **
 ** If the log-summary header is successfully read, return SQLITE_OK.
 ** Otherwise an SQLite error code.
 */
 int logSummaryReadHdr(Log *pLog, int *pChanged){
   int rc;

   /* Fast path: read the header without taking the log-summary mutex.
   ** This will almost always work.
   */
   if( logSummaryTryHdr(pLog, pChanged)==SQLITE_OK ){
     return SQLITE_OK;
   }

   /* Slow path: take the log-summary mutex and try again. */
   rc = logEnterMutex(pLog);
   if( rc!=SQLITE_OK ){
     return rc;
   }

   if( logSummaryTryHdr(pLog, pChanged)!=SQLITE_OK ){
     /* The header is still unreadable with the mutex held, so run
     ** recovery and then re-read the header. The caller is told that the
     ** contents may have changed, whatever the outcome.
     */
     if( pChanged ){
       *pChanged = 1;
     }
     rc = logSummaryRecover(pLog->pSummary, pLog->pFd);
     if( rc==SQLITE_OK ){
       rc = logSummaryTryHdr(pLog, 0);
     }
   }

   logLeaveMutex(pLog);
   return rc;
 }

 /*
 ** Lock a snapshot.
 **
 ** If this call obtains a new read-lock and the database contents have been
 ** modified since the most recent call to LogCloseSnapshot() on this Log
 ** connection, then *pChanged is set to 1 before returning. Otherwise, it
 ** is left unmodified. This is used by the pager layer to determine whether
 ** or not any cached pages may be safely reused.
 **
 ** If the connection already holds a snapshot lock this is a no-op that
 ** returns SQLITE_OK. Otherwise SQLITE_OK is returned if a lock was
 ** obtained and the log-summary header read. If an error code is returned
 ** no snapshot lock is held and pLog->isLocked is zero.
 */
 int sqlite3WalOpenSnapshot(Log *pLog, int *pChanged){
   int rc = SQLITE_OK;
   if( pLog->isLocked==0 ){
     int nAttempt;

     /* Obtain a snapshot-lock on the log-summary file. The procedure
     ** for obtaining the snapshot log is:
     **
     **    1. Attempt a SHARED lock on regions A and B.
     **    2a. If step 1 is successful, drop the lock on region B.
     **    2b. If step 1 is unsuccessful, attempt a SHARED lock on region D.
     **    3. Repeat the above until the lock attempt in step 1 or 2b is
     **       successful.
     **
     ** If neither of the locks can be obtained after 5 tries, presumably
     ** something is wrong (i.e. a process not following the locking protocol).
     ** Return an error code in this case.
     **
     ** Step 2a is only taken when step 1 actually returned SQLITE_OK. Any
     ** other result that is not SQLITE_BUSY is an error: no lock was
     ** granted, so isLocked must not be set, otherwise later calls would
     ** read the log without holding a snapshot lock.
     */
     rc = SQLITE_BUSY;
     for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
       rc = logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B, LOG_RDLOCK);
       if( rc==SQLITE_BUSY ){
         rc = logLockRegion(pLog, LOG_REGION_D, LOG_RDLOCK);
         if( rc==SQLITE_OK ) pLog->isLocked = LOG_REGION_D;
       }else if( rc==SQLITE_OK ){
         logLockRegion(pLog, LOG_REGION_B, LOG_UNLOCK);
         pLog->isLocked = LOG_REGION_A;
       }
     }
     if( rc!=SQLITE_OK ){
       return rc;
     }

     rc = logSummaryReadHdr(pLog, pChanged);
     if( rc!=SQLITE_OK ){
       /* An error occurred while attempting log recovery. */
       sqlite3WalCloseSnapshot(pLog);
     }
   }
   return rc;
 }

 /*
 ** Unlock the current snapshot. If the connection holds a snapshot lock
 ** (on region A or region D, as recorded in pLog->isLocked) that region is
 ** unlocked. pLog->isLocked is always zero on return.
 */
 void sqlite3WalCloseSnapshot(Log *pLog){
   int mHeld = pLog->isLocked;     /* Region currently locked, or 0 */

   if( mHeld!=0 ){
     assert( mHeld==LOG_REGION_A || mHeld==LOG_REGION_D );
     logLockRegion(pLog, mHeld, LOG_UNLOCK);
   }
   pLog->isLocked = 0;
 }

 /*
 ** Read a page from the log, if it is present.
 **
 ** The log-summary is searched for a frame holding database page pgno,
 ** considering only frames 1 to pLog->hdr.iLastPg. If one is found,
 ** *pInLog is set to 1, the page is read from the log file into pOut and
 ** the result of sqlite3OsRead() is returned. Otherwise *pInLog is set to
 ** 0 and SQLITE_OK is returned without touching pOut.
 **
 ** The caller must hold a snapshot lock.
 */
 int sqlite3WalRead(Log *pLog, Pgno pgno, int *pInLog, u8 *pOut){
   u32 *aSummary = pLog->pSummary->aData;
   int iBase = (pLog->hdr.iLastPg & 0xFFFFFF00);  /* First frame of block, -1 */
   u32 iFound = 0;                 /* Frame holding pgno, or 0 if none */

   assert( pLog->isLocked );

   /* Do a linear search of the unindexed block of page-numbers (if any)
   ** at the end of the log-summary, working backwards from the entry for
   ** the last frame. An alternative to this would be to build an index in
   ** private memory each time a read transaction is opened on a new
   ** snapshot.
   */
   if( pLog->hdr.iLastPg ){
     u32 *pLastEntry = &aSummary[logSummaryEntry(pLog->hdr.iLastPg)];
     u32 *pStop = pLastEntry - (pLog->hdr.iLastPg & 0xFF);
     u32 *pEntry;
     for(pEntry=pLastEntry; pEntry!=pStop; pEntry--){
       if( *pEntry==pgno ) break;
     }
     if( pEntry!=pStop ){
       iFound = (pEntry-pStop) + iBase;
     }
   }
   assert( iFound==0 || aSummary[logSummaryEntry(iFound)]==pgno );

   /* Not found yet: visit the earlier blocks of 256 frames, most recent
   ** first. Each has an index of 256 bytes following its page numbers,
   ** which is binary-searched for pgno.
   */
   while( iFound==0 && iBase>0 ){
     u32 *aPage;                   /* Page numbers of frames in this block */
     u8 *aOrder;                   /* Index stored after the page numbers */
     int iLo = 0;
     int iHi = 255;

     iBase -= 256;
     aPage = &aSummary[logSummaryEntry(iBase+1)];
     aOrder = (u8 *)&aPage[256];

     while( iLo<=iHi ){
       int iMid = (iLo+iHi)>>1;
       u32 iPg = aPage[aOrder[iMid]];

       if( iPg<pgno ){
         iLo = iMid+1;
       }else if( iPg>pgno ){
         iHi = iMid-1;
       }else{
         iFound = iBase + 1 + aOrder[iMid];
         break;
       }
     }
   }
   assert( iFound==0 || aSummary[logSummaryEntry(iFound)]==pgno );

   /* If no frame was found, the page is not in the log. */
   if( iFound==0 ){
     *pInLog = 0;
     return SQLITE_OK;
   }

   /* Otherwise iFound is the log frame number that contains the required
   ** page. Read and return data from the log file, skipping over the
   ** frame header.
   */
   {
     i64 iOffset = logFrameOffset(iFound, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE;
     *pInLog = 1;
     return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset);
   }
 }


 /*
 ** Set *pPgno to the size of the database file (or zero, if unknown).
 ** The value is taken from the nPage field of this connection's private
 ** copy of the log-summary header. The caller must hold a snapshot lock.
 */
 void sqlite3WalDbsize(Log *pLog, Pgno *pPgno){
   assert( pLog->isLocked );
   *pPgno = pLog->hdr.nPage;
 }

 /*
 ** Obtain (op!=0) or release (op==0) the write-lock. The caller must
 ** already hold a snapshot lock.
 **
 ** When obtaining the lock, SQLITE_OK is returned if the caller may write
 ** to the database. If the regions cannot be locked, the error code from
 ** logLockRegion() is returned. If the caller is operating on a snapshot
 ** that has already been overwritten by another writer, the write locks
 ** are released again and SQLITE_BUSY is returned.
 **
 ** Releasing the lock always returns SQLITE_OK. It is a no-op unless the
 ** write-lock is actually held. If it is held, the private header is
 ** reloaded from the log-summary after unlocking.
 */
 int sqlite3WalWriteLock(Log *pLog, int op){
   int rc;

   assert( pLog->isLocked );

   /* Release the write-lock. */
   if( op==0 ){
     if( pLog->isWriteLocked ){
       logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
       memcpy(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr));
       pLog->isWriteLocked = 0;
     }
     return SQLITE_OK;
   }

   /* Obtain the writer lock */
   rc = logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_WRLOCK);
   if( rc!=SQLITE_OK ){
     return rc;
   }

   /* If this connection is a region D reader, then the SHARED lock on
   ** region D has just been upgraded to EXCLUSIVE. But no lock at all is
   ** held on region A. This means that if the write-transaction is committed
   ** and this connection downgrades to a reader, it will be left with no
   ** lock at all. And so its snapshot could get clobbered by a checkpoint
   ** operation.
   **
   ** To stop this from happening, grab a SHARED lock on region A now.
   ** This should always be successful, as the only time a client holds
   ** an EXCLUSIVE lock on region A, it must also be holding an EXCLUSIVE
   ** lock on region C (a checkpointer does this). This is not possible,
   ** as this connection currently has the EXCLUSIVE lock on region C.
   */
   if( pLog->isLocked==LOG_REGION_D ){
     logLockRegion(pLog, LOG_REGION_A, LOG_RDLOCK);
     pLog->isLocked = LOG_REGION_A;
   }

   /* If this connection is not reading the most recent database snapshot,
   ** it is not possible to write to the database. In this case release
   ** the write locks and return SQLITE_BUSY.
   */
   if( memcmp(&pLog->hdr, pLog->pSummary->aData, sizeof(pLog->hdr))!=0 ){
     logLockRegion(pLog, LOG_REGION_C|LOG_REGION_D, LOG_UNLOCK);
     return SQLITE_BUSY;
   }

   pLog->isWriteLocked = 1;
   return SQLITE_OK;
 }

 /*
 ** The log handle passed to this function must be holding the write-lock.
 **
 ** If any data has been written (but not committed) to the log file, this
 ** function moves the write-pointer back to the start of the transaction.
 ** It does so by re-reading the log-summary header into pLog->hdr.
 **
 ** Additionally, the callback function is invoked for each frame written
 ** to the log since the start of the transaction, and is passed the
 ** database page number recorded for that frame. If the callback returns
 ** other than SQLITE_OK, it is not invoked again and the error code is
 ** returned to the caller.
 **
 ** If the log-summary header cannot be read, the error code from
 ** logSummaryReadHdr() is returned and the callback is not invoked at
 ** all. (Previously this failure was ignored and SQLITE_OK was returned
 ** even though nothing had been undone.)
 **
 ** Otherwise, if the callback function does not return an error, this
 ** function returns SQLITE_OK.
 */
 int sqlite3WalUndo(Log *pLog, int (*xUndo)(void *, Pgno), void *pUndoCtx){
   int rc;
   Pgno iMax = pLog->hdr.iLastPg;  /* Last frame written by this transaction */
   Pgno iFrame;

   assert( pLog->isWriteLocked );
   rc = logSummaryReadHdr(pLog, 0);
   for(iFrame=pLog->hdr.iLastPg+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
     rc = xUndo(pUndoCtx, pLog->pSummary->aData[logSummaryEntry(iFrame)]);
   }
   return rc;
 }

 /*
 ** Return true if data has been written but not committed to the log file.
 **
 ** This is detected by comparing the last-frame number in this
 ** connection's private header (pLog->hdr.iLastPg) with the one in the
 ** header at the start of the shared log-summary. The caller must hold
 ** the write-lock.
 */
 int sqlite3WalDirty(Log *pLog){
   assert( pLog->isWriteLocked );
   return( pLog->hdr.iLastPg!=((LogSummaryHdr*)pLog->pSummary->aData)->iLastPg );
 }

 /*
 ** Write a set of frames to the log. The caller must hold at least a
 ** RESERVED lock on the database file.
 **
 ** One frame is appended to the log file for each page on the pList list
 ** (linked through PgHdr.pDirty). Each frame is then recorded in the
 ** log-summary and this connection's private header is updated. If
 ** isCommit is true, the last frame carries nTruncate as the database
 ** size, and the log-summary header is updated while holding the
 ** log-summary mutex.
 **
 ** If sync_flags is non-zero (only permitted for a commit) the log is
 ** padded with extra copies of the last frame up to a sector boundary
 ** and then synced.
 **
 ** Returns SQLITE_OK on success, or the error code from the first
 ** failing write, sync or logEnterMutex() call.
 */
 int sqlite3WalFrames(
   Log *pLog,                      /* Log handle to write to */
   int nPgsz,                      /* Database page-size in bytes */
   PgHdr *pList,                   /* List of dirty pages to write */
   Pgno nTruncate,                 /* Database size after this commit */
   int isCommit,                   /* True if this is a commit */
   int sync_flags                  /* Flags to pass to OsSync() (or 0) */
 ){
   int rc;                         /* Used to catch return codes */
   u32 iFrame;                     /* Next frame address */
   u8 aFrame[LOG_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
   PgHdr *p;                       /* Iterator to run through pList with. */
   u32 aCksum[2];                  /* Checksums */
   PgHdr *pLast;                   /* Last frame in list */
   int nLast = 0;                  /* Number of extra copies of last page */

   assert( LOG_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
   assert( pList );

   /* If this is the first frame written into the log, write the log
   ** header to the start of the log file. See comments at the top of
   ** this file for a description of the log-header format.
   **
   ** The aFrame[] buffer is reused to assemble the log header, which is
   ** why it must be at least LOG_HDRSIZE bytes. The two random salt
   ** values seed the running checksum.
   */
   assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE );
   iFrame = pLog->hdr.iLastPg;
   if( iFrame==0 ){
     sqlite3Put4byte(aFrame, nPgsz);
     sqlite3_randomness(8, &aFrame[4]);
     pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
     pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]);
     rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0);
     if( rc!=SQLITE_OK ){
       return rc;
     }
   }

   /* Continue the running checksum from the private header. */
   aCksum[0] = pLog->hdr.iCheck1;
   aCksum[1] = pLog->hdr.iCheck2;

   /* Write the log file. */
   for(p=pList; p; p=p->pDirty){
     u32 nDbsize;                  /* Db-size field for frame header */
     i64 iOffset;                  /* Write offset in log file */

     iOffset = logFrameOffset(++iFrame, nPgsz);

     /* Populate and write the frame header. Only the final frame of a
     ** commit carries the database size; all other frames store zero.
     */
     nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
     logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
     rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
     if( rc!=SQLITE_OK ){
       return rc;
     }

     /* Write the page data */
     rc = sqlite3OsWrite(pLog->pFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
     if( rc!=SQLITE_OK ){
       return rc;
     }
     pLast = p;
   }

   /* Sync the log file if sync_flags is non-zero. */
   if( sync_flags ){
     i64 iSegment = sqlite3OsSectorSize(pLog->pFd);
     i64 iOffset = logFrameOffset(iFrame+1, nPgsz);

     assert( isCommit );

     /* Round the end of the log up to the next sector boundary (using at
     ** least SQLITE_DEFAULT_SECTOR_SIZE) and fill the gap with extra
     ** copies of the last frame. nLast counts the copies written so that
     ** they can be added to the log-summary below.
     */
     if( iSegment<SQLITE_DEFAULT_SECTOR_SIZE ){
       iSegment = SQLITE_DEFAULT_SECTOR_SIZE;
     }
     iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
     while( iOffset<iSegment ){
       logEncodeFrame(aCksum,pLast->pgno,nTruncate,nPgsz,pLast->pData,aFrame);
       rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset);
       if( rc!=SQLITE_OK ){
         return rc;
       }

       iOffset += LOG_FRAME_HDRSIZE;
       rc = sqlite3OsWrite(pLog->pFd, pLast->pData, nPgsz, iOffset);
       if( rc!=SQLITE_OK ){
         return rc;
       }
       nLast++;
       iOffset += nPgsz;
     }

     rc = sqlite3OsSync(pLog->pFd, sync_flags);
     if( rc!=SQLITE_OK ){
       return rc;
     }
   }

   /* Append data to the log summary. It is not necessary to lock the
   ** log-summary to do this as the RESERVED lock held on the db file
   ** guarantees that there are no other writers, and no data that may
   ** be in use by existing readers is being overwritten.
   **
   ** The frame counter is restarted from the private header so that the
   ** frame numbers match those used when writing the log file above.
   */
   iFrame = pLog->hdr.iLastPg;
   for(p=pList; p; p=p->pDirty){
     iFrame++;
     logSummaryAppend(pLog->pSummary, iFrame, p->pgno);
   }
   while( nLast>0 ){
     iFrame++;
     nLast--;
     logSummaryAppend(pLog->pSummary, iFrame, pLast->pgno);
   }

   /* Update the private copy of the header. */
   pLog->hdr.pgsz = nPgsz;
   pLog->hdr.iLastPg = iFrame;
   if( isCommit ){
     pLog->hdr.iChange++;
     pLog->hdr.nPage = nTruncate;
   }
   pLog->hdr.iCheck1 = aCksum[0];
   pLog->hdr.iCheck2 = aCksum[1];

   /* If this is a commit, update the log-summary header too. The number
   ** of the last frame is also saved in pLog->iCallback.
   */
   if( isCommit && SQLITE_OK==(rc = logEnterMutex(pLog)) ){
     logSummaryWriteHdr(pLog->pSummary, &pLog->hdr);
     logLeaveMutex(pLog);
     pLog->iCallback = iFrame;
   }

   return rc;
 }

 /*
 ** Checkpoint the database:
 **
 **   1. Wait for an EXCLUSIVE lock on regions B and C.
 **   2. Wait for an EXCLUSIVE lock on region A.
 **   3. Read the log-summary header and copy the contents of the log
 **      into the database file.
 **   4. Drop the locks obtained in steps 1 and 2.
 **
 ** While a lock attempt returns SQLITE_BUSY, xBusyHandler(pBusyHandlerArg)
 ** is invoked, and the attempt is repeated for as long as it returns
 ** non-zero. The connection must not hold a snapshot lock when this
 ** function is called.
 **
 ** Returns SQLITE_OK on success, or the error code from whichever step
 ** failed. All locks taken here are released before returning.
 */
 int sqlite3WalCheckpoint(
   Log *pLog,                      /* Log connection */
   sqlite3_file *pFd,              /* File descriptor open on db file */
   int sync_flags,                 /* Flags to sync db file with (or 0) */
   u8 *zBuf,                       /* Temporary buffer to use */
   int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
   void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
 ){
   int rc;                         /* Return code */

   assert( !pLog->isLocked );

   /* Step 1: EXCLUSIVE lock on regions B and C. */
   for(;;){
     rc = logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_WRLOCK);
     if( rc!=SQLITE_BUSY || !xBusyHandler(pBusyHandlerArg) ) break;
   }
   if( rc!=SQLITE_OK ){
     return rc;
   }

   /* Step 2: EXCLUSIVE lock on region A. */
   for(;;){
     rc = logLockRegion(pLog, LOG_REGION_A, LOG_WRLOCK);
     if( rc!=SQLITE_BUSY || !xBusyHandler(pBusyHandlerArg) ) break;
   }

   if( rc==SQLITE_OK ){
     /* Step 3: copy data from the log to the database file. */
     rc = logSummaryReadHdr(pLog, 0);
     if( rc==SQLITE_OK ){
       rc = logCheckpoint(pLog, pFd, sync_flags, zBuf);
     }

     /* Step 4: release all three regions. */
     logLockRegion(pLog, LOG_REGION_A|LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
   }else{
     /* Region A could not be locked, so only B and C need releasing. */
     logLockRegion(pLog, LOG_REGION_B|LOG_REGION_C, LOG_UNLOCK);
   }
   return rc;
 }

 /*
 ** Return the current value of pLog->iCallback and reset it to zero.
 ** sqlite3WalFrames() stores the number of the last frame written in
 ** this field each time a commit succeeds. Returns 0 if pLog is NULL.
 */
 int sqlite3WalCallback(Log *pLog){
   u32 iPending;

   if( pLog==0 ){
     return 0;
   }
   iPending = pLog->iCallback;
   pLog->iCallback = 0;
   return (int)iPending;
 }