blob: 533a6bbd0af809718819a8c5ee8c2528f343377b [file] [log] [blame]
drhed7c8552001-04-11 14:29:21 +00001/*
drhb19a2bc2001-09-16 00:13:26 +00002** 2001 September 15
drhed7c8552001-04-11 14:29:21 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drhed7c8552001-04-11 14:29:21 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drhed7c8552001-04-11 14:29:21 +000010**
11*************************************************************************
drhb19a2bc2001-09-16 00:13:26 +000012** This is the implementation of the page cache subsystem or "pager".
drhed7c8552001-04-11 14:29:21 +000013**
drhb19a2bc2001-09-16 00:13:26 +000014** The pager is used to access a database disk file. It implements
15** atomic commit and rollback through the use of a journal file that
16** is separate from the database file. The pager also implements file
17** locking to prevent two processes from writing the same database
18** file simultaneously, or one process from reading the database while
19** another is writing.
drhed7c8552001-04-11 14:29:21 +000020**
drhd8d66e82003-02-12 02:10:15 +000021** @(#) $Id: pager.c,v 1.74 2003/02/12 02:10:15 drh Exp $
drhed7c8552001-04-11 14:29:21 +000022*/
drh829e8022002-11-06 14:08:11 +000023#include "os.h" /* Must be first to enable large file support */
drhd9b02572001-04-15 00:37:09 +000024#include "sqliteInt.h"
drhed7c8552001-04-11 14:29:21 +000025#include "pager.h"
drhed7c8552001-04-11 14:29:21 +000026#include <assert.h>
drhd9b02572001-04-15 00:37:09 +000027#include <string.h>
drhed7c8552001-04-11 14:29:21 +000028
/*
** Troubleshooting hooks.  These are normally compiled out.  Change the
** "#if 0" to "#if 1" to trace, on stderr, the activity of the first
** pager handed to SET_PAGER().  The TRACEn() macros expect a local
** variable named pPager to be in scope at the point of use.
*/
#if 0
static Pager *mainPager = 0;
# define SET_PAGER(X)    if( mainPager==0 ) mainPager = (X)
# define CLR_PAGER(X)    if( mainPager==(X) ) mainPager = 0
# define TRACE1(X)       if( pPager==mainPager ) fprintf(stderr,X)
# define TRACE2(X,Y)     if( pPager==mainPager ) fprintf(stderr,X,Y)
# define TRACE3(X,Y,Z)   if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
# define SET_PAGER(X)
# define CLR_PAGER(X)
# define TRACE1(X)
# define TRACE2(X,Y)
# define TRACE3(X,Y,Z)
#endif
46
47
48/*
drhed7c8552001-04-11 14:29:21 +000049** The page cache as a whole is always in one of the following
50** states:
51**
52** SQLITE_UNLOCK The page cache is not currently reading or
53** writing the database file. There is no
54** data held in memory. This is the initial
55** state.
56**
57** SQLITE_READLOCK The page cache is reading the database.
58** Writing is not permitted. There can be
59** multiple readers accessing the same database
drh69688d52001-04-14 16:38:23 +000060** file at the same time.
drhed7c8552001-04-11 14:29:21 +000061**
62** SQLITE_WRITELOCK The page cache is writing the database.
63** Access is exclusive. No other processes or
64** threads can be reading or writing while one
65** process is writing.
66**
drh306dc212001-05-21 13:45:10 +000067** The page cache comes up in SQLITE_UNLOCK. The first time a
68** sqlite_page_get() occurs, the state transitions to SQLITE_READLOCK.
drhed7c8552001-04-11 14:29:21 +000069** After all pages have been released using sqlite_page_unref(),
drh306dc212001-05-21 13:45:10 +000070** the state transitions back to SQLITE_UNLOCK. The first time
drhed7c8552001-04-11 14:29:21 +000071** that sqlite_page_write() is called, the state transitions to
drh306dc212001-05-21 13:45:10 +000072** SQLITE_WRITELOCK. (Note that sqlite_page_write() can only be
73** called on an outstanding page which means that the pager must
74** be in SQLITE_READLOCK before it transitions to SQLITE_WRITELOCK.)
75** The sqlite_page_rollback() and sqlite_page_commit() functions
76** transition the state from SQLITE_WRITELOCK back to SQLITE_READLOCK.
drhed7c8552001-04-11 14:29:21 +000077*/
#define SQLITE_UNLOCK      0   /* Initial state: no lock held */
#define SQLITE_READLOCK    1   /* Reading the database; writes not permitted */
#define SQLITE_WRITELOCK   2   /* Writing the database; access is exclusive */
81
drhd9b02572001-04-15 00:37:09 +000082
drhed7c8552001-04-11 14:29:21 +000083/*
84** Each in-memory image of a page begins with the following header.
drhbd03cae2001-06-02 02:40:57 +000085** This header is only visible to this pager module. The client
86** code that calls pager sees only the data that follows the header.
drhed7c8552001-04-11 14:29:21 +000087*/
drhd9b02572001-04-15 00:37:09 +000088typedef struct PgHdr PgHdr;
drhed7c8552001-04-11 14:29:21 +000089struct PgHdr {
90 Pager *pPager; /* The pager to which this page belongs */
91 Pgno pgno; /* The page number for this page */
drh69688d52001-04-14 16:38:23 +000092 PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
drhed7c8552001-04-11 14:29:21 +000093 int nRef; /* Number of users of this page */
drhd9b02572001-04-15 00:37:09 +000094 PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
95 PgHdr *pNextAll, *pPrevAll; /* A list of all pages */
drh03eb96a2002-11-10 23:32:56 +000096 PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */
drh193a6b42002-07-07 16:52:46 +000097 u8 inJournal; /* TRUE if has been written to journal */
98 u8 inCkpt; /* TRUE if written to the checkpoint journal */
99 u8 dirty; /* TRUE if we need to write back changes */
drhdb48ee02003-01-16 13:42:43 +0000100 u8 needSync; /* Sync journal before writing this page */
drh193a6b42002-07-07 16:52:46 +0000101 u8 alwaysRollback; /* Disable dont_rollback() for this page */
drh2554f8b2003-01-22 01:26:44 +0000102 PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */
drh69688d52001-04-14 16:38:23 +0000103 /* SQLITE_PAGE_SIZE bytes of page data follow this header */
drh968af522003-02-11 14:55:40 +0000104 /* Pager.nExtra bytes of local data follow the page data and checksum */
drhed7c8552001-04-11 14:29:21 +0000105};
106
/*
** Pointer conversions between a page header and the memory that is
** laid out directly after it:
**
**   PGHDR_TO_DATA(P)    header  -> first byte of page data
**   DATA_TO_PGHDR(D)    data    -> header that precedes it
**   PGHDR_TO_EXTRA(P)   header  -> first byte past SQLITE_PAGE_SIZE
**                                  bytes of page data
*/
#define PGHDR_TO_DATA(P)   ((void*)((P)+1))
#define DATA_TO_PGHDR(D)   (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P)  ((void*)(((char*)((P)+1))+SQLITE_PAGE_SIZE))
drhed7c8552001-04-11 14:29:21 +0000114
/*
** Number of buckets in the hash table that maps a page number to its
** in-memory page.  pager_hash() masks with N_PG_HASH-1, so the value
** must remain a power of two.
*/
#define N_PG_HASH   2048

/*
** Map a page number onto a hash bucket index in the range
** 0 .. N_PG_HASH-1.
*/
#define pager_hash(PN)   ((PN)&(N_PG_HASH-1))
drhed7c8552001-04-11 14:29:21 +0000125
126/*
127** A open page cache is an instance of the following structure.
128*/
129struct Pager {
130 char *zFilename; /* Name of the database file */
131 char *zJournal; /* Name of the journal file */
drh8cfbf082001-09-19 13:22:39 +0000132 OsFile fd, jfd; /* File descriptors for database and journal */
drhfa86c412002-02-02 15:01:15 +0000133 OsFile cpfd; /* File descriptor for the checkpoint journal */
drhed7c8552001-04-11 14:29:21 +0000134 int dbSize; /* Number of pages in the file */
drh69688d52001-04-14 16:38:23 +0000135 int origDbSize; /* dbSize before the current change */
drh28be87c2002-11-05 23:03:02 +0000136 int ckptSize; /* Size of database (in pages) at ckpt_begin() */
137 off_t ckptJSize; /* Size of journal at ckpt_begin() */
drhdb48ee02003-01-16 13:42:43 +0000138#ifndef NDEBUG
139 off_t syncJSize; /* Size of journal at last fsync() call */
140#endif
drh968af522003-02-11 14:55:40 +0000141 int nRec; /* Number of pages written to the journal */
142 u32 cksumInit; /* Quasi-random value added to every checksum */
drh9bd47a92003-01-07 14:46:08 +0000143 int ckptNRec; /* Number of records in the checkpoint journal */
drh7e3b0a02001-04-28 16:52:40 +0000144 int nExtra; /* Add this many bytes to each in-memory page */
drh72f82862001-05-24 21:06:34 +0000145 void (*xDestructor)(void*); /* Call this routine when freeing pages */
drhed7c8552001-04-11 14:29:21 +0000146 int nPage; /* Total number of in-memory pages */
drhd9b02572001-04-15 00:37:09 +0000147 int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
drhed7c8552001-04-11 14:29:21 +0000148 int mxPage; /* Maximum number of pages to hold in cache */
drhd9b02572001-04-15 00:37:09 +0000149 int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
drh603240c2002-03-05 01:11:12 +0000150 u8 journalOpen; /* True if journal file descriptors is valid */
drhdb48ee02003-01-16 13:42:43 +0000151 u8 journalStarted; /* True if initial magic of journal is synced */
drhda47d772002-12-02 04:25:19 +0000152 u8 useJournal; /* Do not use a rollback journal on this file */
drh603240c2002-03-05 01:11:12 +0000153 u8 ckptOpen; /* True if the checkpoint journal is open */
drh0f892532002-05-30 12:27:03 +0000154 u8 ckptInUse; /* True we are in a checkpoint */
drhda47d772002-12-02 04:25:19 +0000155 u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/
drh603240c2002-03-05 01:11:12 +0000156 u8 noSync; /* Do not sync the journal if true */
drh968af522003-02-11 14:55:40 +0000157 u8 fullSync; /* Do extra syncs of the journal for robustness */
drh603240c2002-03-05 01:11:12 +0000158 u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
159 u8 errMask; /* One of several kinds of errors */
160 u8 tempFile; /* zFilename is a temporary file */
161 u8 readOnly; /* True for a read-only database */
162 u8 needSync; /* True if an fsync() is needed on the journal */
drha1680452002-04-18 01:56:57 +0000163 u8 dirtyFile; /* True if database file has changed in any way */
drh193a6b42002-07-07 16:52:46 +0000164 u8 alwaysRollback; /* Disable dont_rollback() for all pages */
drh603240c2002-03-05 01:11:12 +0000165 u8 *aInJournal; /* One bit for each page in the database file */
166 u8 *aInCkpt; /* One bit for each page in the database */
drhed7c8552001-04-11 14:29:21 +0000167 PgHdr *pFirst, *pLast; /* List of free pages */
drh341eae82003-01-21 02:39:36 +0000168 PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */
drhd9b02572001-04-15 00:37:09 +0000169 PgHdr *pAll; /* List of all pages */
drh03eb96a2002-11-10 23:32:56 +0000170 PgHdr *pCkpt; /* List of pages in the checkpoint journal */
drhed7c8552001-04-11 14:29:21 +0000171 PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */
drhd9b02572001-04-15 00:37:09 +0000172};
173
/*
** Bit values that may be OR-ed together in Pager.errMask.
*/
#define PAGER_ERR_FULL      0x01   /* a write() failed */
#define PAGER_ERR_MEM       0x02   /* malloc() failed */
#define PAGER_ERR_LOCK      0x04   /* error in the locking protocol */
#define PAGER_ERR_CORRUPT   0x08   /* database or journal corruption */
#define PAGER_ERR_DISK      0x10   /* general disk I/O error - bad hard drive? */
drhd9b02572001-04-15 00:37:09 +0000182
183/*
184** The journal file contains page records in the following
185** format.
drh968af522003-02-11 14:55:40 +0000186**
187** Actually, this structure is the complete page record for pager
188** formats less than 3. Beginning with format 3, this record is surrounded
189** by two checksums.
drhd9b02572001-04-15 00:37:09 +0000190*/
191typedef struct PageRecord PageRecord;
192struct PageRecord {
193 Pgno pgno; /* The page number */
194 char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */
195};
196
197/*
drh5e00f6c2001-09-13 13:46:56 +0000198** Journal files begin with the following magic string. The data
199** was obtained from /dev/random. It is used only as a sanity check.
drh94f33312002-08-12 12:29:56 +0000200**
drh968af522003-02-11 14:55:40 +0000201** There are three journal formats (so far). The 1st journal format writes
** 32-bit integers in the byte-order of the host machine.  Newer
** formats write integers as big-endian.  All new journals use the
drh94f33312002-08-12 12:29:56 +0000204** new format, but we have to be able to read an older journal in order
drh968af522003-02-11 14:55:40 +0000205** to rollback journals created by older versions of the library.
206**
207** The 3rd journal format (added for 2.8.0) adds additional sanity
208** checking information to the journal. If the power fails while the
209** journal is being written, semi-random garbage data might appear in
210** the journal file after power is restored. If an attempt is then made
211** to roll the journal back, the database could be corrupted. The additional
212** sanity checking data is an attempt to discover the garbage in the
213** journal and ignore it.
214**
215** The sanity checking information for the 3rd journal format consists
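** of a 32-bit checksum on each page of data.  The checksum is meant to
** cover both the page number and the SQLITE_PAGE_SIZE bytes of data for
** the page (the current pager_cksum() implementation folds in only the
** page number, not the data bytes).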
218** This cksum is initialized to a 32-bit random value that appears in the
219** journal file right after the header. The random initializer is important,
220** because garbage data that appears at the end of a journal is likely
221** data that was once in other files that have now been deleted. If the
222** garbage data came from an obsolete journal file, the checksums might
223** be correct. But by initializing the checksum to random value which
224** is different for every journal, we minimize that risk.
drhd9b02572001-04-15 00:37:09 +0000225*/
/* Magic string for journal format 1 */
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd4,
};

/* Magic string for journal format 2; only the final byte differs */
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd5,
};

/* Magic string for journal format 3; only the final byte differs */
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd6,
};

/* Numeric identifiers for the three journal formats */
#define JOURNAL_FORMAT_1   1
#define JOURNAL_FORMAT_2   2
#define JOURNAL_FORMAT_3   3
drh94f33312002-08-12 12:29:56 +0000238
/*
** Format used when creating new primary journal files.  The default is
** always format 3.  In test builds journal_format is a real variable so
** that it can be set to an older format, making it possible to verify
** that newer versions of the library can still roll back older journal
** files.  In all other builds it is a compile-time constant.
**
** Note that checkpoint journals always use format 2 and omit the header.
*/
#ifdef SQLITE_TEST
int journal_format = 3;
#else
# define journal_format  3
#endif
drhed7c8552001-04-11 14:29:21 +0000253
/*
** Sizes, in bytes, of the journal header and of a single page record
** for journal format X.  Formats 3 and later add two 32-bit words to
** the header and one 32-bit word to every page record.
*/
#define JOURNAL_HDR_SZ(X) \
   (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3 ? 2*sizeof(u32) : 0))
#define JOURNAL_PG_SZ(X) \
   (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3 ? sizeof(u32) : 0))
263
/*
** Reference count tracking, available in test builds only.
**
** When pager_refinfo_enable is nonzero, REFINFO(p) prints the page
** number, the address of the page data and the reference count of
** page p on stdout.  In non-test builds REFINFO() expands to nothing.
**
** The page number and the data address are converted explicitly to the
** types that the "%d" and "%x" conversions expect.  On platforms where
** pointers are wider than unsigned int only the low-order bits of the
** address are printed.
*/
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    printf(
       "REFCNT: %4d addr=0x%08x nRef=%d\n",
       (int)p->pgno, (unsigned int)(size_t)PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
282
283/*
drh94f33312002-08-12 12:29:56 +0000284** Read a 32-bit integer from the given file descriptor
285*/
drh968af522003-02-11 14:55:40 +0000286static int read32bits(int format, OsFile *fd, u32 *pRes){
drh94f33312002-08-12 12:29:56 +0000287 u32 res;
288 int rc;
289 rc = sqliteOsRead(fd, &res, sizeof(res));
drh968af522003-02-11 14:55:40 +0000290 if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){
drh94f33312002-08-12 12:29:56 +0000291 unsigned char ac[4];
292 memcpy(ac, &res, 4);
293 res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3];
294 }
295 *pRes = res;
296 return rc;
297}
298
299/*
300** Write a 32-bit integer into the given file descriptor. Writing
301** is always done using the new journal format.
302*/
303static int write32bits(OsFile *fd, u32 val){
304 unsigned char ac[4];
drh968af522003-02-11 14:55:40 +0000305 if( journal_format<=1 ){
drh94f33312002-08-12 12:29:56 +0000306 return sqliteOsWrite(fd, &val, 4);
307 }
drh94f33312002-08-12 12:29:56 +0000308 ac[0] = (val>>24) & 0xff;
309 ac[1] = (val>>16) & 0xff;
310 ac[2] = (val>>8) & 0xff;
311 ac[3] = val & 0xff;
312 return sqliteOsWrite(fd, ac, 4);
313}
314
drh2554f8b2003-01-22 01:26:44 +0000315/*
316** Write a 32-bit integer into a page header right before the
317** page data. This will overwrite the PgHdr.pDirty pointer.
318*/
drh968af522003-02-11 14:55:40 +0000319static void store32bits(u32 val, PgHdr *p, int offset){
drh2554f8b2003-01-22 01:26:44 +0000320 unsigned char *ac;
drh968af522003-02-11 14:55:40 +0000321 ac = &((char*)PGHDR_TO_DATA(p))[offset];
322 if( journal_format<=1 ){
drh2554f8b2003-01-22 01:26:44 +0000323 memcpy(ac, &val, 4);
324 }else{
325 ac[0] = (val>>24) & 0xff;
326 ac[1] = (val>>16) & 0xff;
327 ac[2] = (val>>8) & 0xff;
328 ac[3] = val & 0xff;
329 }
330}
331
drh94f33312002-08-12 12:29:56 +0000332
333/*
drhd9b02572001-04-15 00:37:09 +0000334** Convert the bits in the pPager->errMask into an approprate
335** return code.
336*/
337static int pager_errcode(Pager *pPager){
338 int rc = SQLITE_OK;
339 if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL;
drh81a20f22001-10-12 17:30:04 +0000340 if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR;
drhd9b02572001-04-15 00:37:09 +0000341 if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL;
342 if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM;
343 if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT;
344 return rc;
drhed7c8552001-04-11 14:29:21 +0000345}
346
347/*
drh03eb96a2002-11-10 23:32:56 +0000348** Add or remove a page from the list of all pages that are in the
349** checkpoint journal.
350**
351** The Pager keeps a separate list of pages that are currently in
352** the checkpoint journal. This helps the sqlitepager_ckpt_commit()
353** routine run MUCH faster for the common case where there are many
354** pages in memory but only a few are in the checkpoint journal.
355*/
356static void page_add_to_ckpt_list(PgHdr *pPg){
357 Pager *pPager = pPg->pPager;
358 if( pPg->inCkpt ) return;
359 assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
360 pPg->pPrevCkpt = 0;
361 if( pPager->pCkpt ){
362 pPager->pCkpt->pPrevCkpt = pPg;
363 }
364 pPg->pNextCkpt = pPager->pCkpt;
365 pPager->pCkpt = pPg;
366 pPg->inCkpt = 1;
367}
368static void page_remove_from_ckpt_list(PgHdr *pPg){
369 if( !pPg->inCkpt ) return;
370 if( pPg->pPrevCkpt ){
371 assert( pPg->pPrevCkpt->pNextCkpt==pPg );
372 pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt;
373 }else{
374 assert( pPg->pPager->pCkpt==pPg );
375 pPg->pPager->pCkpt = pPg->pNextCkpt;
376 }
377 if( pPg->pNextCkpt ){
378 assert( pPg->pNextCkpt->pPrevCkpt==pPg );
379 pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt;
380 }
381 pPg->pNextCkpt = 0;
382 pPg->pPrevCkpt = 0;
383 pPg->inCkpt = 0;
384}
385
386/*
drhed7c8552001-04-11 14:29:21 +0000387** Find a page in the hash table given its page number. Return
388** a pointer to the page or NULL if not found.
389*/
drhd9b02572001-04-15 00:37:09 +0000390static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
drh836faa42003-01-11 13:30:57 +0000391 PgHdr *p = pPager->aHash[pager_hash(pgno)];
drhed7c8552001-04-11 14:29:21 +0000392 while( p && p->pgno!=pgno ){
393 p = p->pNextHash;
394 }
395 return p;
396}
397
398/*
399** Unlock the database and clear the in-memory cache. This routine
400** sets the state of the pager back to what it was when it was first
401** opened. Any outstanding pages are invalidated and subsequent attempts
402** to access those pages will likely result in a coredump.
403*/
drhd9b02572001-04-15 00:37:09 +0000404static void pager_reset(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +0000405 PgHdr *pPg, *pNext;
drhd9b02572001-04-15 00:37:09 +0000406 for(pPg=pPager->pAll; pPg; pPg=pNext){
407 pNext = pPg->pNextAll;
408 sqliteFree(pPg);
drhed7c8552001-04-11 14:29:21 +0000409 }
410 pPager->pFirst = 0;
drh341eae82003-01-21 02:39:36 +0000411 pPager->pFirstSynced = 0;
drhd9b02572001-04-15 00:37:09 +0000412 pPager->pLast = 0;
413 pPager->pAll = 0;
drhed7c8552001-04-11 14:29:21 +0000414 memset(pPager->aHash, 0, sizeof(pPager->aHash));
415 pPager->nPage = 0;
drhfa86c412002-02-02 15:01:15 +0000416 if( pPager->state>=SQLITE_WRITELOCK ){
drhd9b02572001-04-15 00:37:09 +0000417 sqlitepager_rollback(pPager);
drhed7c8552001-04-11 14:29:21 +0000418 }
drha7fcb052001-12-14 15:09:55 +0000419 sqliteOsUnlock(&pPager->fd);
drhed7c8552001-04-11 14:29:21 +0000420 pPager->state = SQLITE_UNLOCK;
drhd9b02572001-04-15 00:37:09 +0000421 pPager->dbSize = -1;
drhed7c8552001-04-11 14:29:21 +0000422 pPager->nRef = 0;
drh8cfbf082001-09-19 13:22:39 +0000423 assert( pPager->journalOpen==0 );
drhed7c8552001-04-11 14:29:21 +0000424}
425
426/*
427** When this routine is called, the pager has the journal file open and
428** a write lock on the database. This routine releases the database
429** write lock and acquires a read lock in its place. The journal file
430** is deleted and closed.
drhed7c8552001-04-11 14:29:21 +0000431*/
drhd9b02572001-04-15 00:37:09 +0000432static int pager_unwritelock(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +0000433 int rc;
drhd9b02572001-04-15 00:37:09 +0000434 PgHdr *pPg;
drhfa86c412002-02-02 15:01:15 +0000435 if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK;
drh663fc632002-02-02 18:49:19 +0000436 sqlitepager_ckpt_commit(pPager);
drh0f892532002-05-30 12:27:03 +0000437 if( pPager->ckptOpen ){
438 sqliteOsClose(&pPager->cpfd);
439 pPager->ckptOpen = 0;
440 }
drhda47d772002-12-02 04:25:19 +0000441 if( pPager->journalOpen ){
442 sqliteOsClose(&pPager->jfd);
443 pPager->journalOpen = 0;
444 sqliteOsDelete(pPager->zJournal);
445 sqliteFree( pPager->aInJournal );
446 pPager->aInJournal = 0;
447 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
448 pPg->inJournal = 0;
449 pPg->dirty = 0;
drhdb48ee02003-01-16 13:42:43 +0000450 pPg->needSync = 0;
drhda47d772002-12-02 04:25:19 +0000451 }
452 }else{
453 assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
drhd9b02572001-04-15 00:37:09 +0000454 }
drhda47d772002-12-02 04:25:19 +0000455 rc = sqliteOsReadLock(&pPager->fd);
drh8e298f92002-07-06 16:28:47 +0000456 if( rc==SQLITE_OK ){
457 pPager->state = SQLITE_READLOCK;
458 }else{
459 /* This can only happen if a process does a BEGIN, then forks and the
460 ** child process does the COMMIT. Because of the semantics of unix
461 ** file locking, the unlock will fail.
462 */
463 pPager->state = SQLITE_UNLOCK;
464 }
drhed7c8552001-04-11 14:29:21 +0000465 return rc;
466}
467
drhed7c8552001-04-11 14:29:21 +0000468/*
drh968af522003-02-11 14:55:40 +0000469** Compute and return a checksum for the page of data.
470*/
471static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
472 u32 cksum = pPager->cksumInit + pgno;
473 /* const u8 *a = (const u8*)aData;
474 int i;
475 for(i=0; i<SQLITE_PAGE_SIZE; i++){ cksum += a[i]; } */
476 /* fprintf(stderr,"CKSUM for %p(%08x) page %d: %08x\n", pPager, pPager->cksumInit, pgno, cksum); */
477 return cksum;
478}
479
480/*
drhfa86c412002-02-02 15:01:15 +0000481** Read a single page from the journal file opened on file descriptor
482** jfd. Playback this one page.
drh968af522003-02-11 14:55:40 +0000483**
484** There are three different journal formats. The format parameter determines
485** which format is used by the journal that is played back.
drhfa86c412002-02-02 15:01:15 +0000486*/
drh968af522003-02-11 14:55:40 +0000487static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
drhfa86c412002-02-02 15:01:15 +0000488 int rc;
489 PgHdr *pPg; /* An existing page in the cache */
490 PageRecord pgRec;
drh968af522003-02-11 14:55:40 +0000491 u32 cksum;
drhfa86c412002-02-02 15:01:15 +0000492
drh968af522003-02-11 14:55:40 +0000493 rc = read32bits(format, jfd, &pgRec.pgno);
494 if( rc!=SQLITE_OK ) return SQLITE_DONE;
drh94f33312002-08-12 12:29:56 +0000495 rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
drh968af522003-02-11 14:55:40 +0000496 if( rc!=SQLITE_OK ) return SQLITE_DONE;
drhfa86c412002-02-02 15:01:15 +0000497
drh968af522003-02-11 14:55:40 +0000498 /* Sanity checking on the page. This is more important that I originally
499 ** thought. If a power failure occurs while the journal is being written,
500 ** it could cause invalid data to be written into the journal. We need to
501 ** detect this invalid data (with high probability) and ignore it.
502 */
503 if( pgRec.pgno==0 ){
504 return SQLITE_DONE;
505 }
506 if( pgRec.pgno>pPager->dbSize ){
507 return SQLITE_OK;
508 }
509 if( format>=JOURNAL_FORMAT_3 ){
510 rc = read32bits(format, jfd, &cksum);
511 if( rc ) return SQLITE_DONE;
512 if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
513 return SQLITE_DONE;
514 }
515 }
drhfa86c412002-02-02 15:01:15 +0000516
517 /* Playback the page. Update the in-memory copy of the page
518 ** at the same time, if there is one.
519 */
520 pPg = pager_lookup(pPager, pgRec.pgno);
drhdb48ee02003-01-16 13:42:43 +0000521 if( pPg==0 || pPg->needSync==0 ){
522 TRACE2("PLAYBACK %d\n", pgRec.pgno);
523 sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
524 rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
525 }
drhfa86c412002-02-02 15:01:15 +0000526 if( pPg ){
drh3a840692003-01-29 22:58:26 +0000527 if( pPg->nRef==0 ||
528 memcmp(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE)==0
529 ){
530 /* Do not update the data on this page if the page is in use
531 ** and the page has never been modified. This avoids resetting
532 ** the "extra" data. That in turn avoids invalidating BTree cursors
533 ** in trees that have never been modified. The end result is that
534 ** you can have a SELECT going on in one table and ROLLBACK changes
535 ** to a different table and the SELECT is unaffected by the ROLLBACK.
536 */
537 memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
538 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
539 }
drhdb48ee02003-01-16 13:42:43 +0000540 pPg->dirty = 0;
541 pPg->needSync = 0;
drhfa86c412002-02-02 15:01:15 +0000542 }
543 return rc;
544}
545
546/*
drhed7c8552001-04-11 14:29:21 +0000547** Playback the journal and thus restore the database file to
548** the state it was in before we started making changes.
549**
drhd9b02572001-04-15 00:37:09 +0000550** The journal file format is as follows: There is an initial
551** file-type string for sanity checking. Then there is a single
552** Pgno number which is the number of pages in the database before
553** changes were made. The database is truncated to this size.
drh306dc212001-05-21 13:45:10 +0000554** Next come zero or more page records where each page record
555** consists of a Pgno and SQLITE_PAGE_SIZE bytes of data. See
556** the PageRecord structure for details.
drhed7c8552001-04-11 14:29:21 +0000557**
drhd9b02572001-04-15 00:37:09 +0000558** If the file opened as the journal file is not a well-formed
559** journal file (as determined by looking at the magic number
560** at the beginning) then this routine returns SQLITE_PROTOCOL.
561** If any other errors occur during playback, the database will
562** likely be corrupted, so the PAGER_ERR_CORRUPT bit is set in
563** pPager->errMask and SQLITE_CORRUPT is returned. If it all
564** works, then this routine returns SQLITE_OK.
drhed7c8552001-04-11 14:29:21 +0000565*/
drhd9b02572001-04-15 00:37:09 +0000566static int pager_playback(Pager *pPager){
drh968af522003-02-11 14:55:40 +0000567 off_t szJ; /* Size of the journal file in bytes */
568 int nRec; /* Number of Records in the journal */
drhd9b02572001-04-15 00:37:09 +0000569 int i; /* Loop counter */
570 Pgno mxPg = 0; /* Size of the original file in pages */
drh968af522003-02-11 14:55:40 +0000571 int format; /* Format of the journal file. */
572 unsigned char aMagic[sizeof(aJournalMagic1)];
drhed7c8552001-04-11 14:29:21 +0000573 int rc;
574
drhc3a64ba2001-11-22 00:01:27 +0000575 /* Figure out how many records are in the journal. Abort early if
576 ** the journal is empty.
drhed7c8552001-04-11 14:29:21 +0000577 */
drh8cfbf082001-09-19 13:22:39 +0000578 assert( pPager->journalOpen );
drha7fcb052001-12-14 15:09:55 +0000579 sqliteOsSeek(&pPager->jfd, 0);
drh968af522003-02-11 14:55:40 +0000580 rc = sqliteOsFileSize(&pPager->jfd, &szJ);
drhc3a64ba2001-11-22 00:01:27 +0000581 if( rc!=SQLITE_OK ){
582 goto end_playback;
583 }
drh968af522003-02-11 14:55:40 +0000584 if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
drhc3a64ba2001-11-22 00:01:27 +0000585 goto end_playback;
586 }
587
588 /* Read the beginning of the journal and truncate the
589 ** database file back to its original size.
590 */
drha7fcb052001-12-14 15:09:55 +0000591 rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
drh94f33312002-08-12 12:29:56 +0000592 if( rc!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000593 rc = SQLITE_PROTOCOL;
594 goto end_playback;
drhd9b02572001-04-15 00:37:09 +0000595 }
drh968af522003-02-11 14:55:40 +0000596 if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
597 format = JOURNAL_FORMAT_3;
598 }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
599 format = JOURNAL_FORMAT_2;
600 }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
601 format = JOURNAL_FORMAT_1;
drh94f33312002-08-12 12:29:56 +0000602 }else{
603 rc = SQLITE_PROTOCOL;
604 goto end_playback;
605 }
drh968af522003-02-11 14:55:40 +0000606 if( format>=JOURNAL_FORMAT_3 ){
607 rc = read32bits(format, &pPager->jfd, &nRec);
608 if( rc ) goto end_playback;
609 rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
610 if( rc ) goto end_playback;
611 if( nRec==0xffffffff ){
612 nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
613 }
614 }else{
drhd8d66e82003-02-12 02:10:15 +0000615 nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
616 assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
drh968af522003-02-11 14:55:40 +0000617 }
618 rc = read32bits(format, &pPager->jfd, &mxPg);
drhd9b02572001-04-15 00:37:09 +0000619 if( rc!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000620 goto end_playback;
drhd9b02572001-04-15 00:37:09 +0000621 }
drhd8d66e82003-02-12 02:10:15 +0000622 assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
drh28be87c2002-11-05 23:03:02 +0000623 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
drh81a20f22001-10-12 17:30:04 +0000624 if( rc!=SQLITE_OK ){
625 goto end_playback;
626 }
drhd9b02572001-04-15 00:37:09 +0000627 pPager->dbSize = mxPg;
628
drhfa86c412002-02-02 15:01:15 +0000629 /* Copy original pages out of the journal and back into the database file.
drhed7c8552001-04-11 14:29:21 +0000630 */
drh968af522003-02-11 14:55:40 +0000631 for(i=0; i<nRec; i++){
632 rc = pager_playback_one_page(pPager, &pPager->jfd, format);
633 if( rc!=SQLITE_OK ){
634 if( rc==SQLITE_DONE ){
drh968af522003-02-11 14:55:40 +0000635 rc = SQLITE_OK;
636 }
637 break;
638 }
drhed7c8552001-04-11 14:29:21 +0000639 }
drh81a20f22001-10-12 17:30:04 +0000640
drhdb48ee02003-01-16 13:42:43 +0000641
drh81a20f22001-10-12 17:30:04 +0000642end_playback:
drhdb48ee02003-01-16 13:42:43 +0000643#if !defined(NDEBUG) && defined(SQLITE_TEST)
644 /* For pages that were never written into the journal, restore the
645 ** memory copy from the original database file.
646 **
647 ** This is code is used during testing only. It is necessary to
648 ** compensate for the sqliteOsTruncate() call inside
649 ** sqlitepager_rollback().
650 */
651 if( rc==SQLITE_OK ){
652 PgHdr *pPg;
653 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
drh3a840692003-01-29 22:58:26 +0000654 char zBuf[SQLITE_PAGE_SIZE];
drhdb48ee02003-01-16 13:42:43 +0000655 if( (int)pPg->pgno <= pPager->origDbSize ){
656 sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
drh3a840692003-01-29 22:58:26 +0000657 rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
drhdb48ee02003-01-16 13:42:43 +0000658 if( rc ) break;
659 }else{
drh3a840692003-01-29 22:58:26 +0000660 memset(zBuf, 0, SQLITE_PAGE_SIZE);
drhdb48ee02003-01-16 13:42:43 +0000661 }
drh3a840692003-01-29 22:58:26 +0000662 if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
663 memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
664 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
665 }
drhdb48ee02003-01-16 13:42:43 +0000666 pPg->needSync = 0;
667 pPg->dirty = 0;
668 }
669 }
670#endif
drhd9b02572001-04-15 00:37:09 +0000671 if( rc!=SQLITE_OK ){
672 pager_unwritelock(pPager);
673 pPager->errMask |= PAGER_ERR_CORRUPT;
674 rc = SQLITE_CORRUPT;
675 }else{
676 rc = pager_unwritelock(pPager);
drhed7c8552001-04-11 14:29:21 +0000677 }
drhd9b02572001-04-15 00:37:09 +0000678 return rc;
drhed7c8552001-04-11 14:29:21 +0000679}
680
681/*
drhfa86c412002-02-02 15:01:15 +0000682** Playback the checkpoint journal.
683**
684** This is similar to playing back the transaction journal but with
685** a few extra twists.
686**
drh663fc632002-02-02 18:49:19 +0000687** (1) The number of pages in the database file at the start of
688** the checkpoint is stored in pPager->ckptSize, not in the
689** journal file itself.
drhfa86c412002-02-02 15:01:15 +0000690**
691** (2) In addition to playing back the checkpoint journal, also
692** playback all pages of the transaction journal beginning
693** at offset pPager->ckptJSize.
694*/
695static int pager_ckpt_playback(Pager *pPager){
drh968af522003-02-11 14:55:40 +0000696 off_t szJ; /* Size of the full journal */
697 int nRec; /* Number of Records */
drhfa86c412002-02-02 15:01:15 +0000698 int i; /* Loop counter */
699 int rc;
700
701 /* Truncate the database back to its original size.
702 */
drh28be87c2002-11-05 23:03:02 +0000703 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
drhfa86c412002-02-02 15:01:15 +0000704 pPager->dbSize = pPager->ckptSize;
705
706 /* Figure out how many records are in the checkpoint journal.
707 */
drh0f892532002-05-30 12:27:03 +0000708 assert( pPager->ckptInUse && pPager->journalOpen );
drhfa86c412002-02-02 15:01:15 +0000709 sqliteOsSeek(&pPager->cpfd, 0);
drh9bd47a92003-01-07 14:46:08 +0000710 nRec = pPager->ckptNRec;
drhfa86c412002-02-02 15:01:15 +0000711
712 /* Copy original pages out of the checkpoint journal and back into the
drh968af522003-02-11 14:55:40 +0000713 ** database file. Note that the checkpoint journal always uses format
714 ** 2 instead of format 3 since it does not need to be concerned with
715 ** power failures corrupting the journal and can thus omit the checksums.
drhfa86c412002-02-02 15:01:15 +0000716 */
717 for(i=nRec-1; i>=0; i--){
drh968af522003-02-11 14:55:40 +0000718 rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
719 assert( rc!=SQLITE_DONE );
drhfa86c412002-02-02 15:01:15 +0000720 if( rc!=SQLITE_OK ) goto end_ckpt_playback;
721 }
722
723 /* Figure out how many pages need to be copied out of the transaction
724 ** journal.
725 */
726 rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
727 if( rc!=SQLITE_OK ){
728 goto end_ckpt_playback;
729 }
drh968af522003-02-11 14:55:40 +0000730 rc = sqliteOsFileSize(&pPager->jfd, &szJ);
drhfa86c412002-02-02 15:01:15 +0000731 if( rc!=SQLITE_OK ){
732 goto end_ckpt_playback;
733 }
drh968af522003-02-11 14:55:40 +0000734 nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
drhfa86c412002-02-02 15:01:15 +0000735 for(i=nRec-1; i>=0; i--){
drh968af522003-02-11 14:55:40 +0000736 rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
737 if( rc!=SQLITE_OK ){
738 assert( rc!=SQLITE_DONE );
739 goto end_ckpt_playback;
740 }
drhfa86c412002-02-02 15:01:15 +0000741 }
742
drhfa86c412002-02-02 15:01:15 +0000743end_ckpt_playback:
drhfa86c412002-02-02 15:01:15 +0000744 if( rc!=SQLITE_OK ){
drhfa86c412002-02-02 15:01:15 +0000745 pPager->errMask |= PAGER_ERR_CORRUPT;
746 rc = SQLITE_CORRUPT;
drhfa86c412002-02-02 15:01:15 +0000747 }
748 return rc;
749}
750
751/*
drhf57b14a2001-09-14 18:54:08 +0000752** Change the maximum number of in-memory pages that are allowed.
drhcd61c282002-03-06 22:01:34 +0000753**
754** The maximum number is the absolute value of the mxPage parameter.
755** If mxPage is negative, the noSync flag is also set. noSync bypasses
756** calls to sqliteOsSync(). The pager runs much faster with noSync on,
757** but if the operating system crashes or there is an abrupt power
758** failure, the database file might be left in an inconsistent and
759** unrepairable state.
drhf57b14a2001-09-14 18:54:08 +0000760*/
761void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
drh603240c2002-03-05 01:11:12 +0000762 if( mxPage>=0 ){
drha1680452002-04-18 01:56:57 +0000763 pPager->noSync = pPager->tempFile;
drh603240c2002-03-05 01:11:12 +0000764 }else{
765 pPager->noSync = 1;
766 mxPage = -mxPage;
767 }
drhf57b14a2001-09-14 18:54:08 +0000768 if( mxPage>10 ){
769 pPager->mxPage = mxPage;
770 }
771}
772
773/*
drhfa86c412002-02-02 15:01:15 +0000774** Open a temporary file. Write the name of the file into zName
775** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write
776** the file descriptor into *fd. Return SQLITE_OK on success or some
777** other error code if we fail.
778**
779** The OS will automatically delete the temporary file when it is
780** closed.
781*/
782static int sqlitepager_opentemp(char *zFile, OsFile *fd){
783 int cnt = 8;
784 int rc;
785 do{
786 cnt--;
787 sqliteOsTempFileName(zFile);
788 rc = sqliteOsOpenExclusive(zFile, fd, 1);
789 }while( cnt>0 && rc!=SQLITE_OK );
790 return rc;
791}
792
793/*
drhed7c8552001-04-11 14:29:21 +0000794** Create a new page cache and put a pointer to the page cache in *ppPager.
drh5e00f6c2001-09-13 13:46:56 +0000795** The file to be cached need not exist. The file is not locked until
drhd9b02572001-04-15 00:37:09 +0000796** the first call to sqlitepager_get() and is only held open until the
797** last page is released using sqlitepager_unref().
drh382c0242001-10-06 16:33:02 +0000798**
drh6446c4d2001-12-15 14:22:18 +0000799** If zFilename is NULL then a randomly-named temporary file is created
800** and used as the file to be cached. The file will be deleted
801** automatically when it is closed.
drhed7c8552001-04-11 14:29:21 +0000802*/
drh7e3b0a02001-04-28 16:52:40 +0000803int sqlitepager_open(
804 Pager **ppPager, /* Return the Pager structure here */
805 const char *zFilename, /* Name of the database file to open */
806 int mxPage, /* Max number of in-memory cache pages */
drhda47d772002-12-02 04:25:19 +0000807 int nExtra, /* Extra bytes append to each in-memory page */
808 int useJournal /* TRUE to use a rollback journal on this file */
drh7e3b0a02001-04-28 16:52:40 +0000809){
drhed7c8552001-04-11 14:29:21 +0000810 Pager *pPager;
drh3e7a6092002-12-07 21:45:14 +0000811 char *zFullPathname;
drhed7c8552001-04-11 14:29:21 +0000812 int nameLen;
drh8cfbf082001-09-19 13:22:39 +0000813 OsFile fd;
814 int rc;
drh5e00f6c2001-09-13 13:46:56 +0000815 int tempFile;
816 int readOnly = 0;
drh8cfbf082001-09-19 13:22:39 +0000817 char zTemp[SQLITE_TEMPNAME_SIZE];
drhed7c8552001-04-11 14:29:21 +0000818
drhd9b02572001-04-15 00:37:09 +0000819 *ppPager = 0;
820 if( sqlite_malloc_failed ){
821 return SQLITE_NOMEM;
822 }
drh5e00f6c2001-09-13 13:46:56 +0000823 if( zFilename ){
drh3e7a6092002-12-07 21:45:14 +0000824 zFullPathname = sqliteOsFullPathname(zFilename);
825 rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly);
drh5e00f6c2001-09-13 13:46:56 +0000826 tempFile = 0;
827 }else{
drhfa86c412002-02-02 15:01:15 +0000828 rc = sqlitepager_opentemp(zTemp, &fd);
drh5e00f6c2001-09-13 13:46:56 +0000829 zFilename = zTemp;
drh3e7a6092002-12-07 21:45:14 +0000830 zFullPathname = sqliteOsFullPathname(zFilename);
drh5e00f6c2001-09-13 13:46:56 +0000831 tempFile = 1;
832 }
drh3e7a6092002-12-07 21:45:14 +0000833 if( sqlite_malloc_failed ){
834 return SQLITE_NOMEM;
835 }
drh8cfbf082001-09-19 13:22:39 +0000836 if( rc!=SQLITE_OK ){
drh3e7a6092002-12-07 21:45:14 +0000837 sqliteFree(zFullPathname);
drhed7c8552001-04-11 14:29:21 +0000838 return SQLITE_CANTOPEN;
839 }
drh3e7a6092002-12-07 21:45:14 +0000840 nameLen = strlen(zFullPathname);
drhed7c8552001-04-11 14:29:21 +0000841 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*2 + 30 );
drhd9b02572001-04-15 00:37:09 +0000842 if( pPager==0 ){
drha7fcb052001-12-14 15:09:55 +0000843 sqliteOsClose(&fd);
drh3e7a6092002-12-07 21:45:14 +0000844 sqliteFree(zFullPathname);
drhd9b02572001-04-15 00:37:09 +0000845 return SQLITE_NOMEM;
846 }
drhdb48ee02003-01-16 13:42:43 +0000847 SET_PAGER(pPager);
drhed7c8552001-04-11 14:29:21 +0000848 pPager->zFilename = (char*)&pPager[1];
849 pPager->zJournal = &pPager->zFilename[nameLen+1];
drh3e7a6092002-12-07 21:45:14 +0000850 strcpy(pPager->zFilename, zFullPathname);
851 strcpy(pPager->zJournal, zFullPathname);
852 sqliteFree(zFullPathname);
drhed7c8552001-04-11 14:29:21 +0000853 strcpy(&pPager->zJournal[nameLen], "-journal");
854 pPager->fd = fd;
drh8cfbf082001-09-19 13:22:39 +0000855 pPager->journalOpen = 0;
drhda47d772002-12-02 04:25:19 +0000856 pPager->useJournal = useJournal;
drhfa86c412002-02-02 15:01:15 +0000857 pPager->ckptOpen = 0;
drh0f892532002-05-30 12:27:03 +0000858 pPager->ckptInUse = 0;
drhed7c8552001-04-11 14:29:21 +0000859 pPager->nRef = 0;
860 pPager->dbSize = -1;
drhfa86c412002-02-02 15:01:15 +0000861 pPager->ckptSize = 0;
862 pPager->ckptJSize = 0;
drhed7c8552001-04-11 14:29:21 +0000863 pPager->nPage = 0;
drhd79caeb2001-04-15 02:27:24 +0000864 pPager->mxPage = mxPage>5 ? mxPage : 10;
drhed7c8552001-04-11 14:29:21 +0000865 pPager->state = SQLITE_UNLOCK;
drhd9b02572001-04-15 00:37:09 +0000866 pPager->errMask = 0;
drh5e00f6c2001-09-13 13:46:56 +0000867 pPager->tempFile = tempFile;
868 pPager->readOnly = readOnly;
drhf57b14a2001-09-14 18:54:08 +0000869 pPager->needSync = 0;
drhda47d772002-12-02 04:25:19 +0000870 pPager->noSync = pPager->tempFile || !useJournal;
drhed7c8552001-04-11 14:29:21 +0000871 pPager->pFirst = 0;
drh341eae82003-01-21 02:39:36 +0000872 pPager->pFirstSynced = 0;
drhed7c8552001-04-11 14:29:21 +0000873 pPager->pLast = 0;
drh7c717f72001-06-24 20:39:41 +0000874 pPager->nExtra = nExtra;
drhed7c8552001-04-11 14:29:21 +0000875 memset(pPager->aHash, 0, sizeof(pPager->aHash));
876 *ppPager = pPager;
877 return SQLITE_OK;
878}
879
880/*
drh72f82862001-05-24 21:06:34 +0000881** Set the destructor for this pager. If not NULL, the destructor is called
drh5e00f6c2001-09-13 13:46:56 +0000882** when the reference count on each page reaches zero. The destructor can
883** be used to clean up information in the extra segment appended to each page.
drh72f82862001-05-24 21:06:34 +0000884**
885** The destructor is not called as a result sqlitepager_close().
886** Destructors are only called by sqlitepager_unref().
887*/
888void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
889 pPager->xDestructor = xDesc;
890}
891
892/*
drh5e00f6c2001-09-13 13:46:56 +0000893** Return the total number of pages in the disk file associated with
894** pPager.
drhed7c8552001-04-11 14:29:21 +0000895*/
drhd9b02572001-04-15 00:37:09 +0000896int sqlitepager_pagecount(Pager *pPager){
drh28be87c2002-11-05 23:03:02 +0000897 off_t n;
drhd9b02572001-04-15 00:37:09 +0000898 assert( pPager!=0 );
drhed7c8552001-04-11 14:29:21 +0000899 if( pPager->dbSize>=0 ){
900 return pPager->dbSize;
901 }
drha7fcb052001-12-14 15:09:55 +0000902 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000903 pPager->errMask |= PAGER_ERR_DISK;
drh8cfbf082001-09-19 13:22:39 +0000904 return 0;
drhed7c8552001-04-11 14:29:21 +0000905 }
drh8cfbf082001-09-19 13:22:39 +0000906 n /= SQLITE_PAGE_SIZE;
drhd9b02572001-04-15 00:37:09 +0000907 if( pPager->state!=SQLITE_UNLOCK ){
drhed7c8552001-04-11 14:29:21 +0000908 pPager->dbSize = n;
909 }
910 return n;
911}
912
913/*
914** Shutdown the page cache. Free all memory and close all files.
915**
916** If a transaction was in progress when this routine is called, that
917** transaction is rolled back. All outstanding pages are invalidated
918** and their memory is freed. Any attempt to use a page associated
919** with this page cache after this function returns will likely
920** result in a coredump.
921*/
drhd9b02572001-04-15 00:37:09 +0000922int sqlitepager_close(Pager *pPager){
923 PgHdr *pPg, *pNext;
drhed7c8552001-04-11 14:29:21 +0000924 switch( pPager->state ){
925 case SQLITE_WRITELOCK: {
drhd9b02572001-04-15 00:37:09 +0000926 sqlitepager_rollback(pPager);
drha7fcb052001-12-14 15:09:55 +0000927 sqliteOsUnlock(&pPager->fd);
drh8cfbf082001-09-19 13:22:39 +0000928 assert( pPager->journalOpen==0 );
drhed7c8552001-04-11 14:29:21 +0000929 break;
930 }
931 case SQLITE_READLOCK: {
drha7fcb052001-12-14 15:09:55 +0000932 sqliteOsUnlock(&pPager->fd);
drhed7c8552001-04-11 14:29:21 +0000933 break;
934 }
935 default: {
936 /* Do nothing */
937 break;
938 }
939 }
drhd9b02572001-04-15 00:37:09 +0000940 for(pPg=pPager->pAll; pPg; pPg=pNext){
941 pNext = pPg->pNextAll;
942 sqliteFree(pPg);
drhed7c8552001-04-11 14:29:21 +0000943 }
drha7fcb052001-12-14 15:09:55 +0000944 sqliteOsClose(&pPager->fd);
drh8cfbf082001-09-19 13:22:39 +0000945 assert( pPager->journalOpen==0 );
drh0f892532002-05-30 12:27:03 +0000946 /* Temp files are automatically deleted by the OS
947 ** if( pPager->tempFile ){
948 ** sqliteOsDelete(pPager->zFilename);
949 ** }
950 */
drhdb48ee02003-01-16 13:42:43 +0000951 CLR_PAGER(pPager);
drhed7c8552001-04-11 14:29:21 +0000952 sqliteFree(pPager);
953 return SQLITE_OK;
954}
955
956/*
drh5e00f6c2001-09-13 13:46:56 +0000957** Return the page number for the given page data.
drhed7c8552001-04-11 14:29:21 +0000958*/
drhd9b02572001-04-15 00:37:09 +0000959Pgno sqlitepager_pagenumber(void *pData){
drhed7c8552001-04-11 14:29:21 +0000960 PgHdr *p = DATA_TO_PGHDR(pData);
961 return p->pgno;
962}
963
964/*
drh7e3b0a02001-04-28 16:52:40 +0000965** Increment the reference count for a page. If the page is
966** currently on the freelist (the reference count is zero) then
967** remove it from the freelist.
968*/
drh836faa42003-01-11 13:30:57 +0000969#define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
970static void _page_ref(PgHdr *pPg){
drh7e3b0a02001-04-28 16:52:40 +0000971 if( pPg->nRef==0 ){
972 /* The page is currently on the freelist. Remove it. */
drh341eae82003-01-21 02:39:36 +0000973 if( pPg==pPg->pPager->pFirstSynced ){
974 PgHdr *p = pPg->pNextFree;
975 while( p && p->needSync ){ p = p->pNextFree; }
976 pPg->pPager->pFirstSynced = p;
977 }
drh7e3b0a02001-04-28 16:52:40 +0000978 if( pPg->pPrevFree ){
979 pPg->pPrevFree->pNextFree = pPg->pNextFree;
980 }else{
981 pPg->pPager->pFirst = pPg->pNextFree;
982 }
983 if( pPg->pNextFree ){
984 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
985 }else{
986 pPg->pPager->pLast = pPg->pPrevFree;
987 }
988 pPg->pPager->nRef++;
989 }
990 pPg->nRef++;
drhdd793422001-06-28 01:54:48 +0000991 REFINFO(pPg);
drhdf0b3b02001-06-23 11:36:20 +0000992}
993
994/*
995** Increment the reference count for a page. The input pointer is
996** a reference to the page data.
997*/
998int sqlitepager_ref(void *pData){
999 PgHdr *pPg = DATA_TO_PGHDR(pData);
1000 page_ref(pPg);
drh8c42ca92001-06-22 19:15:00 +00001001 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001002}
1003
1004/*
drhb19a2bc2001-09-16 00:13:26 +00001005** Sync the journal and then write all free dirty pages to the database
1006** file.
1007**
1008** Writing all free dirty pages to the database after the sync is a
1009** non-obvious optimization. fsync() is an expensive operation so we
drhaaab5722002-02-19 13:39:21 +00001010** want to minimize the number ot times it is called. After an fsync() call,
drh6446c4d2001-12-15 14:22:18 +00001011** we are free to write dirty pages back to the database. It is best
1012** to go ahead and write as many dirty pages as possible to minimize
1013** the risk of having to do another fsync() later on. Writing dirty
1014** free pages in this way was observed to make database operations go
1015** up to 10 times faster.
drhfa86c412002-02-02 15:01:15 +00001016**
1017** If we are writing to temporary database, there is no need to preserve
1018** the integrity of the journal file, so we can save time and skip the
1019** fsync().
drh50e5dad2001-09-15 00:57:28 +00001020*/
1021static int syncAllPages(Pager *pPager){
1022 PgHdr *pPg;
1023 int rc = SQLITE_OK;
drh03eb96a2002-11-10 23:32:56 +00001024
1025 /* Sync the journal before modifying the main database
1026 ** (assuming there is a journal and it needs to be synced.)
1027 */
drh50e5dad2001-09-15 00:57:28 +00001028 if( pPager->needSync ){
drhfa86c412002-02-02 15:01:15 +00001029 if( !pPager->tempFile ){
drhdb48ee02003-01-16 13:42:43 +00001030 assert( pPager->journalOpen );
1031 assert( !pPager->noSync );
drh968af522003-02-11 14:55:40 +00001032#ifndef NDEBUG
1033 {
1034 off_t hdrSz, pgSz;
1035 hdrSz = JOURNAL_HDR_SZ(journal_format);
1036 pgSz = JOURNAL_PG_SZ(journal_format);
1037 rc = sqliteOsFileSize(&pPager->jfd, &pPager->syncJSize);
1038 if( rc!=0 ) return rc;
1039 assert( pPager->nRec*pgSz+hdrSz==pPager->syncJSize );
1040 }
1041#endif
drhd8d66e82003-02-12 02:10:15 +00001042 if( journal_format>=3 ){
1043 off_t szJ;
1044 if( pPager->fullSync ){
1045 TRACE1("SYNC\n");
1046 rc = sqliteOsSync(&pPager->jfd);
1047 if( rc!=0 ) return rc;
1048 }
1049 sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
1050 write32bits(&pPager->jfd, pPager->nRec);
1051 szJ = JOURNAL_HDR_SZ(journal_format) +
1052 pPager->nRec*JOURNAL_PG_SZ(journal_format);
1053 sqliteOsSeek(&pPager->jfd, szJ);
drh968af522003-02-11 14:55:40 +00001054 }
drhdb48ee02003-01-16 13:42:43 +00001055 TRACE1("SYNC\n");
drhfa86c412002-02-02 15:01:15 +00001056 rc = sqliteOsSync(&pPager->jfd);
1057 if( rc!=0 ) return rc;
drhdb48ee02003-01-16 13:42:43 +00001058 pPager->journalStarted = 1;
drhfa86c412002-02-02 15:01:15 +00001059 }
drh50e5dad2001-09-15 00:57:28 +00001060 pPager->needSync = 0;
drh341eae82003-01-21 02:39:36 +00001061
1062 /* Erase the needSync flag from every page.
1063 */
1064 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1065 pPg->needSync = 0;
1066 }
1067 pPager->pFirstSynced = pPager->pFirst;
drh50e5dad2001-09-15 00:57:28 +00001068 }
drh03eb96a2002-11-10 23:32:56 +00001069
drh341eae82003-01-21 02:39:36 +00001070#ifndef NDEBUG
1071 /* If the Pager.needSync flag is clear then the PgHdr.needSync
1072 ** flag must also be clear for all pages. Verify that this
1073 ** invariant is true.
drh03eb96a2002-11-10 23:32:56 +00001074 */
drh341eae82003-01-21 02:39:36 +00001075 else{
1076 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1077 assert( pPg->needSync==0 );
1078 }
1079 assert( pPager->pFirstSynced==pPager->pFirst );
drh03eb96a2002-11-10 23:32:56 +00001080 }
drh341eae82003-01-21 02:39:36 +00001081#endif
drhdb48ee02003-01-16 13:42:43 +00001082
drh81a20f22001-10-12 17:30:04 +00001083 return rc;
drh50e5dad2001-09-15 00:57:28 +00001084}
1085
1086/*
drh2554f8b2003-01-22 01:26:44 +00001087** Given a list of pages (connected by the PgHdr.pDirty pointer) write
1088** every one of those pages out to the database file and mark them all
1089** as clean.
1090*/
1091static int pager_write_pagelist(PgHdr *pList){
1092 Pager *pPager;
1093 int rc;
1094
1095 if( pList==0 ) return SQLITE_OK;
1096 pPager = pList->pPager;
1097 while( pList ){
1098 assert( pList->dirty );
1099 sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1100 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
1101 if( rc ) return rc;
1102 pList->dirty = 0;
1103 pList = pList->pDirty;
1104 }
1105 return SQLITE_OK;
1106}
1107
1108/*
1109** Collect every dirty page into a dirty list and
1110** return a pointer to the head of that list. All pages are
1111** collected even if they are still in use.
1112*/
1113static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
1114 PgHdr *p, *pList;
1115 pList = 0;
1116 for(p=pPager->pAll; p; p=p->pNextAll){
1117 if( p->dirty ){
1118 p->pDirty = pList;
1119 pList = p;
1120 }
1121 }
1122 return pList;
1123}
1124
1125/*
drhd9b02572001-04-15 00:37:09 +00001126** Acquire a page.
1127**
drh58a11682001-11-10 13:51:08 +00001128** A read lock on the disk file is obtained when the first page is acquired.
drh5e00f6c2001-09-13 13:46:56 +00001129** This read lock is dropped when the last page is released.
drhd9b02572001-04-15 00:37:09 +00001130**
drh306dc212001-05-21 13:45:10 +00001131** A _get works for any page number greater than 0. If the database
1132** file is smaller than the requested page, then no actual disk
1133** read occurs and the memory image of the page is initialized to
1134** all zeros. The extra data appended to a page is always initialized
1135** to zeros the first time a page is loaded into memory.
1136**
drhd9b02572001-04-15 00:37:09 +00001137** The acquisition might fail for several reasons. In all cases,
1138** an appropriate error code is returned and *ppPage is set to NULL.
drh7e3b0a02001-04-28 16:52:40 +00001139**
1140** See also sqlitepager_lookup(). Both this routine and _lookup() attempt
1141** to find a page in the in-memory cache first. If the page is not already
drh5e00f6c2001-09-13 13:46:56 +00001142** in memory, this routine goes to disk to read it in whereas _lookup()
drh7e3b0a02001-04-28 16:52:40 +00001143** just returns 0. This routine acquires a read-lock the first time it
1144** has to go to disk, and could also playback an old journal if necessary.
1145** Since _lookup() never goes to disk, it never has to deal with locks
1146** or journal files.
drhed7c8552001-04-11 14:29:21 +00001147*/
drhd9b02572001-04-15 00:37:09 +00001148int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
drhed7c8552001-04-11 14:29:21 +00001149 PgHdr *pPg;
drh8766c342002-11-09 00:33:15 +00001150 int rc;
drhed7c8552001-04-11 14:29:21 +00001151
drhd9b02572001-04-15 00:37:09 +00001152 /* Make sure we have not hit any critical errors.
1153 */
drh836faa42003-01-11 13:30:57 +00001154 assert( pPager!=0 );
1155 assert( pgno!=0 );
drhd9b02572001-04-15 00:37:09 +00001156 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1157 return pager_errcode(pPager);
1158 }
1159
drhed7c8552001-04-11 14:29:21 +00001160 /* If this is the first page accessed, then get a read lock
1161 ** on the database file.
1162 */
1163 if( pPager->nRef==0 ){
drh8766c342002-11-09 00:33:15 +00001164 rc = sqliteOsReadLock(&pPager->fd);
1165 if( rc!=SQLITE_OK ){
drhed7c8552001-04-11 14:29:21 +00001166 *ppPage = 0;
drh8766c342002-11-09 00:33:15 +00001167 return rc;
drhed7c8552001-04-11 14:29:21 +00001168 }
drhd9b02572001-04-15 00:37:09 +00001169 pPager->state = SQLITE_READLOCK;
drhed7c8552001-04-11 14:29:21 +00001170
1171 /* If a journal file exists, try to play it back.
1172 */
drhda47d772002-12-02 04:25:19 +00001173 if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){
drhf57b3392001-10-08 13:22:32 +00001174 int rc, dummy;
drhed7c8552001-04-11 14:29:21 +00001175
drha7fcb052001-12-14 15:09:55 +00001176 /* Get a write lock on the database
1177 */
1178 rc = sqliteOsWriteLock(&pPager->fd);
1179 if( rc!=SQLITE_OK ){
drh8766c342002-11-09 00:33:15 +00001180 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
1181 /* This should never happen! */
1182 rc = SQLITE_INTERNAL;
1183 }
drha7fcb052001-12-14 15:09:55 +00001184 *ppPage = 0;
drh8766c342002-11-09 00:33:15 +00001185 return rc;
drha7fcb052001-12-14 15:09:55 +00001186 }
1187 pPager->state = SQLITE_WRITELOCK;
1188
drhed7c8552001-04-11 14:29:21 +00001189 /* Open the journal for exclusive access. Return SQLITE_BUSY if
drhf57b3392001-10-08 13:22:32 +00001190 ** we cannot get exclusive access to the journal file.
1191 **
1192 ** Even though we will only be reading from the journal, not writing,
1193 ** we have to open the journal for writing in order to obtain an
1194 ** exclusive access lock.
drhed7c8552001-04-11 14:29:21 +00001195 */
drhf57b3392001-10-08 13:22:32 +00001196 rc = sqliteOsOpenReadWrite(pPager->zJournal, &pPager->jfd, &dummy);
drha7fcb052001-12-14 15:09:55 +00001197 if( rc!=SQLITE_OK ){
1198 rc = sqliteOsUnlock(&pPager->fd);
1199 assert( rc==SQLITE_OK );
drhed7c8552001-04-11 14:29:21 +00001200 *ppPage = 0;
1201 return SQLITE_BUSY;
1202 }
drha7fcb052001-12-14 15:09:55 +00001203 pPager->journalOpen = 1;
drhdb48ee02003-01-16 13:42:43 +00001204 pPager->journalStarted = 0;
drhed7c8552001-04-11 14:29:21 +00001205
1206 /* Playback and delete the journal. Drop the database write
1207 ** lock and reacquire the read lock.
1208 */
drhd9b02572001-04-15 00:37:09 +00001209 rc = pager_playback(pPager);
1210 if( rc!=SQLITE_OK ){
1211 return rc;
1212 }
drhed7c8552001-04-11 14:29:21 +00001213 }
1214 pPg = 0;
1215 }else{
1216 /* Search for page in cache */
drhd9b02572001-04-15 00:37:09 +00001217 pPg = pager_lookup(pPager, pgno);
drhed7c8552001-04-11 14:29:21 +00001218 }
1219 if( pPg==0 ){
drhd9b02572001-04-15 00:37:09 +00001220 /* The requested page is not in the page cache. */
drhed7c8552001-04-11 14:29:21 +00001221 int h;
drh7e3b0a02001-04-28 16:52:40 +00001222 pPager->nMiss++;
drhed7c8552001-04-11 14:29:21 +00001223 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
1224 /* Create a new page */
drh968af522003-02-11 14:55:40 +00001225 pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE
1226 + sizeof(u32) + pPager->nExtra );
drhd9b02572001-04-15 00:37:09 +00001227 if( pPg==0 ){
1228 *ppPage = 0;
1229 pager_unwritelock(pPager);
1230 pPager->errMask |= PAGER_ERR_MEM;
1231 return SQLITE_NOMEM;
1232 }
drh8c1238a2003-01-02 14:43:55 +00001233 memset(pPg, 0, sizeof(*pPg));
drhed7c8552001-04-11 14:29:21 +00001234 pPg->pPager = pPager;
drhd9b02572001-04-15 00:37:09 +00001235 pPg->pNextAll = pPager->pAll;
1236 if( pPager->pAll ){
1237 pPager->pAll->pPrevAll = pPg;
1238 }
1239 pPg->pPrevAll = 0;
drhd79caeb2001-04-15 02:27:24 +00001240 pPager->pAll = pPg;
drhd9b02572001-04-15 00:37:09 +00001241 pPager->nPage++;
drhed7c8552001-04-11 14:29:21 +00001242 }else{
drhdb48ee02003-01-16 13:42:43 +00001243 /* Find a page to recycle. Try to locate a page that does not
1244 ** require us to do an fsync() on the journal.
1245 */
drh341eae82003-01-21 02:39:36 +00001246 pPg = pPager->pFirstSynced;
drhb19a2bc2001-09-16 00:13:26 +00001247
drhdb48ee02003-01-16 13:42:43 +00001248 /* If we could not find a page that does not require an fsync()
1249 ** on the journal file then fsync the journal file. This is a
1250 ** very slow operation, so we work hard to avoid it. But sometimes
1251 ** it can't be helped.
drhb19a2bc2001-09-16 00:13:26 +00001252 */
drh603240c2002-03-05 01:11:12 +00001253 if( pPg==0 ){
drh50e5dad2001-09-15 00:57:28 +00001254 int rc = syncAllPages(pPager);
1255 if( rc!=0 ){
1256 sqlitepager_rollback(pPager);
1257 *ppPage = 0;
1258 return SQLITE_IOERR;
1259 }
1260 pPg = pPager->pFirst;
1261 }
drhd9b02572001-04-15 00:37:09 +00001262 assert( pPg->nRef==0 );
drhdb48ee02003-01-16 13:42:43 +00001263
1264 /* Write the page to the database file if it is dirty.
1265 */
1266 if( pPg->dirty ){
1267 assert( pPg->needSync==0 );
drh2554f8b2003-01-22 01:26:44 +00001268 pPg->pDirty = 0;
1269 rc = pager_write_pagelist( pPg );
drhdb48ee02003-01-16 13:42:43 +00001270 if( rc!=SQLITE_OK ){
1271 sqlitepager_rollback(pPager);
1272 *ppPage = 0;
1273 return SQLITE_IOERR;
1274 }
drhdb48ee02003-01-16 13:42:43 +00001275 }
drh50e5dad2001-09-15 00:57:28 +00001276 assert( pPg->dirty==0 );
drhd9b02572001-04-15 00:37:09 +00001277
drhdb48ee02003-01-16 13:42:43 +00001278 /* If the page we are recycling is marked as alwaysRollback, then
drh193a6b42002-07-07 16:52:46 +00001279 ** set the global alwaysRollback flag, thus disabling the
1280 ** sqlite_dont_rollback() optimization for the rest of this transaction.
1281 ** It is necessary to do this because the page marked alwaysRollback
1282 ** might be reloaded at a later time but at that point we won't remember
1283 ** that is was marked alwaysRollback. This means that all pages must
1284 ** be marked as alwaysRollback from here on out.
1285 */
1286 if( pPg->alwaysRollback ){
1287 pPager->alwaysRollback = 1;
1288 }
1289
drhd9b02572001-04-15 00:37:09 +00001290 /* Unlink the old page from the free list and the hash table
1291 */
drh341eae82003-01-21 02:39:36 +00001292 if( pPg==pPager->pFirstSynced ){
1293 PgHdr *p = pPg->pNextFree;
1294 while( p && p->needSync ){ p = p->pNextFree; }
1295 pPager->pFirstSynced = p;
1296 }
drh6019e162001-07-02 17:51:45 +00001297 if( pPg->pPrevFree ){
1298 pPg->pPrevFree->pNextFree = pPg->pNextFree;
drhed7c8552001-04-11 14:29:21 +00001299 }else{
drh6019e162001-07-02 17:51:45 +00001300 assert( pPager->pFirst==pPg );
1301 pPager->pFirst = pPg->pNextFree;
drhed7c8552001-04-11 14:29:21 +00001302 }
drh6019e162001-07-02 17:51:45 +00001303 if( pPg->pNextFree ){
1304 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1305 }else{
1306 assert( pPager->pLast==pPg );
1307 pPager->pLast = pPg->pPrevFree;
1308 }
1309 pPg->pNextFree = pPg->pPrevFree = 0;
drhed7c8552001-04-11 14:29:21 +00001310 if( pPg->pNextHash ){
1311 pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1312 }
1313 if( pPg->pPrevHash ){
1314 pPg->pPrevHash->pNextHash = pPg->pNextHash;
1315 }else{
drhd9b02572001-04-15 00:37:09 +00001316 h = pager_hash(pPg->pgno);
drhed7c8552001-04-11 14:29:21 +00001317 assert( pPager->aHash[h]==pPg );
1318 pPager->aHash[h] = pPg->pNextHash;
1319 }
drh6019e162001-07-02 17:51:45 +00001320 pPg->pNextHash = pPg->pPrevHash = 0;
drhd9b02572001-04-15 00:37:09 +00001321 pPager->nOvfl++;
drhed7c8552001-04-11 14:29:21 +00001322 }
1323 pPg->pgno = pgno;
drh1ab43002002-01-14 09:28:19 +00001324 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
drhed6c8672003-01-12 18:02:16 +00001325 sqliteCheckMemory(pPager->aInJournal, pgno/8);
drhdb48ee02003-01-16 13:42:43 +00001326 assert( pPager->journalOpen );
drh6019e162001-07-02 17:51:45 +00001327 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
drhdb48ee02003-01-16 13:42:43 +00001328 pPg->needSync = 0;
drh6019e162001-07-02 17:51:45 +00001329 }else{
1330 pPg->inJournal = 0;
drhdb48ee02003-01-16 13:42:43 +00001331 pPg->needSync = 0;
drh6019e162001-07-02 17:51:45 +00001332 }
drh03eb96a2002-11-10 23:32:56 +00001333 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
1334 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
1335 page_add_to_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001336 }else{
drh03eb96a2002-11-10 23:32:56 +00001337 page_remove_from_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001338 }
drhed7c8552001-04-11 14:29:21 +00001339 pPg->dirty = 0;
1340 pPg->nRef = 1;
drhdd793422001-06-28 01:54:48 +00001341 REFINFO(pPg);
drhd9b02572001-04-15 00:37:09 +00001342 pPager->nRef++;
1343 h = pager_hash(pgno);
drhed7c8552001-04-11 14:29:21 +00001344 pPg->pNextHash = pPager->aHash[h];
1345 pPager->aHash[h] = pPg;
1346 if( pPg->pNextHash ){
1347 assert( pPg->pNextHash->pPrevHash==0 );
1348 pPg->pNextHash->pPrevHash = pPg;
1349 }
drh306dc212001-05-21 13:45:10 +00001350 if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
drh1ab43002002-01-14 09:28:19 +00001351 if( pPager->dbSize<(int)pgno ){
drh306dc212001-05-21 13:45:10 +00001352 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1353 }else{
drh81a20f22001-10-12 17:30:04 +00001354 int rc;
drhd0d006e2002-12-01 02:00:57 +00001355 sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
drha7fcb052001-12-14 15:09:55 +00001356 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
drh81a20f22001-10-12 17:30:04 +00001357 if( rc!=SQLITE_OK ){
drh28be87c2002-11-05 23:03:02 +00001358 off_t fileSize;
drh4e371ee2002-09-05 16:08:27 +00001359 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
1360 || fileSize>=pgno*SQLITE_PAGE_SIZE ){
1361 return rc;
1362 }else{
1363 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1364 }
drh81a20f22001-10-12 17:30:04 +00001365 }
drh306dc212001-05-21 13:45:10 +00001366 }
drh7e3b0a02001-04-28 16:52:40 +00001367 if( pPager->nExtra>0 ){
1368 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
1369 }
drhed7c8552001-04-11 14:29:21 +00001370 }else{
drhd9b02572001-04-15 00:37:09 +00001371 /* The requested page is in the page cache. */
drh7e3b0a02001-04-28 16:52:40 +00001372 pPager->nHit++;
drhdf0b3b02001-06-23 11:36:20 +00001373 page_ref(pPg);
drhed7c8552001-04-11 14:29:21 +00001374 }
1375 *ppPage = PGHDR_TO_DATA(pPg);
1376 return SQLITE_OK;
1377}
1378
1379/*
drh7e3b0a02001-04-28 16:52:40 +00001380** Acquire a page if it is already in the in-memory cache. Do
1381** not read the page from disk. Return a pointer to the page,
1382** or 0 if the page is not in cache.
1383**
1384** See also sqlitepager_get(). The difference between this routine
1385** and sqlitepager_get() is that _get() will go to the disk and read
1386** in the page if the page is not already in cache. This routine
drh5e00f6c2001-09-13 13:46:56 +00001387** returns NULL if the page is not in cache or if a disk I/O error
1388** has ever happened.
drh7e3b0a02001-04-28 16:52:40 +00001389*/
1390void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
1391 PgHdr *pPg;
1392
drh836faa42003-01-11 13:30:57 +00001393 assert( pPager!=0 );
1394 assert( pgno!=0 );
drh7e3b0a02001-04-28 16:52:40 +00001395 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1396 return 0;
1397 }
drh836faa42003-01-11 13:30:57 +00001398 /* if( pPager->nRef==0 ){
1399 ** return 0;
1400 ** }
1401 */
drh7e3b0a02001-04-28 16:52:40 +00001402 pPg = pager_lookup(pPager, pgno);
1403 if( pPg==0 ) return 0;
drhdf0b3b02001-06-23 11:36:20 +00001404 page_ref(pPg);
drh7e3b0a02001-04-28 16:52:40 +00001405 return PGHDR_TO_DATA(pPg);
1406}
1407
1408/*
drhed7c8552001-04-11 14:29:21 +00001409** Release a page.
1410**
1411** If the number of references to the page drop to zero, then the
1412** page is added to the LRU list. When all references to all pages
drhd9b02572001-04-15 00:37:09 +00001413** are released, a rollback occurs and the lock on the database is
drhed7c8552001-04-11 14:29:21 +00001414** removed.
1415*/
drhd9b02572001-04-15 00:37:09 +00001416int sqlitepager_unref(void *pData){
drhed7c8552001-04-11 14:29:21 +00001417 PgHdr *pPg;
drhd9b02572001-04-15 00:37:09 +00001418
1419 /* Decrement the reference count for this page
1420 */
drhed7c8552001-04-11 14:29:21 +00001421 pPg = DATA_TO_PGHDR(pData);
1422 assert( pPg->nRef>0 );
drhed7c8552001-04-11 14:29:21 +00001423 pPg->nRef--;
drhdd793422001-06-28 01:54:48 +00001424 REFINFO(pPg);
drhd9b02572001-04-15 00:37:09 +00001425
drh72f82862001-05-24 21:06:34 +00001426 /* When the number of references to a page reach 0, call the
1427 ** destructor and add the page to the freelist.
drhd9b02572001-04-15 00:37:09 +00001428 */
drhed7c8552001-04-11 14:29:21 +00001429 if( pPg->nRef==0 ){
drh1eaa2692001-09-18 02:02:23 +00001430 Pager *pPager;
1431 pPager = pPg->pPager;
drhd9b02572001-04-15 00:37:09 +00001432 pPg->pNextFree = 0;
1433 pPg->pPrevFree = pPager->pLast;
drhed7c8552001-04-11 14:29:21 +00001434 pPager->pLast = pPg;
drhd9b02572001-04-15 00:37:09 +00001435 if( pPg->pPrevFree ){
1436 pPg->pPrevFree->pNextFree = pPg;
drhed7c8552001-04-11 14:29:21 +00001437 }else{
1438 pPager->pFirst = pPg;
1439 }
drh341eae82003-01-21 02:39:36 +00001440 if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
1441 pPager->pFirstSynced = pPg;
1442 }
drh72f82862001-05-24 21:06:34 +00001443 if( pPager->xDestructor ){
1444 pPager->xDestructor(pData);
1445 }
drhd9b02572001-04-15 00:37:09 +00001446
1447 /* When all pages reach the freelist, drop the read lock from
1448 ** the database file.
1449 */
1450 pPager->nRef--;
1451 assert( pPager->nRef>=0 );
1452 if( pPager->nRef==0 ){
1453 pager_reset(pPager);
1454 }
drhed7c8552001-04-11 14:29:21 +00001455 }
drhd9b02572001-04-15 00:37:09 +00001456 return SQLITE_OK;
drhed7c8552001-04-11 14:29:21 +00001457}
1458
1459/*
drhda47d772002-12-02 04:25:19 +00001460** Create a journal file for pPager. There should already be a write
1461** lock on the database file when this routine is called.
1462**
1463** Return SQLITE_OK if everything. Return an error code and release the
1464** write lock if anything goes wrong.
1465*/
1466static int pager_open_journal(Pager *pPager){
1467 int rc;
1468 assert( pPager->state==SQLITE_WRITELOCK );
1469 assert( pPager->journalOpen==0 );
1470 assert( pPager->useJournal );
1471 pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
1472 if( pPager->aInJournal==0 ){
1473 sqliteOsReadLock(&pPager->fd);
1474 pPager->state = SQLITE_READLOCK;
1475 return SQLITE_NOMEM;
1476 }
1477 rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
1478 if( rc!=SQLITE_OK ){
1479 sqliteFree(pPager->aInJournal);
1480 pPager->aInJournal = 0;
1481 sqliteOsReadLock(&pPager->fd);
1482 pPager->state = SQLITE_READLOCK;
1483 return SQLITE_CANTOPEN;
1484 }
1485 pPager->journalOpen = 1;
drhdb48ee02003-01-16 13:42:43 +00001486 pPager->journalStarted = 0;
drhda47d772002-12-02 04:25:19 +00001487 pPager->needSync = 0;
1488 pPager->alwaysRollback = 0;
drh968af522003-02-11 14:55:40 +00001489 pPager->nRec = 0;
drhda47d772002-12-02 04:25:19 +00001490 sqlitepager_pagecount(pPager);
1491 pPager->origDbSize = pPager->dbSize;
drh968af522003-02-11 14:55:40 +00001492 if( journal_format==JOURNAL_FORMAT_3 ){
1493 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
1494 if( rc==SQLITE_OK ){
1495 rc = write32bits(&pPager->jfd, pPager->tempFile ? 0xffffffff : 0);
1496 }
1497 if( rc==SQLITE_OK ){
1498 pPager->cksumInit = (u32)sqliteRandomInteger();
1499 rc = write32bits(&pPager->jfd, pPager->cksumInit);
1500 }
1501 }else if( journal_format==JOURNAL_FORMAT_2 ){
1502 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
drhda47d772002-12-02 04:25:19 +00001503 }else{
drh968af522003-02-11 14:55:40 +00001504 assert( journal_format==JOURNAL_FORMAT_1 );
1505 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
drhda47d772002-12-02 04:25:19 +00001506 }
1507 if( rc==SQLITE_OK ){
1508 rc = write32bits(&pPager->jfd, pPager->dbSize);
1509 }
1510 if( pPager->ckptAutoopen && rc==SQLITE_OK ){
1511 rc = sqlitepager_ckpt_begin(pPager);
1512 }
1513 if( rc!=SQLITE_OK ){
1514 rc = pager_unwritelock(pPager);
1515 if( rc==SQLITE_OK ){
1516 rc = SQLITE_FULL;
1517 }
1518 }
drhdb48ee02003-01-16 13:42:43 +00001519#ifndef NDEBUG
1520 pPager->syncJSize = 0;
1521#endif
drhda47d772002-12-02 04:25:19 +00001522 return rc;
1523}
1524
1525/*
drh4b845d72002-03-05 12:41:19 +00001526** Acquire a write-lock on the database. The lock is removed when
1527** the any of the following happen:
1528**
1529** * sqlitepager_commit() is called.
1530** * sqlitepager_rollback() is called.
1531** * sqlitepager_close() is called.
1532** * sqlitepager_unref() is called to on every outstanding page.
1533**
1534** The parameter to this routine is a pointer to any open page of the
1535** database file. Nothing changes about the page - it is used merely
1536** to acquire a pointer to the Pager structure and as proof that there
1537** is already a read-lock on the database.
1538**
drhda47d772002-12-02 04:25:19 +00001539** A journal file is opened if this is not a temporary file. For
1540** temporary files, the opening of the journal file is deferred until
1541** there is an actual need to write to the journal.
1542**
drh4b845d72002-03-05 12:41:19 +00001543** If the database is already write-locked, this routine is a no-op.
1544*/
1545int sqlitepager_begin(void *pData){
1546 PgHdr *pPg = DATA_TO_PGHDR(pData);
1547 Pager *pPager = pPg->pPager;
1548 int rc = SQLITE_OK;
1549 assert( pPg->nRef>0 );
1550 assert( pPager->state!=SQLITE_UNLOCK );
1551 if( pPager->state==SQLITE_READLOCK ){
1552 assert( pPager->aInJournal==0 );
1553 rc = sqliteOsWriteLock(&pPager->fd);
1554 if( rc!=SQLITE_OK ){
1555 return rc;
1556 }
drh4b845d72002-03-05 12:41:19 +00001557 pPager->state = SQLITE_WRITELOCK;
drhda47d772002-12-02 04:25:19 +00001558 pPager->dirtyFile = 0;
drhdb48ee02003-01-16 13:42:43 +00001559 TRACE1("TRANSACTION\n");
drhda47d772002-12-02 04:25:19 +00001560 if( pPager->useJournal && !pPager->tempFile ){
1561 rc = pager_open_journal(pPager);
drh4b845d72002-03-05 12:41:19 +00001562 }
1563 }
1564 return rc;
1565}
1566
1567/*
drhed7c8552001-04-11 14:29:21 +00001568** Mark a data page as writeable. The page is written into the journal
1569** if it is not there already. This routine must be called before making
1570** changes to a page.
1571**
1572** The first time this routine is called, the pager creates a new
1573** journal and acquires a write lock on the database. If the write
1574** lock could not be acquired, this routine returns SQLITE_BUSY. The
drh306dc212001-05-21 13:45:10 +00001575** calling routine must check for that return value and be careful not to
drhed7c8552001-04-11 14:29:21 +00001576** change any page data until this routine returns SQLITE_OK.
drhd9b02572001-04-15 00:37:09 +00001577**
1578** If the journal file could not be written because the disk is full,
1579** then this routine returns SQLITE_FULL and does an immediate rollback.
1580** All subsequent write attempts also return SQLITE_FULL until there
1581** is a call to sqlitepager_commit() or sqlitepager_rollback() to
1582** reset.
drhed7c8552001-04-11 14:29:21 +00001583*/
drhd9b02572001-04-15 00:37:09 +00001584int sqlitepager_write(void *pData){
drh69688d52001-04-14 16:38:23 +00001585 PgHdr *pPg = DATA_TO_PGHDR(pData);
1586 Pager *pPager = pPg->pPager;
drhd79caeb2001-04-15 02:27:24 +00001587 int rc = SQLITE_OK;
drh69688d52001-04-14 16:38:23 +00001588
drh6446c4d2001-12-15 14:22:18 +00001589 /* Check for errors
1590 */
drhd9b02572001-04-15 00:37:09 +00001591 if( pPager->errMask ){
1592 return pager_errcode(pPager);
1593 }
drh5e00f6c2001-09-13 13:46:56 +00001594 if( pPager->readOnly ){
1595 return SQLITE_PERM;
1596 }
drh6446c4d2001-12-15 14:22:18 +00001597
1598 /* Mark the page as dirty. If the page has already been written
1599 ** to the journal then we can return right away.
1600 */
drhd9b02572001-04-15 00:37:09 +00001601 pPg->dirty = 1;
drh0f892532002-05-30 12:27:03 +00001602 if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
drha1680452002-04-18 01:56:57 +00001603 pPager->dirtyFile = 1;
drhfa86c412002-02-02 15:01:15 +00001604 return SQLITE_OK;
1605 }
drh6446c4d2001-12-15 14:22:18 +00001606
1607 /* If we get this far, it means that the page needs to be
drhfa86c412002-02-02 15:01:15 +00001608 ** written to the transaction journal or the ckeckpoint journal
1609 ** or both.
1610 **
1611 ** First check to see that the transaction journal exists and
1612 ** create it if it does not.
drh6446c4d2001-12-15 14:22:18 +00001613 */
drhd9b02572001-04-15 00:37:09 +00001614 assert( pPager->state!=SQLITE_UNLOCK );
drh4b845d72002-03-05 12:41:19 +00001615 rc = sqlitepager_begin(pData);
drhda47d772002-12-02 04:25:19 +00001616 if( rc!=SQLITE_OK ){
1617 return rc;
1618 }
drhd9b02572001-04-15 00:37:09 +00001619 assert( pPager->state==SQLITE_WRITELOCK );
drhda47d772002-12-02 04:25:19 +00001620 if( !pPager->journalOpen && pPager->useJournal ){
1621 rc = pager_open_journal(pPager);
1622 if( rc!=SQLITE_OK ) return rc;
1623 }
1624 assert( pPager->journalOpen || !pPager->useJournal );
1625 pPager->dirtyFile = 1;
drh6446c4d2001-12-15 14:22:18 +00001626
drhfa86c412002-02-02 15:01:15 +00001627 /* The transaction journal now exists and we have a write lock on the
1628 ** main database file. Write the current page to the transaction
1629 ** journal if it is not there already.
drh6446c4d2001-12-15 14:22:18 +00001630 */
drhdb48ee02003-01-16 13:42:43 +00001631 if( !pPg->inJournal && pPager->useJournal ){
1632 if( (int)pPg->pgno <= pPager->origDbSize ){
drh968af522003-02-11 14:55:40 +00001633 int szPg;
1634 u32 saved;
1635 if( journal_format>=JOURNAL_FORMAT_3 ){
1636 u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
1637 saved = *(u32*)PGHDR_TO_EXTRA(pPg);
1638 store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
1639 szPg = SQLITE_PAGE_SIZE+8;
1640 }else{
1641 szPg = SQLITE_PAGE_SIZE+4;
1642 }
1643 store32bits(pPg->pgno, pPg, -4);
1644 rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
1645 if( journal_format>=JOURNAL_FORMAT_3 ){
1646 *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
1647 }
1648 pPager->nRec++;
drhdb48ee02003-01-16 13:42:43 +00001649 if( rc!=SQLITE_OK ){
1650 sqlitepager_rollback(pPager);
1651 pPager->errMask |= PAGER_ERR_FULL;
1652 return rc;
1653 }
1654 assert( pPager->aInJournal!=0 );
1655 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1656 pPg->needSync = !pPager->noSync;
1657 pPg->inJournal = 1;
1658 if( pPager->ckptInUse ){
1659 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1660 page_add_to_ckpt_list(pPg);
1661 }
1662 TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
1663 }else{
1664 pPg->needSync = !pPager->journalStarted && !pPager->noSync;
1665 TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
drhd9b02572001-04-15 00:37:09 +00001666 }
drhdb48ee02003-01-16 13:42:43 +00001667 if( pPg->needSync ){
1668 pPager->needSync = 1;
drhfa86c412002-02-02 15:01:15 +00001669 }
drh69688d52001-04-14 16:38:23 +00001670 }
drh6446c4d2001-12-15 14:22:18 +00001671
drhfa86c412002-02-02 15:01:15 +00001672 /* If the checkpoint journal is open and the page is not in it,
drh968af522003-02-11 14:55:40 +00001673 ** then write the current page to the checkpoint journal. Note that
1674 ** the checkpoint journal always uses the simplier format 2 that lacks
1675 ** checksums. The header is also omitted from the checkpoint journal.
drh6446c4d2001-12-15 14:22:18 +00001676 */
drh0f892532002-05-30 12:27:03 +00001677 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
drh1e336b42002-02-14 12:50:33 +00001678 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
drh968af522003-02-11 14:55:40 +00001679 store32bits(pPg->pgno, pPg, -4);
drh2554f8b2003-01-22 01:26:44 +00001680 rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
drhfa86c412002-02-02 15:01:15 +00001681 if( rc!=SQLITE_OK ){
1682 sqlitepager_rollback(pPager);
1683 pPager->errMask |= PAGER_ERR_FULL;
1684 return rc;
1685 }
drh9bd47a92003-01-07 14:46:08 +00001686 pPager->ckptNRec++;
drhfa86c412002-02-02 15:01:15 +00001687 assert( pPager->aInCkpt!=0 );
1688 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001689 page_add_to_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001690 }
1691
1692 /* Update the database size and return.
1693 */
drh1ab43002002-01-14 09:28:19 +00001694 if( pPager->dbSize<(int)pPg->pgno ){
drh306dc212001-05-21 13:45:10 +00001695 pPager->dbSize = pPg->pgno;
1696 }
drh69688d52001-04-14 16:38:23 +00001697 return rc;
drhed7c8552001-04-11 14:29:21 +00001698}
1699
1700/*
drhaacc5432002-01-06 17:07:40 +00001701** Return TRUE if the page given in the argument was previously passed
drh6019e162001-07-02 17:51:45 +00001702** to sqlitepager_write(). In other words, return TRUE if it is ok
1703** to change the content of the page.
1704*/
1705int sqlitepager_iswriteable(void *pData){
1706 PgHdr *pPg = DATA_TO_PGHDR(pData);
1707 return pPg->dirty;
1708}
1709
1710/*
drh30e58752002-03-02 20:41:57 +00001711** A call to this routine tells the pager that it is not necessary to
1712** write the information on page "pgno" back to the disk, even though
1713** that page might be marked as dirty.
1714**
1715** The overlying software layer calls this routine when all of the data
1716** on the given page is unused. The pager marks the page as clean so
1717** that it does not get written to disk.
1718**
1719** Tests show that this optimization, together with the
1720** sqlitepager_dont_rollback() below, more than double the speed
1721** of large INSERT operations and quadruple the speed of large DELETEs.
drh8e298f92002-07-06 16:28:47 +00001722**
1723** When this routine is called, set the alwaysRollback flag to true.
1724** Subsequent calls to sqlitepager_dont_rollback() for the same page
1725** will thereafter be ignored. This is necessary to avoid a problem
1726** where a page with data is added to the freelist during one part of
1727** a transaction then removed from the freelist during a later part
1728** of the same transaction and reused for some other purpose. When it
1729** is first added to the freelist, this routine is called. When reused,
1730** the dont_rollback() routine is called. But because the page contains
1731** critical data, we still need to be sure it gets rolled back in spite
1732** of the dont_rollback() call.
drh30e58752002-03-02 20:41:57 +00001733*/
1734void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
1735 PgHdr *pPg;
drh8e298f92002-07-06 16:28:47 +00001736
drh30e58752002-03-02 20:41:57 +00001737 pPg = pager_lookup(pPager, pgno);
drh8e298f92002-07-06 16:28:47 +00001738 pPg->alwaysRollback = 1;
drh30e58752002-03-02 20:41:57 +00001739 if( pPg && pPg->dirty ){
drh8124a302002-06-25 14:43:57 +00001740 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
1741 /* If this pages is the last page in the file and the file has grown
1742 ** during the current transaction, then do NOT mark the page as clean.
1743 ** When the database file grows, we must make sure that the last page
1744 ** gets written at least once so that the disk file will be the correct
1745 ** size. If you do not write this page and the size of the file
1746 ** on the disk ends up being too small, that can lead to database
1747 ** corruption during the next transaction.
1748 */
1749 }else{
drhdb48ee02003-01-16 13:42:43 +00001750 TRACE2("DONT_WRITE %d\n", pgno);
drh8124a302002-06-25 14:43:57 +00001751 pPg->dirty = 0;
1752 }
drh30e58752002-03-02 20:41:57 +00001753 }
1754}
1755
1756/*
1757** A call to this routine tells the pager that if a rollback occurs,
1758** it is not necessary to restore the data on the given page. This
1759** means that the pager does not have to record the given page in the
1760** rollback journal.
1761*/
1762void sqlitepager_dont_rollback(void *pData){
1763 PgHdr *pPg = DATA_TO_PGHDR(pData);
1764 Pager *pPager = pPg->pPager;
1765
1766 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
drh193a6b42002-07-07 16:52:46 +00001767 if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
drh30e58752002-03-02 20:41:57 +00001768 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
1769 assert( pPager->aInJournal!=0 );
1770 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1771 pPg->inJournal = 1;
drh0f892532002-05-30 12:27:03 +00001772 if( pPager->ckptInUse ){
drh30e58752002-03-02 20:41:57 +00001773 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001774 page_add_to_ckpt_list(pPg);
drh30e58752002-03-02 20:41:57 +00001775 }
drhdb48ee02003-01-16 13:42:43 +00001776 TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
drh30e58752002-03-02 20:41:57 +00001777 }
drh0f892532002-05-30 12:27:03 +00001778 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
drh30e58752002-03-02 20:41:57 +00001779 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1780 assert( pPager->aInCkpt!=0 );
1781 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001782 page_add_to_ckpt_list(pPg);
drh30e58752002-03-02 20:41:57 +00001783 }
1784}
1785
1786/*
drhed7c8552001-04-11 14:29:21 +00001787** Commit all changes to the database and release the write lock.
drhd9b02572001-04-15 00:37:09 +00001788**
1789** If the commit fails for any reason, a rollback attempt is made
1790** and an error code is returned. If the commit worked, SQLITE_OK
1791** is returned.
drhed7c8552001-04-11 14:29:21 +00001792*/
drhd9b02572001-04-15 00:37:09 +00001793int sqlitepager_commit(Pager *pPager){
drha1b351a2001-09-14 16:42:12 +00001794 int rc;
drhed7c8552001-04-11 14:29:21 +00001795 PgHdr *pPg;
drhd9b02572001-04-15 00:37:09 +00001796
1797 if( pPager->errMask==PAGER_ERR_FULL ){
1798 rc = sqlitepager_rollback(pPager);
drh4e371ee2002-09-05 16:08:27 +00001799 if( rc==SQLITE_OK ){
1800 rc = SQLITE_FULL;
1801 }
drhd9b02572001-04-15 00:37:09 +00001802 return rc;
1803 }
1804 if( pPager->errMask!=0 ){
1805 rc = pager_errcode(pPager);
1806 return rc;
1807 }
1808 if( pPager->state!=SQLITE_WRITELOCK ){
1809 return SQLITE_ERROR;
1810 }
drhdb48ee02003-01-16 13:42:43 +00001811 TRACE1("COMMIT\n");
drha1680452002-04-18 01:56:57 +00001812 if( pPager->dirtyFile==0 ){
1813 /* Exit early (without doing the time-consuming sqliteOsSync() calls)
1814 ** if there have been no changes to the database file. */
drh341eae82003-01-21 02:39:36 +00001815 assert( pPager->needSync==0 );
drha1680452002-04-18 01:56:57 +00001816 rc = pager_unwritelock(pPager);
1817 pPager->dbSize = -1;
1818 return rc;
1819 }
drhda47d772002-12-02 04:25:19 +00001820 assert( pPager->journalOpen );
drha7fcb052001-12-14 15:09:55 +00001821 if( pPager->needSync && sqliteOsSync(&pPager->jfd)!=SQLITE_OK ){
drhd9b02572001-04-15 00:37:09 +00001822 goto commit_abort;
drhed7c8552001-04-11 14:29:21 +00001823 }
drh2554f8b2003-01-22 01:26:44 +00001824 pPg = pager_get_all_dirty_pages(pPager);
1825 if( pPg ){
1826 rc = pager_write_pagelist(pPg);
1827 if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){
1828 goto commit_abort;
1829 }
drh603240c2002-03-05 01:11:12 +00001830 }
drhd9b02572001-04-15 00:37:09 +00001831 rc = pager_unwritelock(pPager);
1832 pPager->dbSize = -1;
1833 return rc;
1834
1835 /* Jump here if anything goes wrong during the commit process.
1836 */
1837commit_abort:
1838 rc = sqlitepager_rollback(pPager);
1839 if( rc==SQLITE_OK ){
1840 rc = SQLITE_FULL;
drhed7c8552001-04-11 14:29:21 +00001841 }
drhed7c8552001-04-11 14:29:21 +00001842 return rc;
1843}
1844
1845/*
1846** Rollback all changes. The database falls back to read-only mode.
1847** All in-memory cache pages revert to their original data contents.
1848** The journal is deleted.
drhd9b02572001-04-15 00:37:09 +00001849**
1850** This routine cannot fail unless some other process is not following
1851** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
1852** process is writing trash into the journal file (SQLITE_CORRUPT) or
1853** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error
1854** codes are returned for all these occasions. Otherwise,
1855** SQLITE_OK is returned.
drhed7c8552001-04-11 14:29:21 +00001856*/
drhd9b02572001-04-15 00:37:09 +00001857int sqlitepager_rollback(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +00001858 int rc;
drhdb48ee02003-01-16 13:42:43 +00001859 TRACE1("ROLLBACK\n");
drhda47d772002-12-02 04:25:19 +00001860 if( !pPager->dirtyFile || !pPager->journalOpen ){
1861 rc = pager_unwritelock(pPager);
1862 pPager->dbSize = -1;
1863 return rc;
1864 }
drhdb48ee02003-01-16 13:42:43 +00001865
1866#if defined(SQLITE_TEST) && !defined(NDEBUG)
1867 /* Truncate the journal to the size it was at the conclusion of the
1868 ** last sqliteOsSync() call. This is really an error check. If the
1869 ** rollback still works, it means that the rollback would have also
1870 ** worked if it had occurred after an OS crash or unexpected power
1871 ** loss.
1872 */
drha218b6a2003-01-25 15:43:22 +00001873 if( !pPager->noSync ){
drh968af522003-02-11 14:55:40 +00001874 int m = JOURNAL_HDR_SZ(journal_format);
drha218b6a2003-01-25 15:43:22 +00001875 assert( !pPager->tempFile );
drh968af522003-02-11 14:55:40 +00001876 if( pPager->syncJSize<m ){
1877 pPager->syncJSize = m;
drha218b6a2003-01-25 15:43:22 +00001878 }
1879 TRACE2("TRUNCATE JOURNAL %lld\n", pPager->syncJSize);
1880 rc = sqliteOsTruncate(&pPager->jfd, pPager->syncJSize);
1881 if( rc ) return rc;
drh968af522003-02-11 14:55:40 +00001882 pPager->nRec = 0;
drhdb48ee02003-01-16 13:42:43 +00001883 }
drhdb48ee02003-01-16 13:42:43 +00001884#endif
1885
drhd9b02572001-04-15 00:37:09 +00001886 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
drh4b845d72002-03-05 12:41:19 +00001887 if( pPager->state>=SQLITE_WRITELOCK ){
1888 pager_playback(pPager);
1889 }
drhd9b02572001-04-15 00:37:09 +00001890 return pager_errcode(pPager);
drhed7c8552001-04-11 14:29:21 +00001891 }
drhd9b02572001-04-15 00:37:09 +00001892 if( pPager->state!=SQLITE_WRITELOCK ){
1893 return SQLITE_OK;
1894 }
1895 rc = pager_playback(pPager);
1896 if( rc!=SQLITE_OK ){
1897 rc = SQLITE_CORRUPT;
1898 pPager->errMask |= PAGER_ERR_CORRUPT;
1899 }
1900 pPager->dbSize = -1;
drhed7c8552001-04-11 14:29:21 +00001901 return rc;
drh98808ba2001-10-18 12:34:46 +00001902}
drhd9b02572001-04-15 00:37:09 +00001903
1904/*
drh5e00f6c2001-09-13 13:46:56 +00001905** Return TRUE if the database file is opened read-only. Return FALSE
1906** if the database is (in theory) writable.
1907*/
1908int sqlitepager_isreadonly(Pager *pPager){
drhbe0072d2001-09-13 14:46:09 +00001909 return pPager->readOnly;
drh5e00f6c2001-09-13 13:46:56 +00001910}
1911
1912/*
drhd9b02572001-04-15 00:37:09 +00001913** This routine is used for testing and analysis only.
1914*/
1915int *sqlitepager_stats(Pager *pPager){
1916 static int a[9];
1917 a[0] = pPager->nRef;
1918 a[1] = pPager->nPage;
1919 a[2] = pPager->mxPage;
1920 a[3] = pPager->dbSize;
1921 a[4] = pPager->state;
1922 a[5] = pPager->errMask;
1923 a[6] = pPager->nHit;
1924 a[7] = pPager->nMiss;
1925 a[8] = pPager->nOvfl;
1926 return a;
1927}
drhdd793422001-06-28 01:54:48 +00001928
drhfa86c412002-02-02 15:01:15 +00001929/*
1930** Set the checkpoint.
1931**
1932** This routine should be called with the transaction journal already
1933** open. A new checkpoint journal is created that can be used to rollback
drhaaab5722002-02-19 13:39:21 +00001934** changes of a single SQL command within a larger transaction.
drhfa86c412002-02-02 15:01:15 +00001935*/
1936int sqlitepager_ckpt_begin(Pager *pPager){
1937 int rc;
1938 char zTemp[SQLITE_TEMPNAME_SIZE];
drhda47d772002-12-02 04:25:19 +00001939 if( !pPager->journalOpen ){
1940 pPager->ckptAutoopen = 1;
1941 return SQLITE_OK;
1942 }
drhfa86c412002-02-02 15:01:15 +00001943 assert( pPager->journalOpen );
drh0f892532002-05-30 12:27:03 +00001944 assert( !pPager->ckptInUse );
drhfa86c412002-02-02 15:01:15 +00001945 pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
1946 if( pPager->aInCkpt==0 ){
1947 sqliteOsReadLock(&pPager->fd);
1948 return SQLITE_NOMEM;
1949 }
drh968af522003-02-11 14:55:40 +00001950#ifndef NDEBUG
drhfa86c412002-02-02 15:01:15 +00001951 rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
1952 if( rc ) goto ckpt_begin_failed;
drh968af522003-02-11 14:55:40 +00001953 assert( pPager->ckptJSize ==
1954 pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) );
1955#endif
1956 pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format)
1957 + JOURNAL_HDR_SZ(journal_format);
drh663fc632002-02-02 18:49:19 +00001958 pPager->ckptSize = pPager->dbSize;
drh0f892532002-05-30 12:27:03 +00001959 if( !pPager->ckptOpen ){
1960 rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
1961 if( rc ) goto ckpt_begin_failed;
1962 pPager->ckptOpen = 1;
drh9bd47a92003-01-07 14:46:08 +00001963 pPager->ckptNRec = 0;
drh0f892532002-05-30 12:27:03 +00001964 }
1965 pPager->ckptInUse = 1;
drhfa86c412002-02-02 15:01:15 +00001966 return SQLITE_OK;
1967
1968ckpt_begin_failed:
1969 if( pPager->aInCkpt ){
1970 sqliteFree(pPager->aInCkpt);
1971 pPager->aInCkpt = 0;
1972 }
1973 return rc;
1974}
1975
1976/*
1977** Commit a checkpoint.
1978*/
1979int sqlitepager_ckpt_commit(Pager *pPager){
drh0f892532002-05-30 12:27:03 +00001980 if( pPager->ckptInUse ){
drh03eb96a2002-11-10 23:32:56 +00001981 PgHdr *pPg, *pNext;
drh96ddd6d2002-09-05 19:10:33 +00001982 sqliteOsSeek(&pPager->cpfd, 0);
drh9bd47a92003-01-07 14:46:08 +00001983 /* sqliteOsTruncate(&pPager->cpfd, 0); */
1984 pPager->ckptNRec = 0;
drh0f892532002-05-30 12:27:03 +00001985 pPager->ckptInUse = 0;
drh663fc632002-02-02 18:49:19 +00001986 sqliteFree( pPager->aInCkpt );
1987 pPager->aInCkpt = 0;
drh03eb96a2002-11-10 23:32:56 +00001988 for(pPg=pPager->pCkpt; pPg; pPg=pNext){
1989 pNext = pPg->pNextCkpt;
1990 assert( pPg->inCkpt );
drh663fc632002-02-02 18:49:19 +00001991 pPg->inCkpt = 0;
drh03eb96a2002-11-10 23:32:56 +00001992 pPg->pPrevCkpt = pPg->pNextCkpt = 0;
drh663fc632002-02-02 18:49:19 +00001993 }
drh03eb96a2002-11-10 23:32:56 +00001994 pPager->pCkpt = 0;
drh663fc632002-02-02 18:49:19 +00001995 }
drhda47d772002-12-02 04:25:19 +00001996 pPager->ckptAutoopen = 0;
drhfa86c412002-02-02 15:01:15 +00001997 return SQLITE_OK;
1998}
1999
2000/*
2001** Rollback a checkpoint.
2002*/
2003int sqlitepager_ckpt_rollback(Pager *pPager){
2004 int rc;
drh0f892532002-05-30 12:27:03 +00002005 if( pPager->ckptInUse ){
drh663fc632002-02-02 18:49:19 +00002006 rc = pager_ckpt_playback(pPager);
2007 sqlitepager_ckpt_commit(pPager);
2008 }else{
2009 rc = SQLITE_OK;
2010 }
drhda47d772002-12-02 04:25:19 +00002011 pPager->ckptAutoopen = 0;
drhfa86c412002-02-02 15:01:15 +00002012 return rc;
2013}
2014
drh74587e52002-08-13 00:01:16 +00002015#ifdef SQLITE_TEST
drhdd793422001-06-28 01:54:48 +00002016/*
2017** Print a listing of all referenced pages and their ref count.
2018*/
2019void sqlitepager_refdump(Pager *pPager){
2020 PgHdr *pPg;
2021 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2022 if( pPg->nRef<=0 ) continue;
2023 printf("PAGE %3d addr=0x%08x nRef=%d\n",
2024 pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef);
2025 }
2026}
2027#endif