blob: 04c7f25a37cb5c5b916ba84616a2045425baae76 [file] [log] [blame]
/*
** 2001 September 15
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This is the implementation of the page cache subsystem or "pager".
**
** The pager is used to access a database disk file.  It implements
** atomic commit and rollback through the use of a journal file that
** is separate from the database file.  The pager also implements file
** locking to prevent two processes from writing the same database
** file simultaneously, or one process from reading the database while
** another is writing.
**
** @(#) $Id: pager.c,v 1.57 2002/11/10 23:32:57 drh Exp $
*/
drh829e8022002-11-06 14:08:11 +000023#include "os.h" /* Must be first to enable large file support */
drhd9b02572001-04-15 00:37:09 +000024#include "sqliteInt.h"
drhed7c8552001-04-11 14:29:21 +000025#include "pager.h"
drhed7c8552001-04-11 14:29:21 +000026#include <assert.h>
drhd9b02572001-04-15 00:37:09 +000027#include <string.h>
drhed7c8552001-04-11 14:29:21 +000028
/*
** Lock states of the page cache.  At any moment the cache is in
** exactly one of:
**
**   SQLITE_UNLOCK       The page cache is not currently reading or
**                       writing the database file.  There is no
**                       data held in memory.  This is the initial
**                       state.
**
**   SQLITE_READLOCK     The page cache is reading the database.
**                       Writing is not permitted.  There can be
**                       multiple readers accessing the same database
**                       file at the same time.
**
**   SQLITE_WRITELOCK    The page cache is writing the database.
**                       Access is exclusive.  No other processes or
**                       threads can be reading or writing while one
**                       process is writing.
**
** Transitions:
**
**   * The cache starts out in SQLITE_UNLOCK.  The first
**     sqlitepager_get() moves it to SQLITE_READLOCK.
**   * Once every page has been released with sqlitepager_unref()
**     the state drops back to SQLITE_UNLOCK.
**   * The first sqlitepager_write() moves the state to
**     SQLITE_WRITELOCK.  (A page can only be written while it is
**     outstanding, so the pager is necessarily in SQLITE_READLOCK
**     before it can reach SQLITE_WRITELOCK.)
**   * sqlitepager_rollback() and sqlitepager_commit() take the state
**     from SQLITE_WRITELOCK back to SQLITE_READLOCK.
**
** The numeric values are ordered; code in this file compares
** Pager.state against SQLITE_WRITELOCK using "<" and ">=".
*/
#define SQLITE_UNLOCK      0
#define SQLITE_READLOCK    1
#define SQLITE_WRITELOCK   2
62
drhd9b02572001-04-15 00:37:09 +000063
drhed7c8552001-04-11 14:29:21 +000064/*
65** Each in-memory image of a page begins with the following header.
drhbd03cae2001-06-02 02:40:57 +000066** This header is only visible to this pager module. The client
67** code that calls pager sees only the data that follows the header.
drhed7c8552001-04-11 14:29:21 +000068*/
drhd9b02572001-04-15 00:37:09 +000069typedef struct PgHdr PgHdr;
drhed7c8552001-04-11 14:29:21 +000070struct PgHdr {
71 Pager *pPager; /* The pager to which this page belongs */
72 Pgno pgno; /* The page number for this page */
drh69688d52001-04-14 16:38:23 +000073 PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
drhed7c8552001-04-11 14:29:21 +000074 int nRef; /* Number of users of this page */
drhd9b02572001-04-15 00:37:09 +000075 PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
76 PgHdr *pNextAll, *pPrevAll; /* A list of all pages */
drh03eb96a2002-11-10 23:32:56 +000077 PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */
78 PgHdr *pSort; /* Next in list of pages to be written */
drh193a6b42002-07-07 16:52:46 +000079 u8 inJournal; /* TRUE if has been written to journal */
80 u8 inCkpt; /* TRUE if written to the checkpoint journal */
81 u8 dirty; /* TRUE if we need to write back changes */
82 u8 alwaysRollback; /* Disable dont_rollback() for this page */
drh69688d52001-04-14 16:38:23 +000083 /* SQLITE_PAGE_SIZE bytes of page data follow this header */
drh7e3b0a02001-04-28 16:52:40 +000084 /* Pager.nExtra bytes of local data follow the page data */
drhed7c8552001-04-11 14:29:21 +000085};
86
/*
** Conversions between a page header pointer and the memory that
** follows it:
**
**   PGHDR_TO_DATA(P)    header  -> page data (immediately after header)
**   DATA_TO_PGHDR(D)    page data -> header
**   PGHDR_TO_EXTRA(P)   header  -> extra area, which begins
**                       SQLITE_PAGE_SIZE bytes after the page data
*/
#define PGHDR_TO_DATA(P)   ((void*)((P)+1))
#define DATA_TO_PGHDR(D)   (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P)  ((void*)(((char*)((P)+1))+SQLITE_PAGE_SIZE))
drhed7c8552001-04-11 14:29:21 +000094
/*
** Number of buckets in the hash table that maps a page number to its
** in-memory page.  Knuth says this should be a prime number.
*/
#define N_PG_HASH  2003
drhed7c8552001-04-11 14:29:21 +0000100
101/*
102** A open page cache is an instance of the following structure.
103*/
104struct Pager {
105 char *zFilename; /* Name of the database file */
106 char *zJournal; /* Name of the journal file */
drh8cfbf082001-09-19 13:22:39 +0000107 OsFile fd, jfd; /* File descriptors for database and journal */
drhfa86c412002-02-02 15:01:15 +0000108 OsFile cpfd; /* File descriptor for the checkpoint journal */
drhed7c8552001-04-11 14:29:21 +0000109 int dbSize; /* Number of pages in the file */
drh69688d52001-04-14 16:38:23 +0000110 int origDbSize; /* dbSize before the current change */
drh28be87c2002-11-05 23:03:02 +0000111 int ckptSize; /* Size of database (in pages) at ckpt_begin() */
112 off_t ckptJSize; /* Size of journal at ckpt_begin() */
drh7e3b0a02001-04-28 16:52:40 +0000113 int nExtra; /* Add this many bytes to each in-memory page */
drh72f82862001-05-24 21:06:34 +0000114 void (*xDestructor)(void*); /* Call this routine when freeing pages */
drhed7c8552001-04-11 14:29:21 +0000115 int nPage; /* Total number of in-memory pages */
drhd9b02572001-04-15 00:37:09 +0000116 int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
drhed7c8552001-04-11 14:29:21 +0000117 int mxPage; /* Maximum number of pages to hold in cache */
drhd9b02572001-04-15 00:37:09 +0000118 int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
drh603240c2002-03-05 01:11:12 +0000119 u8 journalOpen; /* True if journal file descriptors is valid */
120 u8 ckptOpen; /* True if the checkpoint journal is open */
drh0f892532002-05-30 12:27:03 +0000121 u8 ckptInUse; /* True we are in a checkpoint */
drh603240c2002-03-05 01:11:12 +0000122 u8 noSync; /* Do not sync the journal if true */
123 u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
124 u8 errMask; /* One of several kinds of errors */
125 u8 tempFile; /* zFilename is a temporary file */
126 u8 readOnly; /* True for a read-only database */
127 u8 needSync; /* True if an fsync() is needed on the journal */
drha1680452002-04-18 01:56:57 +0000128 u8 dirtyFile; /* True if database file has changed in any way */
drh193a6b42002-07-07 16:52:46 +0000129 u8 alwaysRollback; /* Disable dont_rollback() for all pages */
drh94f33312002-08-12 12:29:56 +0000130 u8 journalFormat; /* Version number of the journal file */
drh603240c2002-03-05 01:11:12 +0000131 u8 *aInJournal; /* One bit for each page in the database file */
132 u8 *aInCkpt; /* One bit for each page in the database */
drhed7c8552001-04-11 14:29:21 +0000133 PgHdr *pFirst, *pLast; /* List of free pages */
drhd9b02572001-04-15 00:37:09 +0000134 PgHdr *pAll; /* List of all pages */
drh03eb96a2002-11-10 23:32:56 +0000135 PgHdr *pCkpt; /* List of pages in the checkpoint journal */
drhed7c8552001-04-11 14:29:21 +0000136 PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */
drhd9b02572001-04-15 00:37:09 +0000137};
138
/*
** Bit flags that may be OR-ed together in Pager.errMask.
** pager_errcode() translates the mask into an SQLITE_* result code.
*/
#define PAGER_ERR_FULL     0x01  /* a write() failed */
#define PAGER_ERR_MEM      0x02  /* malloc() failed */
#define PAGER_ERR_LOCK     0x04  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  0x08  /* database or journal corruption */
#define PAGER_ERR_DISK     0x10  /* general disk I/O error - bad hard drive? */
drhd9b02572001-04-15 00:37:09 +0000147
148/*
149** The journal file contains page records in the following
150** format.
151*/
152typedef struct PageRecord PageRecord;
153struct PageRecord {
154 Pgno pgno; /* The page number */
155 char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */
156};
157
/*
** Every journal file begins with one of the following magic strings.
** The bytes were obtained from /dev/random and serve only as a sanity
** check.
**
** Two journal formats exist.  The older format stores 32-bit integers
** in the byte order of the host machine; the newer format stores them
** big-endian.  All new journals use the new format, but an older
** journal must still be readable so that it can be rolled back.  The
** two magic strings differ only in their final byte.
*/
static const unsigned char aOldJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4,
};
static const unsigned char aJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5,
};
#define SQLITE_NEW_JOURNAL_FORMAT  1
#define SQLITE_OLD_JOURNAL_FORMAT  0
176
/*
** When non-zero, journals are written in the old (host byte order)
** format.  This exists for testing only, to confirm that the code can
** still roll back an old-format journal.  In SQLITE_TEST builds it is
** a global variable; otherwise it is the constant 0.
*/
#ifdef SQLITE_TEST
int pager_old_format = 0;
#else
# define pager_old_format 0
#endif
drhed7c8552001-04-11 14:29:21 +0000187
/*
** Map a page number onto a bucket index of Pager.aHash[].
*/
#define pager_hash(PN)  ((PN)%N_PG_HASH)
drhed7c8552001-04-11 14:29:21 +0000192
/*
** Reference count tracking (test builds only).
**
** When SQLITE_TEST is defined and pager_refinfo_enable is non-zero,
** REFINFO(X) prints the page number, data address and reference
** count of page X.  In all other builds REFINFO(X) expands to nothing.
*/
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    /* The address is printed with %p.  Casting the pointer to int, as
    ** was done previously, discards the upper half of the address on
    ** platforms where pointers are wider than int.  The page number is
    ** unsigned, so it is printed with %u. */
    printf(
       "REFCNT: %4u addr=%p nRef=%d\n",
       (unsigned)p->pgno, PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
211
212/*
drh94f33312002-08-12 12:29:56 +0000213** Read a 32-bit integer from the given file descriptor
214*/
215static int read32bits(Pager *pPager, OsFile *fd, u32 *pRes){
216 u32 res;
217 int rc;
218 rc = sqliteOsRead(fd, &res, sizeof(res));
219 if( rc==SQLITE_OK && pPager->journalFormat==SQLITE_NEW_JOURNAL_FORMAT ){
220 unsigned char ac[4];
221 memcpy(ac, &res, 4);
222 res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3];
223 }
224 *pRes = res;
225 return rc;
226}
227
228/*
229** Write a 32-bit integer into the given file descriptor. Writing
230** is always done using the new journal format.
231*/
232static int write32bits(OsFile *fd, u32 val){
233 unsigned char ac[4];
drh94f33312002-08-12 12:29:56 +0000234 if( pager_old_format ){
235 return sqliteOsWrite(fd, &val, 4);
236 }
drh94f33312002-08-12 12:29:56 +0000237 ac[0] = (val>>24) & 0xff;
238 ac[1] = (val>>16) & 0xff;
239 ac[2] = (val>>8) & 0xff;
240 ac[3] = val & 0xff;
241 return sqliteOsWrite(fd, ac, 4);
242}
243
244
245/*
drhd9b02572001-04-15 00:37:09 +0000246** Convert the bits in the pPager->errMask into an approprate
247** return code.
248*/
249static int pager_errcode(Pager *pPager){
250 int rc = SQLITE_OK;
251 if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL;
drh81a20f22001-10-12 17:30:04 +0000252 if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR;
drhd9b02572001-04-15 00:37:09 +0000253 if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL;
254 if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM;
255 if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT;
256 return rc;
drhed7c8552001-04-11 14:29:21 +0000257}
258
259/*
drh03eb96a2002-11-10 23:32:56 +0000260** Add or remove a page from the list of all pages that are in the
261** checkpoint journal.
262**
263** The Pager keeps a separate list of pages that are currently in
264** the checkpoint journal. This helps the sqlitepager_ckpt_commit()
265** routine run MUCH faster for the common case where there are many
266** pages in memory but only a few are in the checkpoint journal.
267*/
268static void page_add_to_ckpt_list(PgHdr *pPg){
269 Pager *pPager = pPg->pPager;
270 if( pPg->inCkpt ) return;
271 assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
272 pPg->pPrevCkpt = 0;
273 if( pPager->pCkpt ){
274 pPager->pCkpt->pPrevCkpt = pPg;
275 }
276 pPg->pNextCkpt = pPager->pCkpt;
277 pPager->pCkpt = pPg;
278 pPg->inCkpt = 1;
279}
280static void page_remove_from_ckpt_list(PgHdr *pPg){
281 if( !pPg->inCkpt ) return;
282 if( pPg->pPrevCkpt ){
283 assert( pPg->pPrevCkpt->pNextCkpt==pPg );
284 pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt;
285 }else{
286 assert( pPg->pPager->pCkpt==pPg );
287 pPg->pPager->pCkpt = pPg->pNextCkpt;
288 }
289 if( pPg->pNextCkpt ){
290 assert( pPg->pNextCkpt->pPrevCkpt==pPg );
291 pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt;
292 }
293 pPg->pNextCkpt = 0;
294 pPg->pPrevCkpt = 0;
295 pPg->inCkpt = 0;
296}
297
298/*
drhed7c8552001-04-11 14:29:21 +0000299** Find a page in the hash table given its page number. Return
300** a pointer to the page or NULL if not found.
301*/
drhd9b02572001-04-15 00:37:09 +0000302static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
drhed7c8552001-04-11 14:29:21 +0000303 PgHdr *p = pPager->aHash[pgno % N_PG_HASH];
304 while( p && p->pgno!=pgno ){
305 p = p->pNextHash;
306 }
307 return p;
308}
309
310/*
311** Unlock the database and clear the in-memory cache. This routine
312** sets the state of the pager back to what it was when it was first
313** opened. Any outstanding pages are invalidated and subsequent attempts
314** to access those pages will likely result in a coredump.
315*/
drhd9b02572001-04-15 00:37:09 +0000316static void pager_reset(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +0000317 PgHdr *pPg, *pNext;
drhd9b02572001-04-15 00:37:09 +0000318 for(pPg=pPager->pAll; pPg; pPg=pNext){
319 pNext = pPg->pNextAll;
320 sqliteFree(pPg);
drhed7c8552001-04-11 14:29:21 +0000321 }
322 pPager->pFirst = 0;
drhd9b02572001-04-15 00:37:09 +0000323 pPager->pLast = 0;
324 pPager->pAll = 0;
drhed7c8552001-04-11 14:29:21 +0000325 memset(pPager->aHash, 0, sizeof(pPager->aHash));
326 pPager->nPage = 0;
drhfa86c412002-02-02 15:01:15 +0000327 if( pPager->state>=SQLITE_WRITELOCK ){
drhd9b02572001-04-15 00:37:09 +0000328 sqlitepager_rollback(pPager);
drhed7c8552001-04-11 14:29:21 +0000329 }
drha7fcb052001-12-14 15:09:55 +0000330 sqliteOsUnlock(&pPager->fd);
drhed7c8552001-04-11 14:29:21 +0000331 pPager->state = SQLITE_UNLOCK;
drhd9b02572001-04-15 00:37:09 +0000332 pPager->dbSize = -1;
drhed7c8552001-04-11 14:29:21 +0000333 pPager->nRef = 0;
drh8cfbf082001-09-19 13:22:39 +0000334 assert( pPager->journalOpen==0 );
drhed7c8552001-04-11 14:29:21 +0000335}
336
337/*
338** When this routine is called, the pager has the journal file open and
339** a write lock on the database. This routine releases the database
340** write lock and acquires a read lock in its place. The journal file
341** is deleted and closed.
drhed7c8552001-04-11 14:29:21 +0000342*/
drhd9b02572001-04-15 00:37:09 +0000343static int pager_unwritelock(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +0000344 int rc;
drhd9b02572001-04-15 00:37:09 +0000345 PgHdr *pPg;
drhfa86c412002-02-02 15:01:15 +0000346 if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK;
drh663fc632002-02-02 18:49:19 +0000347 sqlitepager_ckpt_commit(pPager);
drh0f892532002-05-30 12:27:03 +0000348 if( pPager->ckptOpen ){
349 sqliteOsClose(&pPager->cpfd);
350 pPager->ckptOpen = 0;
351 }
drha7fcb052001-12-14 15:09:55 +0000352 sqliteOsClose(&pPager->jfd);
drh8cfbf082001-09-19 13:22:39 +0000353 pPager->journalOpen = 0;
354 sqliteOsDelete(pPager->zJournal);
drha7fcb052001-12-14 15:09:55 +0000355 rc = sqliteOsReadLock(&pPager->fd);
drh6019e162001-07-02 17:51:45 +0000356 sqliteFree( pPager->aInJournal );
357 pPager->aInJournal = 0;
drhd9b02572001-04-15 00:37:09 +0000358 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
359 pPg->inJournal = 0;
360 pPg->dirty = 0;
361 }
drh8e298f92002-07-06 16:28:47 +0000362 if( rc==SQLITE_OK ){
363 pPager->state = SQLITE_READLOCK;
364 }else{
365 /* This can only happen if a process does a BEGIN, then forks and the
366 ** child process does the COMMIT. Because of the semantics of unix
367 ** file locking, the unlock will fail.
368 */
369 pPager->state = SQLITE_UNLOCK;
370 }
drhed7c8552001-04-11 14:29:21 +0000371 return rc;
372}
373
drhed7c8552001-04-11 14:29:21 +0000374/*
drhfa86c412002-02-02 15:01:15 +0000375** Read a single page from the journal file opened on file descriptor
376** jfd. Playback this one page.
377*/
378static int pager_playback_one_page(Pager *pPager, OsFile *jfd){
379 int rc;
380 PgHdr *pPg; /* An existing page in the cache */
381 PageRecord pgRec;
382
drh94f33312002-08-12 12:29:56 +0000383 rc = read32bits(pPager, jfd, &pgRec.pgno);
384 if( rc!=SQLITE_OK ) return rc;
385 rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
drhfa86c412002-02-02 15:01:15 +0000386 if( rc!=SQLITE_OK ) return rc;
387
388 /* Sanity checking on the page */
389 if( pgRec.pgno>pPager->dbSize || pgRec.pgno==0 ) return SQLITE_CORRUPT;
390
391 /* Playback the page. Update the in-memory copy of the page
392 ** at the same time, if there is one.
393 */
394 pPg = pager_lookup(pPager, pgRec.pgno);
395 if( pPg ){
396 memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
397 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
398 }
399 rc = sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*SQLITE_PAGE_SIZE);
400 if( rc==SQLITE_OK ){
401 rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
402 }
403 return rc;
404}
405
406/*
drhed7c8552001-04-11 14:29:21 +0000407** Playback the journal and thus restore the database file to
408** the state it was in before we started making changes.
409**
drhd9b02572001-04-15 00:37:09 +0000410** The journal file format is as follows: There is an initial
411** file-type string for sanity checking. Then there is a single
412** Pgno number which is the number of pages in the database before
413** changes were made. The database is truncated to this size.
drh306dc212001-05-21 13:45:10 +0000414** Next come zero or more page records where each page record
415** consists of a Pgno and SQLITE_PAGE_SIZE bytes of data. See
416** the PageRecord structure for details.
drhed7c8552001-04-11 14:29:21 +0000417**
drhd9b02572001-04-15 00:37:09 +0000418** If the file opened as the journal file is not a well-formed
419** journal file (as determined by looking at the magic number
420** at the beginning) then this routine returns SQLITE_PROTOCOL.
421** If any other errors occur during playback, the database will
422** likely be corrupted, so the PAGER_ERR_CORRUPT bit is set in
423** pPager->errMask and SQLITE_CORRUPT is returned. If it all
424** works, then this routine returns SQLITE_OK.
drhed7c8552001-04-11 14:29:21 +0000425*/
drhd9b02572001-04-15 00:37:09 +0000426static int pager_playback(Pager *pPager){
drh28be87c2002-11-05 23:03:02 +0000427 off_t nRec; /* Number of Records */
drhd9b02572001-04-15 00:37:09 +0000428 int i; /* Loop counter */
429 Pgno mxPg = 0; /* Size of the original file in pages */
drhd9b02572001-04-15 00:37:09 +0000430 unsigned char aMagic[sizeof(aJournalMagic)];
drhed7c8552001-04-11 14:29:21 +0000431 int rc;
432
drhc3a64ba2001-11-22 00:01:27 +0000433 /* Figure out how many records are in the journal. Abort early if
434 ** the journal is empty.
drhed7c8552001-04-11 14:29:21 +0000435 */
drh8cfbf082001-09-19 13:22:39 +0000436 assert( pPager->journalOpen );
drha7fcb052001-12-14 15:09:55 +0000437 sqliteOsSeek(&pPager->jfd, 0);
438 rc = sqliteOsFileSize(&pPager->jfd, &nRec);
drhc3a64ba2001-11-22 00:01:27 +0000439 if( rc!=SQLITE_OK ){
440 goto end_playback;
441 }
drh28be87c2002-11-05 23:03:02 +0000442 if( nRec <= sizeof(aMagic)+sizeof(Pgno) ){
drhc3a64ba2001-11-22 00:01:27 +0000443 goto end_playback;
444 }
drh28be87c2002-11-05 23:03:02 +0000445 nRec = (nRec - (sizeof(aMagic)+sizeof(Pgno))) / sizeof(PageRecord);
drhc3a64ba2001-11-22 00:01:27 +0000446
447 /* Read the beginning of the journal and truncate the
448 ** database file back to its original size.
449 */
drha7fcb052001-12-14 15:09:55 +0000450 rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
drh94f33312002-08-12 12:29:56 +0000451 if( rc!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000452 rc = SQLITE_PROTOCOL;
453 goto end_playback;
drhd9b02572001-04-15 00:37:09 +0000454 }
drh94f33312002-08-12 12:29:56 +0000455 if( memcmp(aMagic, aOldJournalMagic, sizeof(aMagic))==0 ){
456 pPager->journalFormat = SQLITE_OLD_JOURNAL_FORMAT;
457 }else if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))==0 ){
458 pPager->journalFormat = SQLITE_NEW_JOURNAL_FORMAT;
459 }else{
460 rc = SQLITE_PROTOCOL;
461 goto end_playback;
462 }
463 rc = read32bits(pPager, &pPager->jfd, &mxPg);
drhd9b02572001-04-15 00:37:09 +0000464 if( rc!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000465 goto end_playback;
drhd9b02572001-04-15 00:37:09 +0000466 }
drh28be87c2002-11-05 23:03:02 +0000467 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
drh81a20f22001-10-12 17:30:04 +0000468 if( rc!=SQLITE_OK ){
469 goto end_playback;
470 }
drhd9b02572001-04-15 00:37:09 +0000471 pPager->dbSize = mxPg;
472
drhfa86c412002-02-02 15:01:15 +0000473 /* Copy original pages out of the journal and back into the database file.
drhed7c8552001-04-11 14:29:21 +0000474 */
drhd9b02572001-04-15 00:37:09 +0000475 for(i=nRec-1; i>=0; i--){
drhfa86c412002-02-02 15:01:15 +0000476 rc = pager_playback_one_page(pPager, &pPager->jfd);
drhd9b02572001-04-15 00:37:09 +0000477 if( rc!=SQLITE_OK ) break;
drhed7c8552001-04-11 14:29:21 +0000478 }
drh81a20f22001-10-12 17:30:04 +0000479
480end_playback:
drhd9b02572001-04-15 00:37:09 +0000481 if( rc!=SQLITE_OK ){
482 pager_unwritelock(pPager);
483 pPager->errMask |= PAGER_ERR_CORRUPT;
484 rc = SQLITE_CORRUPT;
485 }else{
486 rc = pager_unwritelock(pPager);
drhed7c8552001-04-11 14:29:21 +0000487 }
drhd9b02572001-04-15 00:37:09 +0000488 return rc;
drhed7c8552001-04-11 14:29:21 +0000489}
490
491/*
drhfa86c412002-02-02 15:01:15 +0000492** Playback the checkpoint journal.
493**
494** This is similar to playing back the transaction journal but with
495** a few extra twists.
496**
drh663fc632002-02-02 18:49:19 +0000497** (1) The number of pages in the database file at the start of
498** the checkpoint is stored in pPager->ckptSize, not in the
499** journal file itself.
drhfa86c412002-02-02 15:01:15 +0000500**
501** (2) In addition to playing back the checkpoint journal, also
502** playback all pages of the transaction journal beginning
503** at offset pPager->ckptJSize.
504*/
505static int pager_ckpt_playback(Pager *pPager){
drh28be87c2002-11-05 23:03:02 +0000506 off_t nRec; /* Number of Records */
drhfa86c412002-02-02 15:01:15 +0000507 int i; /* Loop counter */
508 int rc;
509
510 /* Truncate the database back to its original size.
511 */
drh28be87c2002-11-05 23:03:02 +0000512 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
drhfa86c412002-02-02 15:01:15 +0000513 pPager->dbSize = pPager->ckptSize;
514
515 /* Figure out how many records are in the checkpoint journal.
516 */
drh0f892532002-05-30 12:27:03 +0000517 assert( pPager->ckptInUse && pPager->journalOpen );
drhfa86c412002-02-02 15:01:15 +0000518 sqliteOsSeek(&pPager->cpfd, 0);
519 rc = sqliteOsFileSize(&pPager->cpfd, &nRec);
520 if( rc!=SQLITE_OK ){
521 goto end_ckpt_playback;
522 }
523 nRec /= sizeof(PageRecord);
524
525 /* Copy original pages out of the checkpoint journal and back into the
526 ** database file.
527 */
drh74587e52002-08-13 00:01:16 +0000528 if( pager_old_format ){
529 pPager->journalFormat = SQLITE_OLD_JOURNAL_FORMAT;
530 }else{
531 pPager->journalFormat = SQLITE_NEW_JOURNAL_FORMAT;
532 }
drhfa86c412002-02-02 15:01:15 +0000533 for(i=nRec-1; i>=0; i--){
534 rc = pager_playback_one_page(pPager, &pPager->cpfd);
535 if( rc!=SQLITE_OK ) goto end_ckpt_playback;
536 }
537
538 /* Figure out how many pages need to be copied out of the transaction
539 ** journal.
540 */
541 rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
542 if( rc!=SQLITE_OK ){
543 goto end_ckpt_playback;
544 }
545 rc = sqliteOsFileSize(&pPager->jfd, &nRec);
546 if( rc!=SQLITE_OK ){
547 goto end_ckpt_playback;
548 }
549 nRec = (nRec - pPager->ckptJSize)/sizeof(PageRecord);
550 for(i=nRec-1; i>=0; i--){
551 rc = pager_playback_one_page(pPager, &pPager->jfd);
552 if( rc!=SQLITE_OK ) goto end_ckpt_playback;
553 }
554
555
556end_ckpt_playback:
drhfa86c412002-02-02 15:01:15 +0000557 if( rc!=SQLITE_OK ){
drhfa86c412002-02-02 15:01:15 +0000558 pPager->errMask |= PAGER_ERR_CORRUPT;
559 rc = SQLITE_CORRUPT;
drhfa86c412002-02-02 15:01:15 +0000560 }
561 return rc;
562}
563
564/*
drhf57b14a2001-09-14 18:54:08 +0000565** Change the maximum number of in-memory pages that are allowed.
drhcd61c282002-03-06 22:01:34 +0000566**
567** The maximum number is the absolute value of the mxPage parameter.
568** If mxPage is negative, the noSync flag is also set. noSync bypasses
569** calls to sqliteOsSync(). The pager runs much faster with noSync on,
570** but if the operating system crashes or there is an abrupt power
571** failure, the database file might be left in an inconsistent and
572** unrepairable state.
drhf57b14a2001-09-14 18:54:08 +0000573*/
574void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
drh603240c2002-03-05 01:11:12 +0000575 if( mxPage>=0 ){
drha1680452002-04-18 01:56:57 +0000576 pPager->noSync = pPager->tempFile;
drh603240c2002-03-05 01:11:12 +0000577 }else{
578 pPager->noSync = 1;
579 mxPage = -mxPage;
580 }
drhf57b14a2001-09-14 18:54:08 +0000581 if( mxPage>10 ){
582 pPager->mxPage = mxPage;
583 }
584}
585
586/*
drhfa86c412002-02-02 15:01:15 +0000587** Open a temporary file. Write the name of the file into zName
588** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write
589** the file descriptor into *fd. Return SQLITE_OK on success or some
590** other error code if we fail.
591**
592** The OS will automatically delete the temporary file when it is
593** closed.
594*/
595static int sqlitepager_opentemp(char *zFile, OsFile *fd){
596 int cnt = 8;
597 int rc;
598 do{
599 cnt--;
600 sqliteOsTempFileName(zFile);
601 rc = sqliteOsOpenExclusive(zFile, fd, 1);
602 }while( cnt>0 && rc!=SQLITE_OK );
603 return rc;
604}
605
606/*
drhed7c8552001-04-11 14:29:21 +0000607** Create a new page cache and put a pointer to the page cache in *ppPager.
drh5e00f6c2001-09-13 13:46:56 +0000608** The file to be cached need not exist. The file is not locked until
drhd9b02572001-04-15 00:37:09 +0000609** the first call to sqlitepager_get() and is only held open until the
610** last page is released using sqlitepager_unref().
drh382c0242001-10-06 16:33:02 +0000611**
drh6446c4d2001-12-15 14:22:18 +0000612** If zFilename is NULL then a randomly-named temporary file is created
613** and used as the file to be cached. The file will be deleted
614** automatically when it is closed.
drhed7c8552001-04-11 14:29:21 +0000615*/
drh7e3b0a02001-04-28 16:52:40 +0000616int sqlitepager_open(
617 Pager **ppPager, /* Return the Pager structure here */
618 const char *zFilename, /* Name of the database file to open */
619 int mxPage, /* Max number of in-memory cache pages */
620 int nExtra /* Extra bytes append to each in-memory page */
621){
drhed7c8552001-04-11 14:29:21 +0000622 Pager *pPager;
623 int nameLen;
drh8cfbf082001-09-19 13:22:39 +0000624 OsFile fd;
625 int rc;
drh5e00f6c2001-09-13 13:46:56 +0000626 int tempFile;
627 int readOnly = 0;
drh8cfbf082001-09-19 13:22:39 +0000628 char zTemp[SQLITE_TEMPNAME_SIZE];
drhed7c8552001-04-11 14:29:21 +0000629
drhd9b02572001-04-15 00:37:09 +0000630 *ppPager = 0;
631 if( sqlite_malloc_failed ){
632 return SQLITE_NOMEM;
633 }
drh5e00f6c2001-09-13 13:46:56 +0000634 if( zFilename ){
drh8cfbf082001-09-19 13:22:39 +0000635 rc = sqliteOsOpenReadWrite(zFilename, &fd, &readOnly);
drh5e00f6c2001-09-13 13:46:56 +0000636 tempFile = 0;
637 }else{
drhfa86c412002-02-02 15:01:15 +0000638 rc = sqlitepager_opentemp(zTemp, &fd);
drh5e00f6c2001-09-13 13:46:56 +0000639 zFilename = zTemp;
640 tempFile = 1;
641 }
drh8cfbf082001-09-19 13:22:39 +0000642 if( rc!=SQLITE_OK ){
drhed7c8552001-04-11 14:29:21 +0000643 return SQLITE_CANTOPEN;
644 }
645 nameLen = strlen(zFilename);
646 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*2 + 30 );
drhd9b02572001-04-15 00:37:09 +0000647 if( pPager==0 ){
drha7fcb052001-12-14 15:09:55 +0000648 sqliteOsClose(&fd);
drhd9b02572001-04-15 00:37:09 +0000649 return SQLITE_NOMEM;
650 }
drhed7c8552001-04-11 14:29:21 +0000651 pPager->zFilename = (char*)&pPager[1];
652 pPager->zJournal = &pPager->zFilename[nameLen+1];
653 strcpy(pPager->zFilename, zFilename);
654 strcpy(pPager->zJournal, zFilename);
655 strcpy(&pPager->zJournal[nameLen], "-journal");
656 pPager->fd = fd;
drh8cfbf082001-09-19 13:22:39 +0000657 pPager->journalOpen = 0;
drhfa86c412002-02-02 15:01:15 +0000658 pPager->ckptOpen = 0;
drh0f892532002-05-30 12:27:03 +0000659 pPager->ckptInUse = 0;
drhed7c8552001-04-11 14:29:21 +0000660 pPager->nRef = 0;
661 pPager->dbSize = -1;
drhfa86c412002-02-02 15:01:15 +0000662 pPager->ckptSize = 0;
663 pPager->ckptJSize = 0;
drhed7c8552001-04-11 14:29:21 +0000664 pPager->nPage = 0;
drhd79caeb2001-04-15 02:27:24 +0000665 pPager->mxPage = mxPage>5 ? mxPage : 10;
drhed7c8552001-04-11 14:29:21 +0000666 pPager->state = SQLITE_UNLOCK;
drhd9b02572001-04-15 00:37:09 +0000667 pPager->errMask = 0;
drh5e00f6c2001-09-13 13:46:56 +0000668 pPager->tempFile = tempFile;
669 pPager->readOnly = readOnly;
drhf57b14a2001-09-14 18:54:08 +0000670 pPager->needSync = 0;
drha1680452002-04-18 01:56:57 +0000671 pPager->noSync = pPager->tempFile;
drhed7c8552001-04-11 14:29:21 +0000672 pPager->pFirst = 0;
673 pPager->pLast = 0;
drh7c717f72001-06-24 20:39:41 +0000674 pPager->nExtra = nExtra;
drhed7c8552001-04-11 14:29:21 +0000675 memset(pPager->aHash, 0, sizeof(pPager->aHash));
676 *ppPager = pPager;
677 return SQLITE_OK;
678}
679
680/*
drh72f82862001-05-24 21:06:34 +0000681** Set the destructor for this pager. If not NULL, the destructor is called
drh5e00f6c2001-09-13 13:46:56 +0000682** when the reference count on each page reaches zero. The destructor can
683** be used to clean up information in the extra segment appended to each page.
drh72f82862001-05-24 21:06:34 +0000684**
685** The destructor is not called as a result sqlitepager_close().
686** Destructors are only called by sqlitepager_unref().
687*/
688void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
689 pPager->xDestructor = xDesc;
690}
691
692/*
drh5e00f6c2001-09-13 13:46:56 +0000693** Return the total number of pages in the disk file associated with
694** pPager.
drhed7c8552001-04-11 14:29:21 +0000695*/
drhd9b02572001-04-15 00:37:09 +0000696int sqlitepager_pagecount(Pager *pPager){
drh28be87c2002-11-05 23:03:02 +0000697 off_t n;
drhd9b02572001-04-15 00:37:09 +0000698 assert( pPager!=0 );
drhed7c8552001-04-11 14:29:21 +0000699 if( pPager->dbSize>=0 ){
700 return pPager->dbSize;
701 }
drha7fcb052001-12-14 15:09:55 +0000702 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
drh81a20f22001-10-12 17:30:04 +0000703 pPager->errMask |= PAGER_ERR_DISK;
drh8cfbf082001-09-19 13:22:39 +0000704 return 0;
drhed7c8552001-04-11 14:29:21 +0000705 }
drh8cfbf082001-09-19 13:22:39 +0000706 n /= SQLITE_PAGE_SIZE;
drhd9b02572001-04-15 00:37:09 +0000707 if( pPager->state!=SQLITE_UNLOCK ){
drhed7c8552001-04-11 14:29:21 +0000708 pPager->dbSize = n;
709 }
710 return n;
711}
712
713/*
714** Shutdown the page cache. Free all memory and close all files.
715**
716** If a transaction was in progress when this routine is called, that
717** transaction is rolled back. All outstanding pages are invalidated
718** and their memory is freed. Any attempt to use a page associated
719** with this page cache after this function returns will likely
720** result in a coredump.
721*/
drhd9b02572001-04-15 00:37:09 +0000722int sqlitepager_close(Pager *pPager){
723 PgHdr *pPg, *pNext;
drhed7c8552001-04-11 14:29:21 +0000724 switch( pPager->state ){
725 case SQLITE_WRITELOCK: {
drhd9b02572001-04-15 00:37:09 +0000726 sqlitepager_rollback(pPager);
drha7fcb052001-12-14 15:09:55 +0000727 sqliteOsUnlock(&pPager->fd);
drh8cfbf082001-09-19 13:22:39 +0000728 assert( pPager->journalOpen==0 );
drhed7c8552001-04-11 14:29:21 +0000729 break;
730 }
731 case SQLITE_READLOCK: {
drha7fcb052001-12-14 15:09:55 +0000732 sqliteOsUnlock(&pPager->fd);
drhed7c8552001-04-11 14:29:21 +0000733 break;
734 }
735 default: {
736 /* Do nothing */
737 break;
738 }
739 }
drhd9b02572001-04-15 00:37:09 +0000740 for(pPg=pPager->pAll; pPg; pPg=pNext){
741 pNext = pPg->pNextAll;
742 sqliteFree(pPg);
drhed7c8552001-04-11 14:29:21 +0000743 }
drha7fcb052001-12-14 15:09:55 +0000744 sqliteOsClose(&pPager->fd);
drh8cfbf082001-09-19 13:22:39 +0000745 assert( pPager->journalOpen==0 );
drh0f892532002-05-30 12:27:03 +0000746 /* Temp files are automatically deleted by the OS
747 ** if( pPager->tempFile ){
748 ** sqliteOsDelete(pPager->zFilename);
749 ** }
750 */
drhed7c8552001-04-11 14:29:21 +0000751 sqliteFree(pPager);
752 return SQLITE_OK;
753}
754
755/*
drh5e00f6c2001-09-13 13:46:56 +0000756** Return the page number for the given page data.
drhed7c8552001-04-11 14:29:21 +0000757*/
drhd9b02572001-04-15 00:37:09 +0000758Pgno sqlitepager_pagenumber(void *pData){
drhed7c8552001-04-11 14:29:21 +0000759 PgHdr *p = DATA_TO_PGHDR(pData);
760 return p->pgno;
761}
762
763/*
drh7e3b0a02001-04-28 16:52:40 +0000764** Increment the reference count for a page. If the page is
765** currently on the freelist (the reference count is zero) then
766** remove it from the freelist.
767*/
drhdf0b3b02001-06-23 11:36:20 +0000768static void page_ref(PgHdr *pPg){
drh7e3b0a02001-04-28 16:52:40 +0000769 if( pPg->nRef==0 ){
770 /* The page is currently on the freelist. Remove it. */
771 if( pPg->pPrevFree ){
772 pPg->pPrevFree->pNextFree = pPg->pNextFree;
773 }else{
774 pPg->pPager->pFirst = pPg->pNextFree;
775 }
776 if( pPg->pNextFree ){
777 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
778 }else{
779 pPg->pPager->pLast = pPg->pPrevFree;
780 }
781 pPg->pPager->nRef++;
782 }
783 pPg->nRef++;
drhdd793422001-06-28 01:54:48 +0000784 REFINFO(pPg);
drhdf0b3b02001-06-23 11:36:20 +0000785}
786
787/*
788** Increment the reference count for a page. The input pointer is
789** a reference to the page data.
790*/
791int sqlitepager_ref(void *pData){
792 PgHdr *pPg = DATA_TO_PGHDR(pData);
793 page_ref(pPg);
drh8c42ca92001-06-22 19:15:00 +0000794 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000795}
796
797/*
drh03eb96a2002-11-10 23:32:56 +0000798** The parameters are pointers to the head of two sorted lists
799** of page headers. Merge these two lists together and return
800** a single sorted list. This routine forms the core of the
801** merge-sort algorithm that sorts dirty pages into accending
802** order prior to writing them back to the disk.
803**
804** In the case of a tie, left sorts in front of right.
805**
806** Headers are sorted in order of ascending page number.
807*/
808static PgHdr *page_merge(PgHdr *pLeft, PgHdr *pRight){
809 PgHdr sHead;
810 PgHdr *pTail;
811 pTail = &sHead;
812 pTail->pSort = 0;
813 while( pLeft && pRight ){
814 if( pLeft->pgno<=pRight->pgno ){
815 pTail->pSort = pLeft;
816 pLeft = pLeft->pSort;
817 }else{
818 pTail->pSort = pRight;
819 pRight = pRight->pSort;
820 }
821 pTail = pTail->pSort;
822 }
823 if( pLeft ){
824 pTail->pSort = pLeft;
825 }else if( pRight ){
826 pTail->pSort = pRight;
827 }
828 return sHead.pSort;
829}
830
831
832/*
drhb19a2bc2001-09-16 00:13:26 +0000833** Sync the journal and then write all free dirty pages to the database
834** file.
835**
836** Writing all free dirty pages to the database after the sync is a
837** non-obvious optimization. fsync() is an expensive operation so we
drhaaab5722002-02-19 13:39:21 +0000838** want to minimize the number ot times it is called. After an fsync() call,
drh6446c4d2001-12-15 14:22:18 +0000839** we are free to write dirty pages back to the database. It is best
840** to go ahead and write as many dirty pages as possible to minimize
841** the risk of having to do another fsync() later on. Writing dirty
842** free pages in this way was observed to make database operations go
843** up to 10 times faster.
drhfa86c412002-02-02 15:01:15 +0000844**
845** If we are writing to temporary database, there is no need to preserve
846** the integrity of the journal file, so we can save time and skip the
847** fsync().
drh03eb96a2002-11-10 23:32:56 +0000848**
849** This routine goes to the extra trouble of sorting all the dirty
850** pages by their page number prior to writing them. Tests show that
851** writing pages in order by page number gives a modest speed improvement
852** under Linux.
drh50e5dad2001-09-15 00:57:28 +0000853*/
854static int syncAllPages(Pager *pPager){
855 PgHdr *pPg;
drh03eb96a2002-11-10 23:32:56 +0000856 PgHdr *pToWrite;
857# define NSORT 28
858 Pgno lastPgno;
859 int i;
860 PgHdr *apSorter[NSORT];
drh50e5dad2001-09-15 00:57:28 +0000861 int rc = SQLITE_OK;
drh03eb96a2002-11-10 23:32:56 +0000862
863 /* Sync the journal before modifying the main database
864 ** (assuming there is a journal and it needs to be synced.)
865 */
drh50e5dad2001-09-15 00:57:28 +0000866 if( pPager->needSync ){
drhfa86c412002-02-02 15:01:15 +0000867 if( !pPager->tempFile ){
868 rc = sqliteOsSync(&pPager->jfd);
869 if( rc!=0 ) return rc;
870 }
drh50e5dad2001-09-15 00:57:28 +0000871 pPager->needSync = 0;
872 }
drh03eb96a2002-11-10 23:32:56 +0000873
874 /* Create a list of all dirty pages
875 */
876 pToWrite = 0;
drh50e5dad2001-09-15 00:57:28 +0000877 for(pPg=pPager->pFirst; pPg; pPg=pPg->pNextFree){
878 if( pPg->dirty ){
drh03eb96a2002-11-10 23:32:56 +0000879 pPg->pSort = pToWrite;
880 pToWrite = pPg;
drh50e5dad2001-09-15 00:57:28 +0000881 }
882 }
drh03eb96a2002-11-10 23:32:56 +0000883
884 /* Sort the list of dirty pages into accending order by
885 ** page number
886 */
887 for(i=0; i<NSORT; i++){
888 apSorter[i] = 0;
889 }
890 while( pToWrite ){
891 pPg = pToWrite;
892 pToWrite = pPg->pSort;
893 pPg->pSort = 0;
894 for(i=0; i<NSORT-1; i++){
895 if( apSorter[i]==0 ){
896 apSorter[i] = pPg;
897 break;
898 }else{
899 pPg = page_merge(apSorter[i], pPg);
900 apSorter[i] = 0;
901 }
902 }
903 if( i>=NSORT-1 ){
904 apSorter[NSORT-1] = page_merge(apSorter[NSORT-1],pPg);
905 }
906 }
907 pToWrite = 0;
908 for(i=0; i<NSORT; i++){
909 pToWrite = page_merge(apSorter[i], pToWrite);
910 }
911
912 /* Write all dirty pages back to the database and mark
913 ** them all clean.
914 */
915 lastPgno = 0;
916 for(pPg=pToWrite; pPg; pPg=pPg->pSort){
917 if( lastPgno==0 || pPg->pgno!=lastPgno-1 ){
918 sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*SQLITE_PAGE_SIZE);
919 }
920 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
921 if( rc!=SQLITE_OK ) break;
922 pPg->dirty = 0;
923 lastPgno = pPg->pgno;
924 }
drh81a20f22001-10-12 17:30:04 +0000925 return rc;
drh50e5dad2001-09-15 00:57:28 +0000926}
927
928/*
drhd9b02572001-04-15 00:37:09 +0000929** Acquire a page.
930**
drh58a11682001-11-10 13:51:08 +0000931** A read lock on the disk file is obtained when the first page is acquired.
drh5e00f6c2001-09-13 13:46:56 +0000932** This read lock is dropped when the last page is released.
drhd9b02572001-04-15 00:37:09 +0000933**
drh306dc212001-05-21 13:45:10 +0000934** A _get works for any page number greater than 0. If the database
935** file is smaller than the requested page, then no actual disk
936** read occurs and the memory image of the page is initialized to
937** all zeros. The extra data appended to a page is always initialized
938** to zeros the first time a page is loaded into memory.
939**
drhd9b02572001-04-15 00:37:09 +0000940** The acquisition might fail for several reasons. In all cases,
941** an appropriate error code is returned and *ppPage is set to NULL.
drh7e3b0a02001-04-28 16:52:40 +0000942**
943** See also sqlitepager_lookup(). Both this routine and _lookup() attempt
944** to find a page in the in-memory cache first. If the page is not already
drh5e00f6c2001-09-13 13:46:56 +0000945** in memory, this routine goes to disk to read it in whereas _lookup()
drh7e3b0a02001-04-28 16:52:40 +0000946** just returns 0. This routine acquires a read-lock the first time it
947** has to go to disk, and could also playback an old journal if necessary.
948** Since _lookup() never goes to disk, it never has to deal with locks
949** or journal files.
drhed7c8552001-04-11 14:29:21 +0000950*/
drhd9b02572001-04-15 00:37:09 +0000951int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
drhed7c8552001-04-11 14:29:21 +0000952 PgHdr *pPg;
drh8766c342002-11-09 00:33:15 +0000953 int rc;
drhed7c8552001-04-11 14:29:21 +0000954
drhd9b02572001-04-15 00:37:09 +0000955 /* Make sure we have not hit any critical errors.
956 */
957 if( pPager==0 || pgno==0 ){
958 return SQLITE_ERROR;
959 }
960 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
961 return pager_errcode(pPager);
962 }
963
drhed7c8552001-04-11 14:29:21 +0000964 /* If this is the first page accessed, then get a read lock
965 ** on the database file.
966 */
967 if( pPager->nRef==0 ){
drh8766c342002-11-09 00:33:15 +0000968 rc = sqliteOsReadLock(&pPager->fd);
969 if( rc!=SQLITE_OK ){
drhed7c8552001-04-11 14:29:21 +0000970 *ppPage = 0;
drh8766c342002-11-09 00:33:15 +0000971 return rc;
drhed7c8552001-04-11 14:29:21 +0000972 }
drhd9b02572001-04-15 00:37:09 +0000973 pPager->state = SQLITE_READLOCK;
drhed7c8552001-04-11 14:29:21 +0000974
975 /* If a journal file exists, try to play it back.
976 */
drh8cfbf082001-09-19 13:22:39 +0000977 if( sqliteOsFileExists(pPager->zJournal) ){
drhf57b3392001-10-08 13:22:32 +0000978 int rc, dummy;
drhed7c8552001-04-11 14:29:21 +0000979
drha7fcb052001-12-14 15:09:55 +0000980 /* Get a write lock on the database
981 */
982 rc = sqliteOsWriteLock(&pPager->fd);
983 if( rc!=SQLITE_OK ){
drh8766c342002-11-09 00:33:15 +0000984 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
985 /* This should never happen! */
986 rc = SQLITE_INTERNAL;
987 }
drha7fcb052001-12-14 15:09:55 +0000988 *ppPage = 0;
drh8766c342002-11-09 00:33:15 +0000989 return rc;
drha7fcb052001-12-14 15:09:55 +0000990 }
991 pPager->state = SQLITE_WRITELOCK;
992
drhed7c8552001-04-11 14:29:21 +0000993 /* Open the journal for exclusive access. Return SQLITE_BUSY if
drhf57b3392001-10-08 13:22:32 +0000994 ** we cannot get exclusive access to the journal file.
995 **
996 ** Even though we will only be reading from the journal, not writing,
997 ** we have to open the journal for writing in order to obtain an
998 ** exclusive access lock.
drhed7c8552001-04-11 14:29:21 +0000999 */
drhf57b3392001-10-08 13:22:32 +00001000 rc = sqliteOsOpenReadWrite(pPager->zJournal, &pPager->jfd, &dummy);
drha7fcb052001-12-14 15:09:55 +00001001 if( rc!=SQLITE_OK ){
1002 rc = sqliteOsUnlock(&pPager->fd);
1003 assert( rc==SQLITE_OK );
drhed7c8552001-04-11 14:29:21 +00001004 *ppPage = 0;
1005 return SQLITE_BUSY;
1006 }
drha7fcb052001-12-14 15:09:55 +00001007 pPager->journalOpen = 1;
drhed7c8552001-04-11 14:29:21 +00001008
1009 /* Playback and delete the journal. Drop the database write
1010 ** lock and reacquire the read lock.
1011 */
drhd9b02572001-04-15 00:37:09 +00001012 rc = pager_playback(pPager);
1013 if( rc!=SQLITE_OK ){
1014 return rc;
1015 }
drhed7c8552001-04-11 14:29:21 +00001016 }
1017 pPg = 0;
1018 }else{
1019 /* Search for page in cache */
drhd9b02572001-04-15 00:37:09 +00001020 pPg = pager_lookup(pPager, pgno);
drhed7c8552001-04-11 14:29:21 +00001021 }
1022 if( pPg==0 ){
drhd9b02572001-04-15 00:37:09 +00001023 /* The requested page is not in the page cache. */
drhed7c8552001-04-11 14:29:21 +00001024 int h;
drh7e3b0a02001-04-28 16:52:40 +00001025 pPager->nMiss++;
drhed7c8552001-04-11 14:29:21 +00001026 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
1027 /* Create a new page */
drh7e3b0a02001-04-28 16:52:40 +00001028 pPg = sqliteMalloc( sizeof(*pPg) + SQLITE_PAGE_SIZE + pPager->nExtra );
drhd9b02572001-04-15 00:37:09 +00001029 if( pPg==0 ){
1030 *ppPage = 0;
1031 pager_unwritelock(pPager);
1032 pPager->errMask |= PAGER_ERR_MEM;
1033 return SQLITE_NOMEM;
1034 }
drhed7c8552001-04-11 14:29:21 +00001035 pPg->pPager = pPager;
drhd9b02572001-04-15 00:37:09 +00001036 pPg->pNextAll = pPager->pAll;
1037 if( pPager->pAll ){
1038 pPager->pAll->pPrevAll = pPg;
1039 }
1040 pPg->pPrevAll = 0;
drhd79caeb2001-04-15 02:27:24 +00001041 pPager->pAll = pPg;
drhd9b02572001-04-15 00:37:09 +00001042 pPager->nPage++;
drhed7c8552001-04-11 14:29:21 +00001043 }else{
drhd9b02572001-04-15 00:37:09 +00001044 /* Recycle an older page. First locate the page to be recycled.
1045 ** Try to find one that is not dirty and is near the head of
1046 ** of the free list */
drhed7c8552001-04-11 14:29:21 +00001047 pPg = pPager->pFirst;
drh603240c2002-03-05 01:11:12 +00001048 while( pPg && pPg->dirty ){
drhd9b02572001-04-15 00:37:09 +00001049 pPg = pPg->pNextFree;
1050 }
drhb19a2bc2001-09-16 00:13:26 +00001051
1052 /* If we could not find a page that has not been used recently
1053 ** and which is not dirty, then sync the journal and write all
1054 ** dirty free pages into the database file, thus making them
1055 ** clean pages and available for recycling.
1056 **
1057 ** We have to sync the journal before writing a page to the main
1058 ** database. But syncing is a very slow operation. So after a
1059 ** sync, it is best to write everything we can back to the main
1060 ** database to minimize the risk of having to sync again in the
drh94f33312002-08-12 12:29:56 +00001061 ** near future. That is why we write all dirty pages after a
drhb19a2bc2001-09-16 00:13:26 +00001062 ** sync.
1063 */
drh603240c2002-03-05 01:11:12 +00001064 if( pPg==0 ){
drh50e5dad2001-09-15 00:57:28 +00001065 int rc = syncAllPages(pPager);
1066 if( rc!=0 ){
1067 sqlitepager_rollback(pPager);
1068 *ppPage = 0;
1069 return SQLITE_IOERR;
1070 }
1071 pPg = pPager->pFirst;
1072 }
drhd9b02572001-04-15 00:37:09 +00001073 assert( pPg->nRef==0 );
drh50e5dad2001-09-15 00:57:28 +00001074 assert( pPg->dirty==0 );
drhd9b02572001-04-15 00:37:09 +00001075
drh193a6b42002-07-07 16:52:46 +00001076 /* If the page we are recyclying is marked as alwaysRollback, then
1077 ** set the global alwaysRollback flag, thus disabling the
1078 ** sqlite_dont_rollback() optimization for the rest of this transaction.
1079 ** It is necessary to do this because the page marked alwaysRollback
1080 ** might be reloaded at a later time but at that point we won't remember
1081 ** that is was marked alwaysRollback. This means that all pages must
1082 ** be marked as alwaysRollback from here on out.
1083 */
1084 if( pPg->alwaysRollback ){
1085 pPager->alwaysRollback = 1;
1086 }
1087
drhd9b02572001-04-15 00:37:09 +00001088 /* Unlink the old page from the free list and the hash table
1089 */
drh6019e162001-07-02 17:51:45 +00001090 if( pPg->pPrevFree ){
1091 pPg->pPrevFree->pNextFree = pPg->pNextFree;
drhed7c8552001-04-11 14:29:21 +00001092 }else{
drh6019e162001-07-02 17:51:45 +00001093 assert( pPager->pFirst==pPg );
1094 pPager->pFirst = pPg->pNextFree;
drhed7c8552001-04-11 14:29:21 +00001095 }
drh6019e162001-07-02 17:51:45 +00001096 if( pPg->pNextFree ){
1097 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1098 }else{
1099 assert( pPager->pLast==pPg );
1100 pPager->pLast = pPg->pPrevFree;
1101 }
1102 pPg->pNextFree = pPg->pPrevFree = 0;
drhed7c8552001-04-11 14:29:21 +00001103 if( pPg->pNextHash ){
1104 pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1105 }
1106 if( pPg->pPrevHash ){
1107 pPg->pPrevHash->pNextHash = pPg->pNextHash;
1108 }else{
drhd9b02572001-04-15 00:37:09 +00001109 h = pager_hash(pPg->pgno);
drhed7c8552001-04-11 14:29:21 +00001110 assert( pPager->aHash[h]==pPg );
1111 pPager->aHash[h] = pPg->pNextHash;
1112 }
drh6019e162001-07-02 17:51:45 +00001113 pPg->pNextHash = pPg->pPrevHash = 0;
drhd9b02572001-04-15 00:37:09 +00001114 pPager->nOvfl++;
drhed7c8552001-04-11 14:29:21 +00001115 }
1116 pPg->pgno = pgno;
drh1ab43002002-01-14 09:28:19 +00001117 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
drh6019e162001-07-02 17:51:45 +00001118 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
1119 }else{
1120 pPg->inJournal = 0;
1121 }
drh03eb96a2002-11-10 23:32:56 +00001122 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
1123 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
1124 page_add_to_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001125 }else{
drh03eb96a2002-11-10 23:32:56 +00001126 page_remove_from_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001127 }
drhed7c8552001-04-11 14:29:21 +00001128 pPg->dirty = 0;
1129 pPg->nRef = 1;
drhdd793422001-06-28 01:54:48 +00001130 REFINFO(pPg);
drhd9b02572001-04-15 00:37:09 +00001131 pPager->nRef++;
1132 h = pager_hash(pgno);
drhed7c8552001-04-11 14:29:21 +00001133 pPg->pNextHash = pPager->aHash[h];
1134 pPager->aHash[h] = pPg;
1135 if( pPg->pNextHash ){
1136 assert( pPg->pNextHash->pPrevHash==0 );
1137 pPg->pNextHash->pPrevHash = pPg;
1138 }
drh306dc212001-05-21 13:45:10 +00001139 if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
drh1ab43002002-01-14 09:28:19 +00001140 if( pPager->dbSize<(int)pgno ){
drh306dc212001-05-21 13:45:10 +00001141 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1142 }else{
drh81a20f22001-10-12 17:30:04 +00001143 int rc;
drha7fcb052001-12-14 15:09:55 +00001144 sqliteOsSeek(&pPager->fd, (pgno-1)*SQLITE_PAGE_SIZE);
1145 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
drh81a20f22001-10-12 17:30:04 +00001146 if( rc!=SQLITE_OK ){
drh28be87c2002-11-05 23:03:02 +00001147 off_t fileSize;
drh4e371ee2002-09-05 16:08:27 +00001148 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
1149 || fileSize>=pgno*SQLITE_PAGE_SIZE ){
1150 return rc;
1151 }else{
1152 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1153 }
drh81a20f22001-10-12 17:30:04 +00001154 }
drh306dc212001-05-21 13:45:10 +00001155 }
drh7e3b0a02001-04-28 16:52:40 +00001156 if( pPager->nExtra>0 ){
1157 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
1158 }
drhed7c8552001-04-11 14:29:21 +00001159 }else{
drhd9b02572001-04-15 00:37:09 +00001160 /* The requested page is in the page cache. */
drh7e3b0a02001-04-28 16:52:40 +00001161 pPager->nHit++;
drhdf0b3b02001-06-23 11:36:20 +00001162 page_ref(pPg);
drhed7c8552001-04-11 14:29:21 +00001163 }
1164 *ppPage = PGHDR_TO_DATA(pPg);
1165 return SQLITE_OK;
1166}
1167
1168/*
drh7e3b0a02001-04-28 16:52:40 +00001169** Acquire a page if it is already in the in-memory cache. Do
1170** not read the page from disk. Return a pointer to the page,
1171** or 0 if the page is not in cache.
1172**
1173** See also sqlitepager_get(). The difference between this routine
1174** and sqlitepager_get() is that _get() will go to the disk and read
1175** in the page if the page is not already in cache. This routine
drh5e00f6c2001-09-13 13:46:56 +00001176** returns NULL if the page is not in cache or if a disk I/O error
1177** has ever happened.
drh7e3b0a02001-04-28 16:52:40 +00001178*/
1179void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
1180 PgHdr *pPg;
1181
1182 /* Make sure we have not hit any critical errors.
1183 */
1184 if( pPager==0 || pgno==0 ){
1185 return 0;
1186 }
1187 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1188 return 0;
1189 }
1190 if( pPager->nRef==0 ){
1191 return 0;
1192 }
1193 pPg = pager_lookup(pPager, pgno);
1194 if( pPg==0 ) return 0;
drhdf0b3b02001-06-23 11:36:20 +00001195 page_ref(pPg);
drh7e3b0a02001-04-28 16:52:40 +00001196 return PGHDR_TO_DATA(pPg);
1197}
1198
1199/*
drhed7c8552001-04-11 14:29:21 +00001200** Release a page.
1201**
1202** If the number of references to the page drop to zero, then the
1203** page is added to the LRU list. When all references to all pages
drhd9b02572001-04-15 00:37:09 +00001204** are released, a rollback occurs and the lock on the database is
drhed7c8552001-04-11 14:29:21 +00001205** removed.
1206*/
drhd9b02572001-04-15 00:37:09 +00001207int sqlitepager_unref(void *pData){
drhed7c8552001-04-11 14:29:21 +00001208 PgHdr *pPg;
drhd9b02572001-04-15 00:37:09 +00001209
1210 /* Decrement the reference count for this page
1211 */
drhed7c8552001-04-11 14:29:21 +00001212 pPg = DATA_TO_PGHDR(pData);
1213 assert( pPg->nRef>0 );
drhed7c8552001-04-11 14:29:21 +00001214 pPg->nRef--;
drhdd793422001-06-28 01:54:48 +00001215 REFINFO(pPg);
drhd9b02572001-04-15 00:37:09 +00001216
drh72f82862001-05-24 21:06:34 +00001217 /* When the number of references to a page reach 0, call the
1218 ** destructor and add the page to the freelist.
drhd9b02572001-04-15 00:37:09 +00001219 */
drhed7c8552001-04-11 14:29:21 +00001220 if( pPg->nRef==0 ){
drh1eaa2692001-09-18 02:02:23 +00001221 Pager *pPager;
1222 pPager = pPg->pPager;
drhd9b02572001-04-15 00:37:09 +00001223 pPg->pNextFree = 0;
1224 pPg->pPrevFree = pPager->pLast;
drhed7c8552001-04-11 14:29:21 +00001225 pPager->pLast = pPg;
drhd9b02572001-04-15 00:37:09 +00001226 if( pPg->pPrevFree ){
1227 pPg->pPrevFree->pNextFree = pPg;
drhed7c8552001-04-11 14:29:21 +00001228 }else{
1229 pPager->pFirst = pPg;
1230 }
drh72f82862001-05-24 21:06:34 +00001231 if( pPager->xDestructor ){
1232 pPager->xDestructor(pData);
1233 }
drhd9b02572001-04-15 00:37:09 +00001234
1235 /* When all pages reach the freelist, drop the read lock from
1236 ** the database file.
1237 */
1238 pPager->nRef--;
1239 assert( pPager->nRef>=0 );
1240 if( pPager->nRef==0 ){
1241 pager_reset(pPager);
1242 }
drhed7c8552001-04-11 14:29:21 +00001243 }
drhd9b02572001-04-15 00:37:09 +00001244 return SQLITE_OK;
drhed7c8552001-04-11 14:29:21 +00001245}
1246
1247/*
drh4b845d72002-03-05 12:41:19 +00001248** Acquire a write-lock on the database. The lock is removed when
1249** the any of the following happen:
1250**
1251** * sqlitepager_commit() is called.
1252** * sqlitepager_rollback() is called.
1253** * sqlitepager_close() is called.
1254** * sqlitepager_unref() is called to on every outstanding page.
1255**
1256** The parameter to this routine is a pointer to any open page of the
1257** database file. Nothing changes about the page - it is used merely
1258** to acquire a pointer to the Pager structure and as proof that there
1259** is already a read-lock on the database.
1260**
1261** If the database is already write-locked, this routine is a no-op.
1262*/
1263int sqlitepager_begin(void *pData){
1264 PgHdr *pPg = DATA_TO_PGHDR(pData);
1265 Pager *pPager = pPg->pPager;
1266 int rc = SQLITE_OK;
1267 assert( pPg->nRef>0 );
1268 assert( pPager->state!=SQLITE_UNLOCK );
1269 if( pPager->state==SQLITE_READLOCK ){
1270 assert( pPager->aInJournal==0 );
1271 rc = sqliteOsWriteLock(&pPager->fd);
1272 if( rc!=SQLITE_OK ){
1273 return rc;
1274 }
1275 pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
1276 if( pPager->aInJournal==0 ){
1277 sqliteOsReadLock(&pPager->fd);
1278 return SQLITE_NOMEM;
1279 }
drh8e298f92002-07-06 16:28:47 +00001280 rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
drh4b845d72002-03-05 12:41:19 +00001281 if( rc!=SQLITE_OK ){
1282 sqliteFree(pPager->aInJournal);
1283 pPager->aInJournal = 0;
1284 sqliteOsReadLock(&pPager->fd);
1285 return SQLITE_CANTOPEN;
1286 }
1287 pPager->journalOpen = 1;
drha1680452002-04-18 01:56:57 +00001288 pPager->needSync = 0;
1289 pPager->dirtyFile = 0;
drh193a6b42002-07-07 16:52:46 +00001290 pPager->alwaysRollback = 0;
drh4b845d72002-03-05 12:41:19 +00001291 pPager->state = SQLITE_WRITELOCK;
1292 sqlitepager_pagecount(pPager);
1293 pPager->origDbSize = pPager->dbSize;
drh94f33312002-08-12 12:29:56 +00001294 if( pager_old_format ){
1295 rc = sqliteOsWrite(&pPager->jfd, aOldJournalMagic,
1296 sizeof(aOldJournalMagic));
1297 }else{
1298 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic, sizeof(aJournalMagic));
1299 }
drh4b845d72002-03-05 12:41:19 +00001300 if( rc==SQLITE_OK ){
drh94f33312002-08-12 12:29:56 +00001301 rc = write32bits(&pPager->jfd, pPager->dbSize);
drh4b845d72002-03-05 12:41:19 +00001302 }
1303 if( rc!=SQLITE_OK ){
1304 rc = pager_unwritelock(pPager);
drh4e371ee2002-09-05 16:08:27 +00001305 if( rc==SQLITE_OK ){
1306 rc = SQLITE_FULL;
1307 }
drh4b845d72002-03-05 12:41:19 +00001308 }
1309 }
1310 return rc;
1311}
1312
1313/*
drhed7c8552001-04-11 14:29:21 +00001314** Mark a data page as writeable. The page is written into the journal
1315** if it is not there already. This routine must be called before making
1316** changes to a page.
1317**
1318** The first time this routine is called, the pager creates a new
1319** journal and acquires a write lock on the database. If the write
1320** lock could not be acquired, this routine returns SQLITE_BUSY. The
drh306dc212001-05-21 13:45:10 +00001321** calling routine must check for that return value and be careful not to
drhed7c8552001-04-11 14:29:21 +00001322** change any page data until this routine returns SQLITE_OK.
drhd9b02572001-04-15 00:37:09 +00001323**
1324** If the journal file could not be written because the disk is full,
1325** then this routine returns SQLITE_FULL and does an immediate rollback.
1326** All subsequent write attempts also return SQLITE_FULL until there
1327** is a call to sqlitepager_commit() or sqlitepager_rollback() to
1328** reset.
drhed7c8552001-04-11 14:29:21 +00001329*/
drhd9b02572001-04-15 00:37:09 +00001330int sqlitepager_write(void *pData){
drh69688d52001-04-14 16:38:23 +00001331 PgHdr *pPg = DATA_TO_PGHDR(pData);
1332 Pager *pPager = pPg->pPager;
drhd79caeb2001-04-15 02:27:24 +00001333 int rc = SQLITE_OK;
drh69688d52001-04-14 16:38:23 +00001334
drh6446c4d2001-12-15 14:22:18 +00001335 /* Check for errors
1336 */
drhd9b02572001-04-15 00:37:09 +00001337 if( pPager->errMask ){
1338 return pager_errcode(pPager);
1339 }
drh5e00f6c2001-09-13 13:46:56 +00001340 if( pPager->readOnly ){
1341 return SQLITE_PERM;
1342 }
drh6446c4d2001-12-15 14:22:18 +00001343
1344 /* Mark the page as dirty. If the page has already been written
1345 ** to the journal then we can return right away.
1346 */
drhd9b02572001-04-15 00:37:09 +00001347 pPg->dirty = 1;
drh0f892532002-05-30 12:27:03 +00001348 if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
drha1680452002-04-18 01:56:57 +00001349 pPager->dirtyFile = 1;
drhfa86c412002-02-02 15:01:15 +00001350 return SQLITE_OK;
1351 }
drh6446c4d2001-12-15 14:22:18 +00001352
1353 /* If we get this far, it means that the page needs to be
drhfa86c412002-02-02 15:01:15 +00001354 ** written to the transaction journal or the ckeckpoint journal
1355 ** or both.
1356 **
1357 ** First check to see that the transaction journal exists and
1358 ** create it if it does not.
drh6446c4d2001-12-15 14:22:18 +00001359 */
drhd9b02572001-04-15 00:37:09 +00001360 assert( pPager->state!=SQLITE_UNLOCK );
drh4b845d72002-03-05 12:41:19 +00001361 rc = sqlitepager_begin(pData);
drha1680452002-04-18 01:56:57 +00001362 pPager->dirtyFile = 1;
drh4b845d72002-03-05 12:41:19 +00001363 if( rc!=SQLITE_OK ) return rc;
drhd9b02572001-04-15 00:37:09 +00001364 assert( pPager->state==SQLITE_WRITELOCK );
drh8cfbf082001-09-19 13:22:39 +00001365 assert( pPager->journalOpen );
drh6446c4d2001-12-15 14:22:18 +00001366
drhfa86c412002-02-02 15:01:15 +00001367 /* The transaction journal now exists and we have a write lock on the
1368 ** main database file. Write the current page to the transaction
1369 ** journal if it is not there already.
drh6446c4d2001-12-15 14:22:18 +00001370 */
drhfa86c412002-02-02 15:01:15 +00001371 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
drh94f33312002-08-12 12:29:56 +00001372 rc = write32bits(&pPager->jfd, pPg->pgno);
drhd9b02572001-04-15 00:37:09 +00001373 if( rc==SQLITE_OK ){
drha7fcb052001-12-14 15:09:55 +00001374 rc = sqliteOsWrite(&pPager->jfd, pData, SQLITE_PAGE_SIZE);
drhd9b02572001-04-15 00:37:09 +00001375 }
1376 if( rc!=SQLITE_OK ){
1377 sqlitepager_rollback(pPager);
1378 pPager->errMask |= PAGER_ERR_FULL;
1379 return rc;
1380 }
drh6019e162001-07-02 17:51:45 +00001381 assert( pPager->aInJournal!=0 );
1382 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh603240c2002-03-05 01:11:12 +00001383 pPager->needSync = !pPager->noSync;
drhfa86c412002-02-02 15:01:15 +00001384 pPg->inJournal = 1;
drh0f892532002-05-30 12:27:03 +00001385 if( pPager->ckptInUse ){
drhfa86c412002-02-02 15:01:15 +00001386 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001387 page_add_to_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001388 }
drh69688d52001-04-14 16:38:23 +00001389 }
drh6446c4d2001-12-15 14:22:18 +00001390
drhfa86c412002-02-02 15:01:15 +00001391 /* If the checkpoint journal is open and the page is not in it,
1392 ** then write the current page to the checkpoint journal.
drh6446c4d2001-12-15 14:22:18 +00001393 */
drh0f892532002-05-30 12:27:03 +00001394 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
drh1e336b42002-02-14 12:50:33 +00001395 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
drh94f33312002-08-12 12:29:56 +00001396 rc = write32bits(&pPager->cpfd, pPg->pgno);
drhfa86c412002-02-02 15:01:15 +00001397 if( rc==SQLITE_OK ){
1398 rc = sqliteOsWrite(&pPager->cpfd, pData, SQLITE_PAGE_SIZE);
1399 }
1400 if( rc!=SQLITE_OK ){
1401 sqlitepager_rollback(pPager);
1402 pPager->errMask |= PAGER_ERR_FULL;
1403 return rc;
1404 }
1405 assert( pPager->aInCkpt!=0 );
1406 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001407 page_add_to_ckpt_list(pPg);
drhfa86c412002-02-02 15:01:15 +00001408 }
1409
1410 /* Update the database size and return.
1411 */
drh1ab43002002-01-14 09:28:19 +00001412 if( pPager->dbSize<(int)pPg->pgno ){
drh306dc212001-05-21 13:45:10 +00001413 pPager->dbSize = pPg->pgno;
1414 }
drh69688d52001-04-14 16:38:23 +00001415 return rc;
drhed7c8552001-04-11 14:29:21 +00001416}
1417
1418/*
drhaacc5432002-01-06 17:07:40 +00001419** Return TRUE if the page given in the argument was previously passed
drh6019e162001-07-02 17:51:45 +00001420** to sqlitepager_write(). In other words, return TRUE if it is ok
1421** to change the content of the page.
1422*/
1423int sqlitepager_iswriteable(void *pData){
1424 PgHdr *pPg = DATA_TO_PGHDR(pData);
1425 return pPg->dirty;
1426}
1427
1428/*
drh30e58752002-03-02 20:41:57 +00001429** A call to this routine tells the pager that it is not necessary to
1430** write the information on page "pgno" back to the disk, even though
1431** that page might be marked as dirty.
1432**
1433** The overlying software layer calls this routine when all of the data
1434** on the given page is unused. The pager marks the page as clean so
1435** that it does not get written to disk.
1436**
1437** Tests show that this optimization, together with the
1438** sqlitepager_dont_rollback() below, more than double the speed
1439** of large INSERT operations and quadruple the speed of large DELETEs.
drh8e298f92002-07-06 16:28:47 +00001440**
1441** When this routine is called, set the alwaysRollback flag to true.
1442** Subsequent calls to sqlitepager_dont_rollback() for the same page
1443** will thereafter be ignored. This is necessary to avoid a problem
1444** where a page with data is added to the freelist during one part of
1445** a transaction then removed from the freelist during a later part
1446** of the same transaction and reused for some other purpose. When it
1447** is first added to the freelist, this routine is called. When reused,
1448** the dont_rollback() routine is called. But because the page contains
1449** critical data, we still need to be sure it gets rolled back in spite
1450** of the dont_rollback() call.
drh30e58752002-03-02 20:41:57 +00001451*/
1452void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
1453 PgHdr *pPg;
drh8e298f92002-07-06 16:28:47 +00001454
drh30e58752002-03-02 20:41:57 +00001455 pPg = pager_lookup(pPager, pgno);
drh8e298f92002-07-06 16:28:47 +00001456 pPg->alwaysRollback = 1;
drh30e58752002-03-02 20:41:57 +00001457 if( pPg && pPg->dirty ){
drh8124a302002-06-25 14:43:57 +00001458 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
1459 /* If this pages is the last page in the file and the file has grown
1460 ** during the current transaction, then do NOT mark the page as clean.
1461 ** When the database file grows, we must make sure that the last page
1462 ** gets written at least once so that the disk file will be the correct
1463 ** size. If you do not write this page and the size of the file
1464 ** on the disk ends up being too small, that can lead to database
1465 ** corruption during the next transaction.
1466 */
1467 }else{
1468 pPg->dirty = 0;
1469 }
drh30e58752002-03-02 20:41:57 +00001470 }
1471}
1472
1473/*
1474** A call to this routine tells the pager that if a rollback occurs,
1475** it is not necessary to restore the data on the given page. This
1476** means that the pager does not have to record the given page in the
1477** rollback journal.
1478*/
1479void sqlitepager_dont_rollback(void *pData){
1480 PgHdr *pPg = DATA_TO_PGHDR(pData);
1481 Pager *pPager = pPg->pPager;
1482
1483 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
drh193a6b42002-07-07 16:52:46 +00001484 if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
drh30e58752002-03-02 20:41:57 +00001485 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
1486 assert( pPager->aInJournal!=0 );
1487 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1488 pPg->inJournal = 1;
drh0f892532002-05-30 12:27:03 +00001489 if( pPager->ckptInUse ){
drh30e58752002-03-02 20:41:57 +00001490 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001491 page_add_to_ckpt_list(pPg);
drh30e58752002-03-02 20:41:57 +00001492 }
1493 }
drh0f892532002-05-30 12:27:03 +00001494 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
drh30e58752002-03-02 20:41:57 +00001495 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1496 assert( pPager->aInCkpt!=0 );
1497 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
drh03eb96a2002-11-10 23:32:56 +00001498 page_add_to_ckpt_list(pPg);
drh30e58752002-03-02 20:41:57 +00001499 }
1500}
1501
1502/*
drhed7c8552001-04-11 14:29:21 +00001503** Commit all changes to the database and release the write lock.
drhd9b02572001-04-15 00:37:09 +00001504**
1505** If the commit fails for any reason, a rollback attempt is made
1506** and an error code is returned. If the commit worked, SQLITE_OK
1507** is returned.
drhed7c8552001-04-11 14:29:21 +00001508*/
drhd9b02572001-04-15 00:37:09 +00001509int sqlitepager_commit(Pager *pPager){
drha1b351a2001-09-14 16:42:12 +00001510 int rc;
drhed7c8552001-04-11 14:29:21 +00001511 PgHdr *pPg;
drhd9b02572001-04-15 00:37:09 +00001512
1513 if( pPager->errMask==PAGER_ERR_FULL ){
1514 rc = sqlitepager_rollback(pPager);
drh4e371ee2002-09-05 16:08:27 +00001515 if( rc==SQLITE_OK ){
1516 rc = SQLITE_FULL;
1517 }
drhd9b02572001-04-15 00:37:09 +00001518 return rc;
1519 }
1520 if( pPager->errMask!=0 ){
1521 rc = pager_errcode(pPager);
1522 return rc;
1523 }
1524 if( pPager->state!=SQLITE_WRITELOCK ){
1525 return SQLITE_ERROR;
1526 }
drh8cfbf082001-09-19 13:22:39 +00001527 assert( pPager->journalOpen );
drha1680452002-04-18 01:56:57 +00001528 if( pPager->dirtyFile==0 ){
1529 /* Exit early (without doing the time-consuming sqliteOsSync() calls)
1530 ** if there have been no changes to the database file. */
1531 rc = pager_unwritelock(pPager);
1532 pPager->dbSize = -1;
1533 return rc;
1534 }
drha7fcb052001-12-14 15:09:55 +00001535 if( pPager->needSync && sqliteOsSync(&pPager->jfd)!=SQLITE_OK ){
drhd9b02572001-04-15 00:37:09 +00001536 goto commit_abort;
drhed7c8552001-04-11 14:29:21 +00001537 }
drha1b351a2001-09-14 16:42:12 +00001538 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1539 if( pPg->dirty==0 ) continue;
drha7fcb052001-12-14 15:09:55 +00001540 rc = sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*SQLITE_PAGE_SIZE);
drha1b351a2001-09-14 16:42:12 +00001541 if( rc!=SQLITE_OK ) goto commit_abort;
drha7fcb052001-12-14 15:09:55 +00001542 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
drha1b351a2001-09-14 16:42:12 +00001543 if( rc!=SQLITE_OK ) goto commit_abort;
drhed7c8552001-04-11 14:29:21 +00001544 }
drh603240c2002-03-05 01:11:12 +00001545 if( !pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK ){
1546 goto commit_abort;
1547 }
drhd9b02572001-04-15 00:37:09 +00001548 rc = pager_unwritelock(pPager);
1549 pPager->dbSize = -1;
1550 return rc;
1551
1552 /* Jump here if anything goes wrong during the commit process.
1553 */
1554commit_abort:
1555 rc = sqlitepager_rollback(pPager);
1556 if( rc==SQLITE_OK ){
1557 rc = SQLITE_FULL;
drhed7c8552001-04-11 14:29:21 +00001558 }
drhed7c8552001-04-11 14:29:21 +00001559 return rc;
1560}
1561
1562/*
1563** Rollback all changes. The database falls back to read-only mode.
1564** All in-memory cache pages revert to their original data contents.
1565** The journal is deleted.
drhd9b02572001-04-15 00:37:09 +00001566**
1567** This routine cannot fail unless some other process is not following
1568** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
1569** process is writing trash into the journal file (SQLITE_CORRUPT) or
1570** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error
1571** codes are returned for all these occasions. Otherwise,
1572** SQLITE_OK is returned.
drhed7c8552001-04-11 14:29:21 +00001573*/
drhd9b02572001-04-15 00:37:09 +00001574int sqlitepager_rollback(Pager *pPager){
drhed7c8552001-04-11 14:29:21 +00001575 int rc;
drhd9b02572001-04-15 00:37:09 +00001576 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
drh4b845d72002-03-05 12:41:19 +00001577 if( pPager->state>=SQLITE_WRITELOCK ){
1578 pager_playback(pPager);
1579 }
drhd9b02572001-04-15 00:37:09 +00001580 return pager_errcode(pPager);
drhed7c8552001-04-11 14:29:21 +00001581 }
drhd9b02572001-04-15 00:37:09 +00001582 if( pPager->state!=SQLITE_WRITELOCK ){
1583 return SQLITE_OK;
1584 }
1585 rc = pager_playback(pPager);
1586 if( rc!=SQLITE_OK ){
1587 rc = SQLITE_CORRUPT;
1588 pPager->errMask |= PAGER_ERR_CORRUPT;
1589 }
1590 pPager->dbSize = -1;
drhed7c8552001-04-11 14:29:21 +00001591 return rc;
drh98808ba2001-10-18 12:34:46 +00001592}
drhd9b02572001-04-15 00:37:09 +00001593
1594/*
drh5e00f6c2001-09-13 13:46:56 +00001595** Return TRUE if the database file is opened read-only. Return FALSE
1596** if the database is (in theory) writable.
1597*/
1598int sqlitepager_isreadonly(Pager *pPager){
drhbe0072d2001-09-13 14:46:09 +00001599 return pPager->readOnly;
drh5e00f6c2001-09-13 13:46:56 +00001600}
1601
1602/*
drhd9b02572001-04-15 00:37:09 +00001603** This routine is used for testing and analysis only.
1604*/
1605int *sqlitepager_stats(Pager *pPager){
1606 static int a[9];
1607 a[0] = pPager->nRef;
1608 a[1] = pPager->nPage;
1609 a[2] = pPager->mxPage;
1610 a[3] = pPager->dbSize;
1611 a[4] = pPager->state;
1612 a[5] = pPager->errMask;
1613 a[6] = pPager->nHit;
1614 a[7] = pPager->nMiss;
1615 a[8] = pPager->nOvfl;
1616 return a;
1617}
drhdd793422001-06-28 01:54:48 +00001618
drhfa86c412002-02-02 15:01:15 +00001619/*
1620** Set the checkpoint.
1621**
1622** This routine should be called with the transaction journal already
1623** open. A new checkpoint journal is created that can be used to rollback
drhaaab5722002-02-19 13:39:21 +00001624** changes of a single SQL command within a larger transaction.
drhfa86c412002-02-02 15:01:15 +00001625*/
1626int sqlitepager_ckpt_begin(Pager *pPager){
1627 int rc;
1628 char zTemp[SQLITE_TEMPNAME_SIZE];
1629 assert( pPager->journalOpen );
drh0f892532002-05-30 12:27:03 +00001630 assert( !pPager->ckptInUse );
drhfa86c412002-02-02 15:01:15 +00001631 pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
1632 if( pPager->aInCkpt==0 ){
1633 sqliteOsReadLock(&pPager->fd);
1634 return SQLITE_NOMEM;
1635 }
1636 rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
1637 if( rc ) goto ckpt_begin_failed;
drh663fc632002-02-02 18:49:19 +00001638 pPager->ckptSize = pPager->dbSize;
drh0f892532002-05-30 12:27:03 +00001639 if( !pPager->ckptOpen ){
1640 rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
1641 if( rc ) goto ckpt_begin_failed;
1642 pPager->ckptOpen = 1;
1643 }
1644 pPager->ckptInUse = 1;
drhfa86c412002-02-02 15:01:15 +00001645 return SQLITE_OK;
1646
1647ckpt_begin_failed:
1648 if( pPager->aInCkpt ){
1649 sqliteFree(pPager->aInCkpt);
1650 pPager->aInCkpt = 0;
1651 }
1652 return rc;
1653}
1654
1655/*
1656** Commit a checkpoint.
1657*/
1658int sqlitepager_ckpt_commit(Pager *pPager){
drh0f892532002-05-30 12:27:03 +00001659 if( pPager->ckptInUse ){
drh03eb96a2002-11-10 23:32:56 +00001660 PgHdr *pPg, *pNext;
drh96ddd6d2002-09-05 19:10:33 +00001661 sqliteOsSeek(&pPager->cpfd, 0);
drh0f892532002-05-30 12:27:03 +00001662 sqliteOsTruncate(&pPager->cpfd, 0);
1663 pPager->ckptInUse = 0;
drh663fc632002-02-02 18:49:19 +00001664 sqliteFree( pPager->aInCkpt );
1665 pPager->aInCkpt = 0;
drh03eb96a2002-11-10 23:32:56 +00001666 for(pPg=pPager->pCkpt; pPg; pPg=pNext){
1667 pNext = pPg->pNextCkpt;
1668 assert( pPg->inCkpt );
drh663fc632002-02-02 18:49:19 +00001669 pPg->inCkpt = 0;
drh03eb96a2002-11-10 23:32:56 +00001670 pPg->pPrevCkpt = pPg->pNextCkpt = 0;
drh663fc632002-02-02 18:49:19 +00001671 }
drh03eb96a2002-11-10 23:32:56 +00001672 pPager->pCkpt = 0;
drh663fc632002-02-02 18:49:19 +00001673 }
drhfa86c412002-02-02 15:01:15 +00001674 return SQLITE_OK;
1675}
1676
1677/*
1678** Rollback a checkpoint.
1679*/
1680int sqlitepager_ckpt_rollback(Pager *pPager){
1681 int rc;
drh0f892532002-05-30 12:27:03 +00001682 if( pPager->ckptInUse ){
drh663fc632002-02-02 18:49:19 +00001683 rc = pager_ckpt_playback(pPager);
1684 sqlitepager_ckpt_commit(pPager);
1685 }else{
1686 rc = SQLITE_OK;
1687 }
drhfa86c412002-02-02 15:01:15 +00001688 return rc;
1689}
1690
#ifdef SQLITE_TEST
/*
** Print a listing of all referenced pages and their ref count.
**
** Every page on the pager's pAll list whose nRef is greater than zero
** is written to standard output as one line holding the page number,
** the address of the page data, and the reference count.
*/
void sqlitepager_refdump(Pager *pPager){
  PgHdr *pPg;
  for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
    if( pPg->nRef<=0 ) continue;
    /* Print the address with %p.  Casting the pointer to int would
    ** discard the upper bits on platforms where pointers are wider
    ** than int. */
    printf("PAGE %3d addr=%p nRef=%d\n",
       (int)pPg->pgno, (void*)PGHDR_TO_DATA(pPg), pPg->nRef);
  }
}
#endif