blob: a61511ee2b682f104d781b8d9fcda962295406bb [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drhb19a2bc2001-09-16 00:13:26 +00002** 2001 September 15
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drh4be295b2003-12-16 03:44:47 +000012** $Id: btree.c,v 1.97 2003/12/16 03:44:48 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
39** key and data for any entry are combined to form the "payload". Up to
40** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the
41** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes
42** then surplus bytes are stored on overflow pages. The payload for an
43** entry and the preceding pointer are combined to form a "Cell". Each
drhb19a2bc2001-09-16 00:13:26 +000044** page has a small header which contains the Ptr(N+1) pointer.
drh8b2f49b2001-06-08 00:21:52 +000045**
46** The first page of the file contains a magic string used to verify that
47** the file really is a valid BTree database, a pointer to a list of unused
48** pages in the file, and some meta information. The root of the first
49** BTree begins on page 2 of the file. (Pages are numbered beginning with
50** 1, not 0.) Thus a minimum database contains 2 pages.
drha059ad02001-04-17 20:09:11 +000051*/
52#include "sqliteInt.h"
53#include "pager.h"
54#include "btree.h"
55#include <assert.h>
56
paulb95a8862003-04-01 21:16:41 +000057/* Forward declarations */
58static BtOps sqliteBtreeOps;
59static BtCursorOps sqliteBtreeCursorOps;
60
drh8c42ca92001-06-22 19:15:00 +000061/*
drh0d316a42002-08-11 20:10:47 +000062** Macros used for byteswapping. B is a pointer to the Btree
63** structure. This is needed to access the Btree.needSwab boolean
64** in order to tell if byte swapping is needed or not.
65** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer.
66** SWAB32 byteswaps a 32-bit integer.
67*/
drhfd9903d2003-04-25 03:13:25 +000068#define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X))
drh0d316a42002-08-11 20:10:47 +000069#define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X))
70#define SWAB_ADD(B,X,A) \
71 if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }
72
73/*
74** The following global variable - available only if SQLITE_TEST is
75** defined - is used to determine whether new databases are created in
76** native byte order or in non-native byte order. Non-native byte order
77** databases are created for testing purposes only. Under normal operation,
78** only native byte-order databases should be created, but we should be
79** able to read or write existing databases regardless of the byteorder.
80*/
81#ifdef SQLITE_TEST
82int btree_native_byte_order = 1;
drh74587e52002-08-13 00:01:16 +000083#else
84# define btree_native_byte_order 1
drh0d316a42002-08-11 20:10:47 +000085#endif
86
87/*
drh365d68f2001-05-11 11:02:46 +000088** Forward declarations of structures used only in this file.
89*/
drhbd03cae2001-06-02 02:40:57 +000090typedef struct PageOne PageOne;
drh2af926b2001-05-15 00:39:25 +000091typedef struct MemPage MemPage;
drh365d68f2001-05-11 11:02:46 +000092typedef struct PageHdr PageHdr;
93typedef struct Cell Cell;
drh3b7511c2001-05-26 13:15:44 +000094typedef struct CellHdr CellHdr;
drh365d68f2001-05-11 11:02:46 +000095typedef struct FreeBlk FreeBlk;
drh2af926b2001-05-15 00:39:25 +000096typedef struct OverflowPage OverflowPage;
drh30e58752002-03-02 20:41:57 +000097typedef struct FreelistInfo FreelistInfo;
drh2af926b2001-05-15 00:39:25 +000098
99/*
100** All structures on a database page are aligned to 4-byte boundries.
101** This routine rounds up a number of bytes to the next multiple of 4.
drh306dc212001-05-21 13:45:10 +0000102**
103** This might need to change for computer architectures that require
104** and 8-byte alignment boundry for structures.
drh2af926b2001-05-15 00:39:25 +0000105*/
106#define ROUNDUP(X) ((X+3) & ~3)
drha059ad02001-04-17 20:09:11 +0000107
drh08ed44e2001-04-29 23:32:55 +0000108/*
drhbd03cae2001-06-02 02:40:57 +0000109** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000110** SQLite database in order to identify the file as a real database.
drh08ed44e2001-04-29 23:32:55 +0000111*/
drhbd03cae2001-06-02 02:40:57 +0000112static const char zMagicHeader[] =
drh80ff32f2001-11-04 18:32:46 +0000113 "** This file contains an SQLite 2.1 database **";
drhbd03cae2001-06-02 02:40:57 +0000114#define MAGIC_SIZE (sizeof(zMagicHeader))
drh08ed44e2001-04-29 23:32:55 +0000115
116/*
drh5e00f6c2001-09-13 13:46:56 +0000117** This is a magic integer also used to test the integrity of the database
drh8c42ca92001-06-22 19:15:00 +0000118** file. This integer is used in addition to the string above so that
119** if the file is written on a little-endian architecture and read
120** on a big-endian architectures (or vice versa) we can detect the
121** problem.
122**
123** The number used was obtained at random and has no special
drhb19a2bc2001-09-16 00:13:26 +0000124** significance other than the fact that it represents a different
125** integer on little-endian and big-endian machines.
drh8c42ca92001-06-22 19:15:00 +0000126*/
127#define MAGIC 0xdae37528
128
129/*
drhbd03cae2001-06-02 02:40:57 +0000130** The first page of the database file contains a magic header string
131** to identify the file as an SQLite database file. It also contains
132** a pointer to the first free page of the file. Page 2 contains the
drh8b2f49b2001-06-08 00:21:52 +0000133** root of the principle BTree. The file might contain other BTrees
134** rooted on pages above 2.
135**
136** The first page also contains SQLITE_N_BTREE_META integers that
137** can be used by higher-level routines.
drh08ed44e2001-04-29 23:32:55 +0000138**
drhbd03cae2001-06-02 02:40:57 +0000139** Remember that pages are numbered beginning with 1. (See pager.c
140** for additional information.) Page 0 does not exist and a page
141** number of 0 is used to mean "no such page".
142*/
143struct PageOne {
144 char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */
drh8c42ca92001-06-22 19:15:00 +0000145 int iMagic; /* Integer to verify correct byte order */
146 Pgno freeList; /* First free page in a list of all free pages */
drh2aa679f2001-06-25 02:11:07 +0000147 int nFree; /* Number of pages on the free list */
148 int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */
drhbd03cae2001-06-02 02:40:57 +0000149};
150
151/*
152** Each database page has a header that is an instance of this
153** structure.
drh08ed44e2001-04-29 23:32:55 +0000154**
drh8b2f49b2001-06-08 00:21:52 +0000155** PageHdr.firstFree is 0 if there is no free space on this page.
drh14acc042001-06-10 19:56:58 +0000156** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a
drh8b2f49b2001-06-08 00:21:52 +0000157** FreeBlk structure that describes the first block of free space.
158** All free space is defined by a linked list of FreeBlk structures.
drh08ed44e2001-04-29 23:32:55 +0000159**
drh8b2f49b2001-06-08 00:21:52 +0000160** Data is stored in a linked list of Cell structures. PageHdr.firstCell
drh14acc042001-06-10 19:56:58 +0000161** is the index into MemPage.u.aDisk[] of the first cell on the page. The
drh306dc212001-05-21 13:45:10 +0000162** Cells are kept in sorted order.
drh8b2f49b2001-06-08 00:21:52 +0000163**
164** A Cell contains all information about a database entry and a pointer
165** to a child page that contains other entries less than itself. In
166** other words, the i-th Cell contains both Ptr(i) and Key(i). The
167** right-most pointer of the page is contained in PageHdr.rightChild.
drh08ed44e2001-04-29 23:32:55 +0000168*/
drh365d68f2001-05-11 11:02:46 +0000169struct PageHdr {
drh5e2f8b92001-05-28 00:41:15 +0000170 Pgno rightChild; /* Child page that comes after all cells on this page */
drh14acc042001-06-10 19:56:58 +0000171 u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */
172 u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */
drh365d68f2001-05-11 11:02:46 +0000173};
drh306dc212001-05-21 13:45:10 +0000174
drh3b7511c2001-05-26 13:15:44 +0000175/*
176** Entries on a page of the database are called "Cells". Each Cell
177** has a header and data. This structure defines the header. The
drhbd03cae2001-06-02 02:40:57 +0000178** key and data (collectively the "payload") follow this header on
179** the database page.
180**
181** A definition of the complete Cell structure is given below. The
drh8c42ca92001-06-22 19:15:00 +0000182** header for the cell must be defined first in order to do some
drhbd03cae2001-06-02 02:40:57 +0000183** of the sizing #defines that follow.
drh3b7511c2001-05-26 13:15:44 +0000184*/
185struct CellHdr {
drh5e2f8b92001-05-28 00:41:15 +0000186 Pgno leftChild; /* Child page that comes before this cell */
drh3b7511c2001-05-26 13:15:44 +0000187 u16 nKey; /* Number of bytes in the key */
drh14acc042001-06-10 19:56:58 +0000188 u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */
drh58a11682001-11-10 13:51:08 +0000189 u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */
190 u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */
drh80ff32f2001-11-04 18:32:46 +0000191 u16 nData; /* Number of bytes of data */
drh8c42ca92001-06-22 19:15:00 +0000192};
drh58a11682001-11-10 13:51:08 +0000193
194/*
195** The key and data size are split into a lower 16-bit segment and an
196** upper 8-bit segment in order to pack them together into a smaller
197** space. The following macros reassembly a key or data size back
198** into an integer.
199*/
drh0d316a42002-08-11 20:10:47 +0000200#define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536)
201#define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)
drh3b7511c2001-05-26 13:15:44 +0000202
203/*
204** The minimum size of a complete Cell. The Cell must contain a header
drhbd03cae2001-06-02 02:40:57 +0000205** and at least 4 bytes of payload.
drh3b7511c2001-05-26 13:15:44 +0000206*/
207#define MIN_CELL_SIZE (sizeof(CellHdr)+4)
208
209/*
210** The maximum number of database entries that can be held in a single
211** page of the database.
212*/
213#define MX_CELL ((SQLITE_PAGE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)
214
215/*
drh6019e162001-07-02 17:51:45 +0000216** The amount of usable space on a single page of the BTree. This is the
217** page size minus the overhead of the page header.
218*/
219#define USABLE_SPACE (SQLITE_PAGE_SIZE - sizeof(PageHdr))
220
221/*
drh8c42ca92001-06-22 19:15:00 +0000222** The maximum amount of payload (in bytes) that can be stored locally for
223** a database entry. If the entry contains more data than this, the
drh3b7511c2001-05-26 13:15:44 +0000224** extra goes onto overflow pages.
drhbd03cae2001-06-02 02:40:57 +0000225**
226** This number is chosen so that at least 4 cells will fit on every page.
drh3b7511c2001-05-26 13:15:44 +0000227*/
drh6019e162001-07-02 17:51:45 +0000228#define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)
drh3b7511c2001-05-26 13:15:44 +0000229
drh306dc212001-05-21 13:45:10 +0000230/*
231** Data on a database page is stored as a linked list of Cell structures.
drh5e2f8b92001-05-28 00:41:15 +0000232** Both the key and the data are stored in aPayload[]. The key always comes
233** first. The aPayload[] field grows as necessary to hold the key and data,
drh306dc212001-05-21 13:45:10 +0000234** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and
drh3b7511c2001-05-26 13:15:44 +0000235** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
236** page number of the first overflow page.
237**
238** Though this structure is fixed in size, the Cell on the database
drhbd03cae2001-06-02 02:40:57 +0000239** page varies in size. Every cell has a CellHdr and at least 4 bytes
drh3b7511c2001-05-26 13:15:44 +0000240** of payload space. Additional payload bytes (up to the maximum of
241** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
242** needed.
drh306dc212001-05-21 13:45:10 +0000243*/
drh365d68f2001-05-11 11:02:46 +0000244struct Cell {
drh5e2f8b92001-05-28 00:41:15 +0000245 CellHdr h; /* The cell header */
246 char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */
247 Pgno ovfl; /* The first overflow page */
drh365d68f2001-05-11 11:02:46 +0000248};
drh306dc212001-05-21 13:45:10 +0000249
250/*
251** Free space on a page is remembered using a linked list of the FreeBlk
252** structures. Space on a database page is allocated in increments of
drh72f82862001-05-24 21:06:34 +0000253** at least 4 bytes and is always aligned to a 4-byte boundry. The
drh8b2f49b2001-06-08 00:21:52 +0000254** linked list of FreeBlks is always kept in order by address.
drh306dc212001-05-21 13:45:10 +0000255*/
drh365d68f2001-05-11 11:02:46 +0000256struct FreeBlk {
drh72f82862001-05-24 21:06:34 +0000257 u16 iSize; /* Number of bytes in this block of free space */
drh14acc042001-06-10 19:56:58 +0000258 u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */
drh365d68f2001-05-11 11:02:46 +0000259};
drh306dc212001-05-21 13:45:10 +0000260
261/*
drh14acc042001-06-10 19:56:58 +0000262** The number of bytes of payload that will fit on a single overflow page.
drh3b7511c2001-05-26 13:15:44 +0000263*/
264#define OVERFLOW_SIZE (SQLITE_PAGE_SIZE-sizeof(Pgno))
265
266/*
drh306dc212001-05-21 13:45:10 +0000267** When the key and data for a single entry in the BTree will not fit in
drh8c42ca92001-06-22 19:15:00 +0000268** the MX_LOCAL_PAYLOAD bytes of space available on the database page,
drh8b2f49b2001-06-08 00:21:52 +0000269** then all extra bytes are written to a linked list of overflow pages.
drh306dc212001-05-21 13:45:10 +0000270** Each overflow page is an instance of the following structure.
271**
272** Unused pages in the database are also represented by instances of
drhbd03cae2001-06-02 02:40:57 +0000273** the OverflowPage structure. The PageOne.freeList field is the
drh306dc212001-05-21 13:45:10 +0000274** page number of the first page in a linked list of unused database
275** pages.
276*/
drh2af926b2001-05-15 00:39:25 +0000277struct OverflowPage {
drh14acc042001-06-10 19:56:58 +0000278 Pgno iNext;
drh5e2f8b92001-05-28 00:41:15 +0000279 char aPayload[OVERFLOW_SIZE];
drh7e3b0a02001-04-28 16:52:40 +0000280};
drh7e3b0a02001-04-28 16:52:40 +0000281
282/*
drh30e58752002-03-02 20:41:57 +0000283** The PageOne.freeList field points to a linked list of overflow pages
284** hold information about free pages. The aPayload section of each
285** overflow page contains an instance of the following structure. The
286** aFree[] array holds the page number of nFree unused pages in the disk
287** file.
288*/
289struct FreelistInfo {
290 int nFree;
291 Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];
292};
293
294/*
drh7e3b0a02001-04-28 16:52:40 +0000295** For every page in the database file, an instance of the following structure
drh14acc042001-06-10 19:56:58 +0000296** is stored in memory. The u.aDisk[] array contains the raw bits read from
drh6446c4d2001-12-15 14:22:18 +0000297** the disk. The rest is auxiliary information held in memory only. The
drhbd03cae2001-06-02 02:40:57 +0000298** auxiliary info is only valid for regular database pages - it is not
299** used for overflow pages and pages on the freelist.
drh306dc212001-05-21 13:45:10 +0000300**
drhbd03cae2001-06-02 02:40:57 +0000301** Of particular interest in the auxiliary info is the apCell[] entry. Each
drh14acc042001-06-10 19:56:58 +0000302** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are
drh306dc212001-05-21 13:45:10 +0000303** put in this array so that they can be accessed in constant time, rather
drhbd03cae2001-06-02 02:40:57 +0000304** than in linear time which would be needed if we had to walk the linked
305** list on every access.
drh72f82862001-05-24 21:06:34 +0000306**
drh14acc042001-06-10 19:56:58 +0000307** Note that apCell[] contains enough space to hold up to two more Cells
308** than can possibly fit on one page. In the steady state, every apCell[]
309** points to memory inside u.aDisk[]. But in the middle of an insert
310** operation, some apCell[] entries may temporarily point to data space
311** outside of u.aDisk[]. This is a transient situation that is quickly
312** resolved. But while it is happening, it is possible for a database
313** page to hold as many as two more cells than it might otherwise hold.
drh18b81e52001-11-01 13:52:52 +0000314** The extra two entries in apCell[] are an allowance for this situation.
drh14acc042001-06-10 19:56:58 +0000315**
drh72f82862001-05-24 21:06:34 +0000316** The pParent field points back to the parent page. This allows us to
317** walk up the BTree from any leaf to the root. Care must be taken to
318** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000319** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000320*/
321struct MemPage {
drh14acc042001-06-10 19:56:58 +0000322 union {
323 char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */
324 PageHdr hdr; /* Overlay page header */
325 } u;
drh428ae8c2003-01-04 16:48:09 +0000326 u8 isInit; /* True if auxiliary data is initialized */
327 u8 idxShift; /* True if apCell[] indices have changed */
328 u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */
drh72f82862001-05-24 21:06:34 +0000329 MemPage *pParent; /* The parent of this page. NULL for root */
drh428ae8c2003-01-04 16:48:09 +0000330 int idxParent; /* Index in pParent->apCell[] of this node */
drh14acc042001-06-10 19:56:58 +0000331 int nFree; /* Number of free bytes in u.aDisk[] */
drh306dc212001-05-21 13:45:10 +0000332 int nCell; /* Number of entries on this page */
drh14acc042001-06-10 19:56:58 +0000333 Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */
drh8c42ca92001-06-22 19:15:00 +0000334};
drh7e3b0a02001-04-28 16:52:40 +0000335
336/*
drh3b7511c2001-05-26 13:15:44 +0000337** The in-memory image of a disk page has the auxiliary information appended
338** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
339** that extra information.
340*/
341#define EXTRA_SIZE (sizeof(MemPage)-SQLITE_PAGE_SIZE)
342
343/*
drha059ad02001-04-17 20:09:11 +0000344** Everything we need to know about an open database
345*/
346struct Btree {
paulb95a8862003-04-01 21:16:41 +0000347 BtOps *pOps; /* Function table */
drha059ad02001-04-17 20:09:11 +0000348 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000349 BtCursor *pCursor; /* A list of all open cursors */
drhbd03cae2001-06-02 02:40:57 +0000350 PageOne *page1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000351 u8 inTrans; /* True if a transaction is in progress */
352 u8 inCkpt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000353 u8 readOnly; /* True if the underlying file is readonly */
drh0d316a42002-08-11 20:10:47 +0000354 u8 needSwab; /* Need to byte-swapping */
drha059ad02001-04-17 20:09:11 +0000355};
356typedef Btree Bt;
357
drh365d68f2001-05-11 11:02:46 +0000358/*
359** A cursor is a pointer to a particular entry in the BTree.
360** The entry is identified by its MemPage and the index in
drh5e2f8b92001-05-28 00:41:15 +0000361** MemPage.apCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000362*/
drh72f82862001-05-24 21:06:34 +0000363struct BtCursor {
paulb95a8862003-04-01 21:16:41 +0000364 BtCursorOps *pOps; /* Function table */
drh5e2f8b92001-05-28 00:41:15 +0000365 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000366 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000367 BtCursor *pShared; /* Loop of cursors with the same root page */
drh8b2f49b2001-06-08 00:21:52 +0000368 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000369 MemPage *pPage; /* Page that contains the entry */
drh8c42ca92001-06-22 19:15:00 +0000370 int idx; /* Index of the entry in pPage->apCell[] */
drhecdc7532001-09-23 02:35:53 +0000371 u8 wrFlag; /* True if writable */
drh2dcc9aa2002-12-04 13:40:25 +0000372 u8 eSkip; /* Determines if next step operation is a no-op */
drh5e2f8b92001-05-28 00:41:15 +0000373 u8 iMatch; /* compare result from last sqliteBtreeMoveto() */
drh365d68f2001-05-11 11:02:46 +0000374};
drh7e3b0a02001-04-28 16:52:40 +0000375
drha059ad02001-04-17 20:09:11 +0000376/*
drh2dcc9aa2002-12-04 13:40:25 +0000377** Legal values for BtCursor.eSkip.
378*/
379#define SKIP_NONE 0 /* Always step the cursor */
380#define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */
381#define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */
382#define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */
383
paulb95a8862003-04-01 21:16:41 +0000384/* Forward declarations */
drh144f9ea2003-04-16 01:28:16 +0000385static int fileBtreeCloseCursor(BtCursor *pCur);
paulb95a8862003-04-01 21:16:41 +0000386
drh2dcc9aa2002-12-04 13:40:25 +0000387/*
drh0d316a42002-08-11 20:10:47 +0000388** Routines for byte swapping.
389*/
390u16 swab16(u16 x){
391 return ((x & 0xff)<<8) | ((x>>8)&0xff);
392}
393u32 swab32(u32 x){
394 return ((x & 0xff)<<24) | ((x & 0xff00)<<8) |
395 ((x>>8) & 0xff00) | ((x>>24)&0xff);
396}
397
398/*
drh3b7511c2001-05-26 13:15:44 +0000399** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000400** database page. The number returned includes the Cell header,
401** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000402** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000403** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000404*/
drh0d316a42002-08-11 20:10:47 +0000405static int cellSize(Btree *pBt, Cell *pCell){
406 int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
drh3b7511c2001-05-26 13:15:44 +0000407 if( n>MX_LOCAL_PAYLOAD ){
408 n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
409 }else{
410 n = ROUNDUP(n);
411 }
412 n += sizeof(CellHdr);
413 return n;
414}
415
416/*
drh72f82862001-05-24 21:06:34 +0000417** Defragment the page given. All Cells are moved to the
418** beginning of the page and all free space is collected
419** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000420*/
drh0d316a42002-08-11 20:10:47 +0000421static void defragmentPage(Btree *pBt, MemPage *pPage){
drh14acc042001-06-10 19:56:58 +0000422 int pc, i, n;
drh2af926b2001-05-15 00:39:25 +0000423 FreeBlk *pFBlk;
424 char newPage[SQLITE_PAGE_SIZE];
425
drh6019e162001-07-02 17:51:45 +0000426 assert( sqlitepager_iswriteable(pPage) );
drh7aa128d2002-06-21 13:09:16 +0000427 assert( pPage->isInit );
drhbd03cae2001-06-02 02:40:57 +0000428 pc = sizeof(PageHdr);
drh0d316a42002-08-11 20:10:47 +0000429 pPage->u.hdr.firstCell = SWAB16(pBt, pc);
drh14acc042001-06-10 19:56:58 +0000430 memcpy(newPage, pPage->u.aDisk, pc);
drh2af926b2001-05-15 00:39:25 +0000431 for(i=0; i<pPage->nCell; i++){
drh2aa679f2001-06-25 02:11:07 +0000432 Cell *pCell = pPage->apCell[i];
drh8c42ca92001-06-22 19:15:00 +0000433
434 /* This routine should never be called on an overfull page. The
435 ** following asserts verify that constraint. */
drh7c717f72001-06-24 20:39:41 +0000436 assert( Addr(pCell) > Addr(pPage) );
437 assert( Addr(pCell) < Addr(pPage) + SQLITE_PAGE_SIZE );
drh8c42ca92001-06-22 19:15:00 +0000438
drh0d316a42002-08-11 20:10:47 +0000439 n = cellSize(pBt, pCell);
440 pCell->h.iNext = SWAB16(pBt, pc + n);
drh2af926b2001-05-15 00:39:25 +0000441 memcpy(&newPage[pc], pCell, n);
drh14acc042001-06-10 19:56:58 +0000442 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc];
drh2af926b2001-05-15 00:39:25 +0000443 pc += n;
444 }
drh72f82862001-05-24 21:06:34 +0000445 assert( pPage->nFree==SQLITE_PAGE_SIZE-pc );
drh14acc042001-06-10 19:56:58 +0000446 memcpy(pPage->u.aDisk, newPage, pc);
drh2aa679f2001-06-25 02:11:07 +0000447 if( pPage->nCell>0 ){
448 pPage->apCell[pPage->nCell-1]->h.iNext = 0;
449 }
drh8c42ca92001-06-22 19:15:00 +0000450 pFBlk = (FreeBlk*)&pPage->u.aDisk[pc];
drh0d316a42002-08-11 20:10:47 +0000451 pFBlk->iSize = SWAB16(pBt, SQLITE_PAGE_SIZE - pc);
drh2af926b2001-05-15 00:39:25 +0000452 pFBlk->iNext = 0;
drh0d316a42002-08-11 20:10:47 +0000453 pPage->u.hdr.firstFree = SWAB16(pBt, pc);
drh2af926b2001-05-15 00:39:25 +0000454 memset(&pFBlk[1], 0, SQLITE_PAGE_SIZE - pc - sizeof(FreeBlk));
drh365d68f2001-05-11 11:02:46 +0000455}
456
drha059ad02001-04-17 20:09:11 +0000457/*
drh8b2f49b2001-06-08 00:21:52 +0000458** Allocate nByte bytes of space on a page. nByte must be a
459** multiple of 4.
drhbd03cae2001-06-02 02:40:57 +0000460**
drh14acc042001-06-10 19:56:58 +0000461** Return the index into pPage->u.aDisk[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000462** the new allocation. Or return 0 if there is not enough free
463** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000464**
drh72f82862001-05-24 21:06:34 +0000465** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000466** nBytes of contiguous free space, then this routine automatically
467** calls defragementPage() to consolidate all free space before
468** allocating the new chunk.
drh7e3b0a02001-04-28 16:52:40 +0000469*/
drh0d316a42002-08-11 20:10:47 +0000470static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){
drh2af926b2001-05-15 00:39:25 +0000471 FreeBlk *p;
472 u16 *pIdx;
473 int start;
drh0d316a42002-08-11 20:10:47 +0000474 int iSize;
drh44ce7e22003-06-17 02:57:17 +0000475#ifndef NDEBUG
476 int cnt = 0;
477#endif
drh72f82862001-05-24 21:06:34 +0000478
drh6019e162001-07-02 17:51:45 +0000479 assert( sqlitepager_iswriteable(pPage) );
drh5e2f8b92001-05-28 00:41:15 +0000480 assert( nByte==ROUNDUP(nByte) );
drh7aa128d2002-06-21 13:09:16 +0000481 assert( pPage->isInit );
drh14acc042001-06-10 19:56:58 +0000482 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
483 pIdx = &pPage->u.hdr.firstFree;
drh0d316a42002-08-11 20:10:47 +0000484 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
485 while( (iSize = SWAB16(pBt, p->iSize))<nByte ){
drh8c42ca92001-06-22 19:15:00 +0000486 assert( cnt++ < SQLITE_PAGE_SIZE/4 );
drh2af926b2001-05-15 00:39:25 +0000487 if( p->iNext==0 ){
drh0d316a42002-08-11 20:10:47 +0000488 defragmentPage(pBt, pPage);
drh14acc042001-06-10 19:56:58 +0000489 pIdx = &pPage->u.hdr.firstFree;
drh2af926b2001-05-15 00:39:25 +0000490 }else{
491 pIdx = &p->iNext;
492 }
drh0d316a42002-08-11 20:10:47 +0000493 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
drh2af926b2001-05-15 00:39:25 +0000494 }
drh0d316a42002-08-11 20:10:47 +0000495 if( iSize==nByte ){
496 start = SWAB16(pBt, *pIdx);
drh2af926b2001-05-15 00:39:25 +0000497 *pIdx = p->iNext;
498 }else{
drh8c42ca92001-06-22 19:15:00 +0000499 FreeBlk *pNew;
drh0d316a42002-08-11 20:10:47 +0000500 start = SWAB16(pBt, *pIdx);
drh8c42ca92001-06-22 19:15:00 +0000501 pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte];
drh72f82862001-05-24 21:06:34 +0000502 pNew->iNext = p->iNext;
drh0d316a42002-08-11 20:10:47 +0000503 pNew->iSize = SWAB16(pBt, iSize - nByte);
504 *pIdx = SWAB16(pBt, start + nByte);
drh2af926b2001-05-15 00:39:25 +0000505 }
506 pPage->nFree -= nByte;
507 return start;
drh7e3b0a02001-04-28 16:52:40 +0000508}
509
510/*
drh14acc042001-06-10 19:56:58 +0000511** Return a section of the MemPage.u.aDisk[] to the freelist.
512** The first byte of the new free block is pPage->u.aDisk[start]
513** and the size of the block is "size" bytes. Size must be
514** a multiple of 4.
drh306dc212001-05-21 13:45:10 +0000515**
516** Most of the effort here is involved in coalesing adjacent
517** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000518*/
drh0d316a42002-08-11 20:10:47 +0000519static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){
drh2af926b2001-05-15 00:39:25 +0000520 int end = start + size;
521 u16 *pIdx, idx;
522 FreeBlk *pFBlk;
523 FreeBlk *pNew;
524 FreeBlk *pNext;
drh0d316a42002-08-11 20:10:47 +0000525 int iSize;
drh2af926b2001-05-15 00:39:25 +0000526
drh6019e162001-07-02 17:51:45 +0000527 assert( sqlitepager_iswriteable(pPage) );
drh2af926b2001-05-15 00:39:25 +0000528 assert( size == ROUNDUP(size) );
529 assert( start == ROUNDUP(start) );
drh7aa128d2002-06-21 13:09:16 +0000530 assert( pPage->isInit );
drh14acc042001-06-10 19:56:58 +0000531 pIdx = &pPage->u.hdr.firstFree;
drh0d316a42002-08-11 20:10:47 +0000532 idx = SWAB16(pBt, *pIdx);
drh2af926b2001-05-15 00:39:25 +0000533 while( idx!=0 && idx<start ){
drh14acc042001-06-10 19:56:58 +0000534 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +0000535 iSize = SWAB16(pBt, pFBlk->iSize);
536 if( idx + iSize == start ){
537 pFBlk->iSize = SWAB16(pBt, iSize + size);
538 if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){
539 pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size];
540 if( pBt->needSwab ){
drhfd9903d2003-04-25 03:13:25 +0000541 pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size);
drh0d316a42002-08-11 20:10:47 +0000542 }else{
543 pFBlk->iSize += pNext->iSize;
544 }
drh2af926b2001-05-15 00:39:25 +0000545 pFBlk->iNext = pNext->iNext;
546 }
547 pPage->nFree += size;
548 return;
549 }
550 pIdx = &pFBlk->iNext;
drh0d316a42002-08-11 20:10:47 +0000551 idx = SWAB16(pBt, *pIdx);
drh2af926b2001-05-15 00:39:25 +0000552 }
drh14acc042001-06-10 19:56:58 +0000553 pNew = (FreeBlk*)&pPage->u.aDisk[start];
drh2af926b2001-05-15 00:39:25 +0000554 if( idx != end ){
drh0d316a42002-08-11 20:10:47 +0000555 pNew->iSize = SWAB16(pBt, size);
556 pNew->iNext = SWAB16(pBt, idx);
drh2af926b2001-05-15 00:39:25 +0000557 }else{
drh14acc042001-06-10 19:56:58 +0000558 pNext = (FreeBlk*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +0000559 pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize));
drh2af926b2001-05-15 00:39:25 +0000560 pNew->iNext = pNext->iNext;
561 }
drh0d316a42002-08-11 20:10:47 +0000562 *pIdx = SWAB16(pBt, start);
drh2af926b2001-05-15 00:39:25 +0000563 pPage->nFree += size;
drh7e3b0a02001-04-28 16:52:40 +0000564}
565
566/*
567** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000568**
drhbd03cae2001-06-02 02:40:57 +0000569** The pParent parameter must be a pointer to the MemPage which
570** is the parent of the page being initialized. The root of the
drh8b2f49b2001-06-08 00:21:52 +0000571** BTree (usually page 2) has no parent and so for that page,
572** pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000573**
drh72f82862001-05-24 21:06:34 +0000574** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000575** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000576** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
577** guarantee that the page is well-formed. It only shows that
578** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000579*/
drh0d316a42002-08-11 20:10:47 +0000580static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){
drh14acc042001-06-10 19:56:58 +0000581 int idx; /* An index into pPage->u.aDisk[] */
582 Cell *pCell; /* A pointer to a Cell in pPage->u.aDisk[] */
583 FreeBlk *pFBlk; /* A pointer to a free block in pPage->u.aDisk[] */
drh5e2f8b92001-05-28 00:41:15 +0000584 int sz; /* The size of a Cell in bytes */
585 int freeSpace; /* Amount of free space on the page */
drh2af926b2001-05-15 00:39:25 +0000586
drh5e2f8b92001-05-28 00:41:15 +0000587 if( pPage->pParent ){
588 assert( pPage->pParent==pParent );
589 return SQLITE_OK;
590 }
591 if( pParent ){
592 pPage->pParent = pParent;
593 sqlitepager_ref(pParent);
594 }
595 if( pPage->isInit ) return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000596 pPage->isInit = 1;
drh7e3b0a02001-04-28 16:52:40 +0000597 pPage->nCell = 0;
drh6019e162001-07-02 17:51:45 +0000598 freeSpace = USABLE_SPACE;
drh0d316a42002-08-11 20:10:47 +0000599 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
drh7e3b0a02001-04-28 16:52:40 +0000600 while( idx!=0 ){
drh8c42ca92001-06-22 19:15:00 +0000601 if( idx>SQLITE_PAGE_SIZE-MIN_CELL_SIZE ) goto page_format_error;
drhbd03cae2001-06-02 02:40:57 +0000602 if( idx<sizeof(PageHdr) ) goto page_format_error;
drh8c42ca92001-06-22 19:15:00 +0000603 if( idx!=ROUNDUP(idx) ) goto page_format_error;
drh14acc042001-06-10 19:56:58 +0000604 pCell = (Cell*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +0000605 sz = cellSize(pBt, pCell);
drh5e2f8b92001-05-28 00:41:15 +0000606 if( idx+sz > SQLITE_PAGE_SIZE ) goto page_format_error;
607 freeSpace -= sz;
608 pPage->apCell[pPage->nCell++] = pCell;
drh0d316a42002-08-11 20:10:47 +0000609 idx = SWAB16(pBt, pCell->h.iNext);
drh2af926b2001-05-15 00:39:25 +0000610 }
611 pPage->nFree = 0;
drh0d316a42002-08-11 20:10:47 +0000612 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
drh2af926b2001-05-15 00:39:25 +0000613 while( idx!=0 ){
drh0d316a42002-08-11 20:10:47 +0000614 int iNext;
drh2af926b2001-05-15 00:39:25 +0000615 if( idx>SQLITE_PAGE_SIZE-sizeof(FreeBlk) ) goto page_format_error;
drhbd03cae2001-06-02 02:40:57 +0000616 if( idx<sizeof(PageHdr) ) goto page_format_error;
drh14acc042001-06-10 19:56:58 +0000617 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +0000618 pPage->nFree += SWAB16(pBt, pFBlk->iSize);
619 iNext = SWAB16(pBt, pFBlk->iNext);
620 if( iNext>0 && iNext <= idx ) goto page_format_error;
621 idx = iNext;
drh7e3b0a02001-04-28 16:52:40 +0000622 }
drh8b2f49b2001-06-08 00:21:52 +0000623 if( pPage->nCell==0 && pPage->nFree==0 ){
624 /* As a special case, an uninitialized root page appears to be
625 ** an empty database */
626 return SQLITE_OK;
627 }
drh5e2f8b92001-05-28 00:41:15 +0000628 if( pPage->nFree!=freeSpace ) goto page_format_error;
drh7e3b0a02001-04-28 16:52:40 +0000629 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +0000630
631page_format_error:
632 return SQLITE_CORRUPT;
drh7e3b0a02001-04-28 16:52:40 +0000633}
634
635/*
drh8b2f49b2001-06-08 00:21:52 +0000636** Set up a raw page so that it looks like a database page holding
637** no entries.
drhbd03cae2001-06-02 02:40:57 +0000638*/
drh0d316a42002-08-11 20:10:47 +0000639static void zeroPage(Btree *pBt, MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +0000640 PageHdr *pHdr;
641 FreeBlk *pFBlk;
drh6019e162001-07-02 17:51:45 +0000642 assert( sqlitepager_iswriteable(pPage) );
drhbd03cae2001-06-02 02:40:57 +0000643 memset(pPage, 0, SQLITE_PAGE_SIZE);
drh14acc042001-06-10 19:56:58 +0000644 pHdr = &pPage->u.hdr;
drhbd03cae2001-06-02 02:40:57 +0000645 pHdr->firstCell = 0;
drh0d316a42002-08-11 20:10:47 +0000646 pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr));
drhbd03cae2001-06-02 02:40:57 +0000647 pFBlk = (FreeBlk*)&pHdr[1];
648 pFBlk->iNext = 0;
drh0d316a42002-08-11 20:10:47 +0000649 pPage->nFree = SQLITE_PAGE_SIZE - sizeof(*pHdr);
650 pFBlk->iSize = SWAB16(pBt, pPage->nFree);
drh8c42ca92001-06-22 19:15:00 +0000651 pPage->nCell = 0;
652 pPage->isOverfull = 0;
drhbd03cae2001-06-02 02:40:57 +0000653}
654
655/*
drh72f82862001-05-24 21:06:34 +0000656** This routine is called when the reference count for a page
657** reaches zero. We need to unref the pParent pointer when that
658** happens.
659*/
660static void pageDestructor(void *pData){
661 MemPage *pPage = (MemPage*)pData;
662 if( pPage->pParent ){
663 MemPage *pParent = pPage->pParent;
664 pPage->pParent = 0;
665 sqlitepager_unref(pParent);
666 }
667}
668
669/*
drh306dc212001-05-21 13:45:10 +0000670** Open a new database.
671**
672** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000673** for accessing the database. We do not open the database file
674** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000675**
676** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000677** a new database with a random name is created. This randomly named
678** database file will be deleted when sqliteBtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000679*/
drh6019e162001-07-02 17:51:45 +0000680int sqliteBtreeOpen(
681 const char *zFilename, /* Name of the file containing the BTree database */
drhda47d772002-12-02 04:25:19 +0000682 int omitJournal, /* if TRUE then do not journal this file */
drh6019e162001-07-02 17:51:45 +0000683 int nCache, /* How many pages in the page cache */
684 Btree **ppBtree /* Pointer to new Btree object written here */
685){
drha059ad02001-04-17 20:09:11 +0000686 Btree *pBt;
drh8c42ca92001-06-22 19:15:00 +0000687 int rc;
drha059ad02001-04-17 20:09:11 +0000688
drhd62d3d02003-01-24 12:14:20 +0000689 /*
690 ** The following asserts make sure that structures used by the btree are
691 ** the right size. This is to guard against size changes that result
692 ** when compiling on a different architecture.
693 */
694 assert( sizeof(u32)==4 );
695 assert( sizeof(u16)==2 );
696 assert( sizeof(Pgno)==4 );
697 assert( sizeof(PageHdr)==8 );
698 assert( sizeof(CellHdr)==12 );
699 assert( sizeof(FreeBlk)==4 );
700 assert( sizeof(OverflowPage)==SQLITE_PAGE_SIZE );
701 assert( sizeof(FreelistInfo)==OVERFLOW_SIZE );
702 assert( sizeof(ptr)==sizeof(char*) );
703 assert( sizeof(uptr)==sizeof(ptr) );
704
drha059ad02001-04-17 20:09:11 +0000705 pBt = sqliteMalloc( sizeof(*pBt) );
706 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000707 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000708 return SQLITE_NOMEM;
709 }
drh6019e162001-07-02 17:51:45 +0000710 if( nCache<10 ) nCache = 10;
drhda47d772002-12-02 04:25:19 +0000711 rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
712 !omitJournal);
drha059ad02001-04-17 20:09:11 +0000713 if( rc!=SQLITE_OK ){
714 if( pBt->pPager ) sqlitepager_close(pBt->pPager);
715 sqliteFree(pBt);
716 *ppBtree = 0;
717 return rc;
718 }
drh72f82862001-05-24 21:06:34 +0000719 sqlitepager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000720 pBt->pCursor = 0;
721 pBt->page1 = 0;
drh5df72a52002-06-06 23:16:05 +0000722 pBt->readOnly = sqlitepager_isreadonly(pBt->pPager);
paulb95a8862003-04-01 21:16:41 +0000723 pBt->pOps = &sqliteBtreeOps;
drha059ad02001-04-17 20:09:11 +0000724 *ppBtree = pBt;
725 return SQLITE_OK;
726}
727
728/*
729** Close an open database and invalidate all cursors.
730*/
drh144f9ea2003-04-16 01:28:16 +0000731static int fileBtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000732 while( pBt->pCursor ){
drh144f9ea2003-04-16 01:28:16 +0000733 fileBtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000734 }
735 sqlitepager_close(pBt->pPager);
736 sqliteFree(pBt);
737 return SQLITE_OK;
738}
739
740/*
drhda47d772002-12-02 04:25:19 +0000741** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000742**
743** The maximum number of cache pages is set to the absolute
744** value of mxPage. If mxPage is negative, the pager will
745** operate asynchronously - it will not stop to do fsync()s
746** to insure data is written to the disk surface before
747** continuing. Transactions still work if synchronous is off,
748** and the database cannot be corrupted if this program
749** crashes. But if the operating system crashes or there is
750** an abrupt power failure when synchronous is off, the database
751** could be left in an inconsistent and unrecoverable state.
752** Synchronous is on by default so database corruption is not
753** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000754*/
drh144f9ea2003-04-16 01:28:16 +0000755static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){
drhf57b14a2001-09-14 18:54:08 +0000756 sqlitepager_set_cachesize(pBt->pPager, mxPage);
757 return SQLITE_OK;
758}
759
760/*
drh973b6e32003-02-12 14:09:42 +0000761** Change the way data is synced to disk in order to increase or decrease
762** how well the database resists damage due to OS crashes and power
763** failures. Level 1 is the same as asynchronous (no syncs() occur and
764** there is a high probability of damage) Level 2 is the default. There
765** is a very low but non-zero probability of damage. Level 3 reduces the
766** probability of damage to near zero but with a write performance reduction.
767*/
drh144f9ea2003-04-16 01:28:16 +0000768static int fileBtreeSetSafetyLevel(Btree *pBt, int level){
drh973b6e32003-02-12 14:09:42 +0000769 sqlitepager_set_safety_level(pBt->pPager, level);
770 return SQLITE_OK;
771}
772
773/*
drh306dc212001-05-21 13:45:10 +0000774** Get a reference to page1 of the database file. This will
775** also acquire a readlock on that file.
776**
777** SQLITE_OK is returned on success. If the file is not a
778** well-formed database file, then SQLITE_CORRUPT is returned.
779** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
780** is returned if we run out of memory. SQLITE_PROTOCOL is returned
781** if there is a locking protocol violation.
782*/
783static int lockBtree(Btree *pBt){
784 int rc;
785 if( pBt->page1 ) return SQLITE_OK;
drh8c42ca92001-06-22 19:15:00 +0000786 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1);
drh306dc212001-05-21 13:45:10 +0000787 if( rc!=SQLITE_OK ) return rc;
drh306dc212001-05-21 13:45:10 +0000788
789 /* Do some checking to help insure the file we opened really is
790 ** a valid database file.
791 */
792 if( sqlitepager_pagecount(pBt->pPager)>0 ){
drhbd03cae2001-06-02 02:40:57 +0000793 PageOne *pP1 = pBt->page1;
drh0d316a42002-08-11 20:10:47 +0000794 if( strcmp(pP1->zMagic,zMagicHeader)!=0 ||
795 (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){
drh306dc212001-05-21 13:45:10 +0000796 rc = SQLITE_CORRUPT;
drh72f82862001-05-24 21:06:34 +0000797 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +0000798 }
drh0d316a42002-08-11 20:10:47 +0000799 pBt->needSwab = pP1->iMagic!=MAGIC;
drh306dc212001-05-21 13:45:10 +0000800 }
801 return rc;
802
drh72f82862001-05-24 21:06:34 +0000803page1_init_failed:
drh306dc212001-05-21 13:45:10 +0000804 sqlitepager_unref(pBt->page1);
805 pBt->page1 = 0;
drh72f82862001-05-24 21:06:34 +0000806 return rc;
drh306dc212001-05-21 13:45:10 +0000807}
808
809/*
drhb8ca3072001-12-05 00:21:20 +0000810** If there are no outstanding cursors and we are not in the middle
811** of a transaction but there is a read lock on the database, then
812** this routine unrefs the first page of the database file which
813** has the effect of releasing the read lock.
814**
815** If there are any outstanding cursors, this routine is a no-op.
816**
817** If there is a transaction in progress, this routine is a no-op.
818*/
819static void unlockBtreeIfUnused(Btree *pBt){
820 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){
821 sqlitepager_unref(pBt->page1);
822 pBt->page1 = 0;
823 pBt->inTrans = 0;
drh663fc632002-02-02 18:49:19 +0000824 pBt->inCkpt = 0;
drhb8ca3072001-12-05 00:21:20 +0000825 }
826}
827
828/*
drh8c42ca92001-06-22 19:15:00 +0000829** Create a new database by initializing the first two pages of the
830** file.
drh8b2f49b2001-06-08 00:21:52 +0000831*/
832static int newDatabase(Btree *pBt){
833 MemPage *pRoot;
834 PageOne *pP1;
drh8c42ca92001-06-22 19:15:00 +0000835 int rc;
drh7c717f72001-06-24 20:39:41 +0000836 if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK;
drh8b2f49b2001-06-08 00:21:52 +0000837 pP1 = pBt->page1;
838 rc = sqlitepager_write(pBt->page1);
839 if( rc ) return rc;
drh8c42ca92001-06-22 19:15:00 +0000840 rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot);
drh8b2f49b2001-06-08 00:21:52 +0000841 if( rc ) return rc;
842 rc = sqlitepager_write(pRoot);
843 if( rc ){
844 sqlitepager_unref(pRoot);
845 return rc;
846 }
847 strcpy(pP1->zMagic, zMagicHeader);
drh0d316a42002-08-11 20:10:47 +0000848 if( btree_native_byte_order ){
849 pP1->iMagic = MAGIC;
850 pBt->needSwab = 0;
851 }else{
852 pP1->iMagic = swab32(MAGIC);
853 pBt->needSwab = 1;
854 }
drh0d316a42002-08-11 20:10:47 +0000855 zeroPage(pBt, pRoot);
drh8b2f49b2001-06-08 00:21:52 +0000856 sqlitepager_unref(pRoot);
857 return SQLITE_OK;
858}
859
860/*
drh72f82862001-05-24 21:06:34 +0000861** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +0000862**
863** A transaction must be started before attempting any changes
864** to the database. None of the following routines will work
865** unless a transaction is started first:
866**
867** sqliteBtreeCreateTable()
drhc6b52df2002-01-04 03:09:29 +0000868** sqliteBtreeCreateIndex()
drh8b2f49b2001-06-08 00:21:52 +0000869** sqliteBtreeClearTable()
870** sqliteBtreeDropTable()
871** sqliteBtreeInsert()
872** sqliteBtreeDelete()
873** sqliteBtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +0000874*/
drh144f9ea2003-04-16 01:28:16 +0000875static int fileBtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000876 int rc;
877 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +0000878 if( pBt->readOnly ) return SQLITE_READONLY;
drha059ad02001-04-17 20:09:11 +0000879 if( pBt->page1==0 ){
drh7e3b0a02001-04-28 16:52:40 +0000880 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +0000881 if( rc!=SQLITE_OK ){
882 return rc;
883 }
drha059ad02001-04-17 20:09:11 +0000884 }
drhf74b8d92002-09-01 23:20:45 +0000885 rc = sqlitepager_begin(pBt->page1);
886 if( rc==SQLITE_OK ){
887 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +0000888 }
drhb8ca3072001-12-05 00:21:20 +0000889 if( rc==SQLITE_OK ){
890 pBt->inTrans = 1;
drh663fc632002-02-02 18:49:19 +0000891 pBt->inCkpt = 0;
drhb8ca3072001-12-05 00:21:20 +0000892 }else{
893 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +0000894 }
drhb8ca3072001-12-05 00:21:20 +0000895 return rc;
drha059ad02001-04-17 20:09:11 +0000896}
897
898/*
drh2aa679f2001-06-25 02:11:07 +0000899** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +0000900**
901** This will release the write lock on the database file. If there
902** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +0000903*/
drh144f9ea2003-04-16 01:28:16 +0000904static int fileBtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000905 int rc;
drh5df72a52002-06-06 23:16:05 +0000906 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +0000907 pBt->inTrans = 0;
drh663fc632002-02-02 18:49:19 +0000908 pBt->inCkpt = 0;
drh5e00f6c2001-09-13 13:46:56 +0000909 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +0000910 return rc;
911}
912
913/*
drhecdc7532001-09-23 02:35:53 +0000914** Rollback the transaction in progress. All cursors will be
915** invalided by this operation. Any attempt to use a cursor
916** that was open at the beginning of this operation will result
917** in an error.
drh5e00f6c2001-09-13 13:46:56 +0000918**
919** This will release the write lock on the database file. If there
920** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +0000921*/
drh144f9ea2003-04-16 01:28:16 +0000922static int fileBtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000923 int rc;
drhecdc7532001-09-23 02:35:53 +0000924 BtCursor *pCur;
drh7c717f72001-06-24 20:39:41 +0000925 if( pBt->inTrans==0 ) return SQLITE_OK;
926 pBt->inTrans = 0;
drh663fc632002-02-02 18:49:19 +0000927 pBt->inCkpt = 0;
drh3a840692003-01-29 22:58:26 +0000928 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager);
drhecdc7532001-09-23 02:35:53 +0000929 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drh3a840692003-01-29 22:58:26 +0000930 if( pCur->pPage && pCur->pPage->isInit==0 ){
drhecdc7532001-09-23 02:35:53 +0000931 sqlitepager_unref(pCur->pPage);
932 pCur->pPage = 0;
933 }
934 }
drh5e00f6c2001-09-13 13:46:56 +0000935 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +0000936 return rc;
937}
938
939/*
drh663fc632002-02-02 18:49:19 +0000940** Set the checkpoint for the current transaction. The checkpoint serves
941** as a sub-transaction that can be rolled back independently of the
942** main transaction. You must start a transaction before starting a
943** checkpoint. The checkpoint is ended automatically if the transaction
944** commits or rolls back.
945**
946** Only one checkpoint may be active at a time. It is an error to try
947** to start a new checkpoint if another checkpoint is already active.
948*/
drh144f9ea2003-04-16 01:28:16 +0000949static int fileBtreeBeginCkpt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +0000950 int rc;
drh0d65dc02002-02-03 00:56:09 +0000951 if( !pBt->inTrans || pBt->inCkpt ){
drhf74b8d92002-09-01 23:20:45 +0000952 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +0000953 }
drh5df72a52002-06-06 23:16:05 +0000954 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager);
drh663fc632002-02-02 18:49:19 +0000955 pBt->inCkpt = 1;
956 return rc;
957}
958
959
960/*
961** Commit a checkpoint to transaction currently in progress. If no
962** checkpoint is active, this is a no-op.
963*/
drh144f9ea2003-04-16 01:28:16 +0000964static int fileBtreeCommitCkpt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +0000965 int rc;
drh5df72a52002-06-06 23:16:05 +0000966 if( pBt->inCkpt && !pBt->readOnly ){
drh663fc632002-02-02 18:49:19 +0000967 rc = sqlitepager_ckpt_commit(pBt->pPager);
968 }else{
969 rc = SQLITE_OK;
970 }
drh0d65dc02002-02-03 00:56:09 +0000971 pBt->inCkpt = 0;
drh663fc632002-02-02 18:49:19 +0000972 return rc;
973}
974
975/*
976** Rollback the checkpoint to the current transaction. If there
977** is no active checkpoint or transaction, this routine is a no-op.
978**
979** All cursors will be invalided by this operation. Any attempt
980** to use a cursor that was open at the beginning of this operation
981** will result in an error.
982*/
drh144f9ea2003-04-16 01:28:16 +0000983static int fileBtreeRollbackCkpt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +0000984 int rc;
985 BtCursor *pCur;
drh5df72a52002-06-06 23:16:05 +0000986 if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK;
drh3a840692003-01-29 22:58:26 +0000987 rc = sqlitepager_ckpt_rollback(pBt->pPager);
drh663fc632002-02-02 18:49:19 +0000988 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drh3a840692003-01-29 22:58:26 +0000989 if( pCur->pPage && pCur->pPage->isInit==0 ){
drh663fc632002-02-02 18:49:19 +0000990 sqlitepager_unref(pCur->pPage);
991 pCur->pPage = 0;
992 }
993 }
drh0d65dc02002-02-03 00:56:09 +0000994 pBt->inCkpt = 0;
drh663fc632002-02-02 18:49:19 +0000995 return rc;
996}
997
998/*
drh8b2f49b2001-06-08 00:21:52 +0000999** Create a new cursor for the BTree whose root is on the page
1000** iTable. The act of acquiring a cursor gets a read lock on
1001** the database file.
drh1bee3d72001-10-15 00:44:35 +00001002**
1003** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001004** If wrFlag==1, then the cursor can be used for reading or for
1005** writing if other conditions for writing are also met. These
1006** are the conditions that must be met in order for writing to
1007** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001008**
drhf74b8d92002-09-01 23:20:45 +00001009** 1: The cursor must have been opened with wrFlag==1
1010**
1011** 2: No other cursors may be open with wrFlag==0 on the same table
1012**
1013** 3: The database must be writable (not on read-only media)
1014**
1015** 4: There must be an active transaction.
1016**
1017** Condition 2 warrants further discussion. If any cursor is opened
1018** on a table with wrFlag==0, that prevents all other cursors from
1019** writing to that table. This is a kind of "read-lock". When a cursor
1020** is opened with wrFlag==0 it is guaranteed that the table will not
1021** change as long as the cursor is open. This allows the cursor to
1022** do a sequential scan of the table without having to worry about
1023** entries being inserted or deleted during the scan. Cursors should
1024** be opened with wrFlag==0 only if this read-lock property is needed.
1025** That is to say, cursors should be opened with wrFlag==0 only if they
1026** intend to use the sqliteBtreeNext() system call. All other cursors
1027** should be opened with wrFlag==1 even if they never really intend
1028** to write.
1029**
drh6446c4d2001-12-15 14:22:18 +00001030** No checking is done to make sure that page iTable really is the
1031** root page of a b-tree. If it is not, then the cursor acquired
1032** will not work correctly.
drha059ad02001-04-17 20:09:11 +00001033*/
drh144f9ea2003-04-16 01:28:16 +00001034static int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){
drha059ad02001-04-17 20:09:11 +00001035 int rc;
drhf74b8d92002-09-01 23:20:45 +00001036 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001037
drha059ad02001-04-17 20:09:11 +00001038 if( pBt->page1==0 ){
1039 rc = lockBtree(pBt);
1040 if( rc!=SQLITE_OK ){
1041 *ppCur = 0;
1042 return rc;
1043 }
1044 }
1045 pCur = sqliteMalloc( sizeof(*pCur) );
1046 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001047 rc = SQLITE_NOMEM;
1048 goto create_cursor_exception;
1049 }
drh8b2f49b2001-06-08 00:21:52 +00001050 pCur->pgnoRoot = (Pgno)iTable;
drh8c42ca92001-06-22 19:15:00 +00001051 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001052 if( rc!=SQLITE_OK ){
1053 goto create_cursor_exception;
1054 }
drh0d316a42002-08-11 20:10:47 +00001055 rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0);
drhbd03cae2001-06-02 02:40:57 +00001056 if( rc!=SQLITE_OK ){
1057 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001058 }
paulb95a8862003-04-01 21:16:41 +00001059 pCur->pOps = &sqliteBtreeCursorOps;
drh14acc042001-06-10 19:56:58 +00001060 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001061 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001062 pCur->idx = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001063 pCur->eSkip = SKIP_INVALID;
drha059ad02001-04-17 20:09:11 +00001064 pCur->pNext = pBt->pCursor;
1065 if( pCur->pNext ){
1066 pCur->pNext->pPrev = pCur;
1067 }
drh14acc042001-06-10 19:56:58 +00001068 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001069 pRing = pBt->pCursor;
1070 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1071 if( pRing ){
1072 pCur->pShared = pRing->pShared;
1073 pRing->pShared = pCur;
1074 }else{
1075 pCur->pShared = pCur;
1076 }
drha059ad02001-04-17 20:09:11 +00001077 pBt->pCursor = pCur;
drh2af926b2001-05-15 00:39:25 +00001078 *ppCur = pCur;
1079 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001080
1081create_cursor_exception:
1082 *ppCur = 0;
1083 if( pCur ){
1084 if( pCur->pPage ) sqlitepager_unref(pCur->pPage);
1085 sqliteFree(pCur);
1086 }
drh5e00f6c2001-09-13 13:46:56 +00001087 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001088 return rc;
drha059ad02001-04-17 20:09:11 +00001089}
1090
1091/*
drh5e00f6c2001-09-13 13:46:56 +00001092** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001093** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001094*/
drh144f9ea2003-04-16 01:28:16 +00001095static int fileBtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001096 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001097 if( pCur->pPrev ){
1098 pCur->pPrev->pNext = pCur->pNext;
1099 }else{
1100 pBt->pCursor = pCur->pNext;
1101 }
1102 if( pCur->pNext ){
1103 pCur->pNext->pPrev = pCur->pPrev;
1104 }
drhecdc7532001-09-23 02:35:53 +00001105 if( pCur->pPage ){
1106 sqlitepager_unref(pCur->pPage);
1107 }
drhf74b8d92002-09-01 23:20:45 +00001108 if( pCur->pShared!=pCur ){
1109 BtCursor *pRing = pCur->pShared;
1110 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1111 pRing->pShared = pCur->pShared;
1112 }
drh5e00f6c2001-09-13 13:46:56 +00001113 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001114 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001115 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001116}
1117
drh7e3b0a02001-04-28 16:52:40 +00001118/*
drh5e2f8b92001-05-28 00:41:15 +00001119** Make a temporary cursor by filling in the fields of pTempCur.
1120** The temporary cursor is not on the cursor list for the Btree.
1121*/
drh14acc042001-06-10 19:56:58 +00001122static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001123 memcpy(pTempCur, pCur, sizeof(*pCur));
1124 pTempCur->pNext = 0;
1125 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001126 if( pTempCur->pPage ){
1127 sqlitepager_ref(pTempCur->pPage);
1128 }
drh5e2f8b92001-05-28 00:41:15 +00001129}
1130
1131/*
drhbd03cae2001-06-02 02:40:57 +00001132** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001133** function above.
1134*/
drh14acc042001-06-10 19:56:58 +00001135static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001136 if( pCur->pPage ){
1137 sqlitepager_unref(pCur->pPage);
1138 }
drh5e2f8b92001-05-28 00:41:15 +00001139}
1140
1141/*
drhbd03cae2001-06-02 02:40:57 +00001142** Set *pSize to the number of bytes of key in the entry the
1143** cursor currently points to. Always return SQLITE_OK.
1144** Failure is not possible. If the cursor is not currently
1145** pointing to an entry (which can happen, for example, if
1146** the database is empty) then *pSize is set to 0.
drh7e3b0a02001-04-28 16:52:40 +00001147*/
drh144f9ea2003-04-16 01:28:16 +00001148static int fileBtreeKeySize(BtCursor *pCur, int *pSize){
drh2af926b2001-05-15 00:39:25 +00001149 Cell *pCell;
1150 MemPage *pPage;
1151
1152 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001153 assert( pPage!=0 );
1154 if( pCur->idx >= pPage->nCell ){
drh72f82862001-05-24 21:06:34 +00001155 *pSize = 0;
1156 }else{
drh5e2f8b92001-05-28 00:41:15 +00001157 pCell = pPage->apCell[pCur->idx];
drh0d316a42002-08-11 20:10:47 +00001158 *pSize = NKEY(pCur->pBt, pCell->h);
drh72f82862001-05-24 21:06:34 +00001159 }
1160 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001161}
drh2af926b2001-05-15 00:39:25 +00001162
drh72f82862001-05-24 21:06:34 +00001163/*
1164** Read payload information from the entry that the pCur cursor is
1165** pointing to. Begin reading the payload at "offset" and read
1166** a total of "amt" bytes. Put the result in zBuf.
1167**
1168** This routine does not make a distinction between key and data.
1169** It just reads bytes from the payload area.
1170*/
drh2af926b2001-05-15 00:39:25 +00001171static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){
drh5e2f8b92001-05-28 00:41:15 +00001172 char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001173 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001174 int rc;
drh0d316a42002-08-11 20:10:47 +00001175 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001176 assert( pCur!=0 && pCur->pPage!=0 );
drh8c42ca92001-06-22 19:15:00 +00001177 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1178 aPayload = pCur->pPage->apCell[pCur->idx]->aPayload;
drh2af926b2001-05-15 00:39:25 +00001179 if( offset<MX_LOCAL_PAYLOAD ){
1180 int a = amt;
1181 if( a+offset>MX_LOCAL_PAYLOAD ){
1182 a = MX_LOCAL_PAYLOAD - offset;
1183 }
drh5e2f8b92001-05-28 00:41:15 +00001184 memcpy(zBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001185 if( a==amt ){
1186 return SQLITE_OK;
1187 }
drh2aa679f2001-06-25 02:11:07 +00001188 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001189 zBuf += a;
1190 amt -= a;
drhdd793422001-06-28 01:54:48 +00001191 }else{
1192 offset -= MX_LOCAL_PAYLOAD;
drhbd03cae2001-06-02 02:40:57 +00001193 }
1194 if( amt>0 ){
drh0d316a42002-08-11 20:10:47 +00001195 nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl);
drh2af926b2001-05-15 00:39:25 +00001196 }
1197 while( amt>0 && nextPage ){
1198 OverflowPage *pOvfl;
drh0d316a42002-08-11 20:10:47 +00001199 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
drh2af926b2001-05-15 00:39:25 +00001200 if( rc!=0 ){
1201 return rc;
1202 }
drh0d316a42002-08-11 20:10:47 +00001203 nextPage = SWAB32(pBt, pOvfl->iNext);
drh2af926b2001-05-15 00:39:25 +00001204 if( offset<OVERFLOW_SIZE ){
1205 int a = amt;
1206 if( a + offset > OVERFLOW_SIZE ){
1207 a = OVERFLOW_SIZE - offset;
1208 }
drh5e2f8b92001-05-28 00:41:15 +00001209 memcpy(zBuf, &pOvfl->aPayload[offset], a);
drh2aa679f2001-06-25 02:11:07 +00001210 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001211 amt -= a;
1212 zBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001213 }else{
1214 offset -= OVERFLOW_SIZE;
drh2af926b2001-05-15 00:39:25 +00001215 }
1216 sqlitepager_unref(pOvfl);
1217 }
drha7fcb052001-12-14 15:09:55 +00001218 if( amt>0 ){
1219 return SQLITE_CORRUPT;
1220 }
1221 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001222}
1223
drh72f82862001-05-24 21:06:34 +00001224/*
drh5e00f6c2001-09-13 13:46:56 +00001225** Read part of the key associated with cursor pCur. A maximum
drh72f82862001-05-24 21:06:34 +00001226** of "amt" bytes will be transfered into zBuf[]. The transfer
drh5e00f6c2001-09-13 13:46:56 +00001227** begins at "offset". The number of bytes actually read is
drh8c1238a2003-01-02 14:43:55 +00001228** returned.
1229**
1230** Change: It used to be that the amount returned will be smaller
1231** than the amount requested if there are not enough bytes in the key
1232** to satisfy the request. But now, it must be the case that there
1233** is enough data available to satisfy the request. If not, an exception
1234** is raised. The change was made in an effort to boost performance
1235** by eliminating unneeded tests.
drh72f82862001-05-24 21:06:34 +00001236*/
drh144f9ea2003-04-16 01:28:16 +00001237static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){
drh72f82862001-05-24 21:06:34 +00001238 MemPage *pPage;
drha059ad02001-04-17 20:09:11 +00001239
drh8c1238a2003-01-02 14:43:55 +00001240 assert( amt>=0 );
1241 assert( offset>=0 );
1242 assert( pCur->pPage!=0 );
drh72f82862001-05-24 21:06:34 +00001243 pPage = pCur->pPage;
drh72f82862001-05-24 21:06:34 +00001244 if( pCur->idx >= pPage->nCell ){
drh5e00f6c2001-09-13 13:46:56 +00001245 return 0;
drh72f82862001-05-24 21:06:34 +00001246 }
drh8c1238a2003-01-02 14:43:55 +00001247 assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) );
drh5e00f6c2001-09-13 13:46:56 +00001248 getPayload(pCur, offset, amt, zBuf);
1249 return amt;
drh72f82862001-05-24 21:06:34 +00001250}
1251
1252/*
drhbd03cae2001-06-02 02:40:57 +00001253** Set *pSize to the number of bytes of data in the entry the
1254** cursor currently points to. Always return SQLITE_OK.
1255** Failure is not possible. If the cursor is not currently
1256** pointing to an entry (which can happen, for example, if
1257** the database is empty) then *pSize is set to 0.
drh72f82862001-05-24 21:06:34 +00001258*/
drh144f9ea2003-04-16 01:28:16 +00001259static int fileBtreeDataSize(BtCursor *pCur, int *pSize){
drh72f82862001-05-24 21:06:34 +00001260 Cell *pCell;
1261 MemPage *pPage;
1262
1263 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001264 assert( pPage!=0 );
1265 if( pCur->idx >= pPage->nCell ){
drh72f82862001-05-24 21:06:34 +00001266 *pSize = 0;
1267 }else{
drh5e2f8b92001-05-28 00:41:15 +00001268 pCell = pPage->apCell[pCur->idx];
drh0d316a42002-08-11 20:10:47 +00001269 *pSize = NDATA(pCur->pBt, pCell->h);
drh72f82862001-05-24 21:06:34 +00001270 }
1271 return SQLITE_OK;
1272}
1273
1274/*
drh5e00f6c2001-09-13 13:46:56 +00001275** Read part of the data associated with cursor pCur. A maximum
drh72f82862001-05-24 21:06:34 +00001276** of "amt" bytes will be transfered into zBuf[]. The transfer
drh5e00f6c2001-09-13 13:46:56 +00001277** begins at "offset". The number of bytes actually read is
1278** returned. The amount returned will be smaller than the
1279** amount requested if there are not enough bytes in the data
1280** to satisfy the request.
drh72f82862001-05-24 21:06:34 +00001281*/
drh144f9ea2003-04-16 01:28:16 +00001282static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){
drh72f82862001-05-24 21:06:34 +00001283 Cell *pCell;
1284 MemPage *pPage;
1285
drh8c1238a2003-01-02 14:43:55 +00001286 assert( amt>=0 );
1287 assert( offset>=0 );
1288 assert( pCur->pPage!=0 );
drh72f82862001-05-24 21:06:34 +00001289 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001290 if( pCur->idx >= pPage->nCell ){
drh5e00f6c2001-09-13 13:46:56 +00001291 return 0;
drh72f82862001-05-24 21:06:34 +00001292 }
drh5e2f8b92001-05-28 00:41:15 +00001293 pCell = pPage->apCell[pCur->idx];
drh8c1238a2003-01-02 14:43:55 +00001294 assert( amt+offset <= NDATA(pCur->pBt, pCell->h) );
drh0d316a42002-08-11 20:10:47 +00001295 getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf);
drh5e00f6c2001-09-13 13:46:56 +00001296 return amt;
drh72f82862001-05-24 21:06:34 +00001297}
drha059ad02001-04-17 20:09:11 +00001298
drh2af926b2001-05-15 00:39:25 +00001299/*
drh8721ce42001-11-07 14:22:00 +00001300** Compare an external key against the key on the entry that pCur points to.
1301**
1302** The external key is pKey and is nKey bytes long. The last nIgnore bytes
1303** of the key associated with pCur are ignored, as if they do not exist.
1304** (The normal case is for nIgnore to be zero in which case the entire
1305** internal key is used in the comparison.)
1306**
1307** The comparison result is written to *pRes as follows:
drh2af926b2001-05-15 00:39:25 +00001308**
drh717e6402001-09-27 03:22:32 +00001309** *pRes<0 This means pCur<pKey
1310**
1311** *pRes==0 This means pCur==pKey for all nKey bytes
1312**
1313** *pRes>0 This means pCur>pKey
1314**
drh8721ce42001-11-07 14:22:00 +00001315** When one key is an exact prefix of the other, the shorter key is
1316** considered less than the longer one. In order to be equal the
1317** keys must be exactly the same length. (The length of the pCur key
1318** is the actual key length minus nIgnore bytes.)
drh2af926b2001-05-15 00:39:25 +00001319*/
drh144f9ea2003-04-16 01:28:16 +00001320static int fileBtreeKeyCompare(
drh8721ce42001-11-07 14:22:00 +00001321 BtCursor *pCur, /* Pointer to entry to compare against */
1322 const void *pKey, /* Key to compare against entry that pCur points to */
1323 int nKey, /* Number of bytes in pKey */
1324 int nIgnore, /* Ignore this many bytes at the end of pCur */
1325 int *pResult /* Write the result here */
drh5c4d9702001-08-20 00:33:58 +00001326){
drh2af926b2001-05-15 00:39:25 +00001327 Pgno nextPage;
drh8721ce42001-11-07 14:22:00 +00001328 int n, c, rc, nLocal;
drh2af926b2001-05-15 00:39:25 +00001329 Cell *pCell;
drh0d316a42002-08-11 20:10:47 +00001330 Btree *pBt = pCur->pBt;
drh717e6402001-09-27 03:22:32 +00001331 const char *zKey = (const char*)pKey;
drh2af926b2001-05-15 00:39:25 +00001332
1333 assert( pCur->pPage );
1334 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drhbd03cae2001-06-02 02:40:57 +00001335 pCell = pCur->pPage->apCell[pCur->idx];
drh0d316a42002-08-11 20:10:47 +00001336 nLocal = NKEY(pBt, pCell->h) - nIgnore;
drh8721ce42001-11-07 14:22:00 +00001337 if( nLocal<0 ) nLocal = 0;
1338 n = nKey<nLocal ? nKey : nLocal;
drh2af926b2001-05-15 00:39:25 +00001339 if( n>MX_LOCAL_PAYLOAD ){
1340 n = MX_LOCAL_PAYLOAD;
1341 }
drh717e6402001-09-27 03:22:32 +00001342 c = memcmp(pCell->aPayload, zKey, n);
drh2af926b2001-05-15 00:39:25 +00001343 if( c!=0 ){
1344 *pResult = c;
1345 return SQLITE_OK;
1346 }
drh717e6402001-09-27 03:22:32 +00001347 zKey += n;
drh2af926b2001-05-15 00:39:25 +00001348 nKey -= n;
drh8721ce42001-11-07 14:22:00 +00001349 nLocal -= n;
drh0d316a42002-08-11 20:10:47 +00001350 nextPage = SWAB32(pBt, pCell->ovfl);
drh8721ce42001-11-07 14:22:00 +00001351 while( nKey>0 && nLocal>0 ){
drh2af926b2001-05-15 00:39:25 +00001352 OverflowPage *pOvfl;
1353 if( nextPage==0 ){
1354 return SQLITE_CORRUPT;
1355 }
drh0d316a42002-08-11 20:10:47 +00001356 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
drh72f82862001-05-24 21:06:34 +00001357 if( rc ){
drh2af926b2001-05-15 00:39:25 +00001358 return rc;
1359 }
drh0d316a42002-08-11 20:10:47 +00001360 nextPage = SWAB32(pBt, pOvfl->iNext);
drh8721ce42001-11-07 14:22:00 +00001361 n = nKey<nLocal ? nKey : nLocal;
drh2af926b2001-05-15 00:39:25 +00001362 if( n>OVERFLOW_SIZE ){
1363 n = OVERFLOW_SIZE;
1364 }
drh717e6402001-09-27 03:22:32 +00001365 c = memcmp(pOvfl->aPayload, zKey, n);
drh2af926b2001-05-15 00:39:25 +00001366 sqlitepager_unref(pOvfl);
1367 if( c!=0 ){
1368 *pResult = c;
1369 return SQLITE_OK;
1370 }
1371 nKey -= n;
drh8721ce42001-11-07 14:22:00 +00001372 nLocal -= n;
drh717e6402001-09-27 03:22:32 +00001373 zKey += n;
drh2af926b2001-05-15 00:39:25 +00001374 }
drh717e6402001-09-27 03:22:32 +00001375 if( c==0 ){
drh8721ce42001-11-07 14:22:00 +00001376 c = nLocal - nKey;
drh717e6402001-09-27 03:22:32 +00001377 }
drh2af926b2001-05-15 00:39:25 +00001378 *pResult = c;
1379 return SQLITE_OK;
1380}
1381
drh72f82862001-05-24 21:06:34 +00001382/*
drh8178a752003-01-05 21:41:40 +00001383** Move the cursor down to a new child page. The newPgno argument is the
1384** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001385*/
drh5e2f8b92001-05-28 00:41:15 +00001386static int moveToChild(BtCursor *pCur, int newPgno){
drh72f82862001-05-24 21:06:34 +00001387 int rc;
1388 MemPage *pNewPage;
drh0d316a42002-08-11 20:10:47 +00001389 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001390
drh8178a752003-01-05 21:41:40 +00001391 newPgno = SWAB32(pBt, newPgno);
drh0d316a42002-08-11 20:10:47 +00001392 rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage);
drh6019e162001-07-02 17:51:45 +00001393 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00001394 rc = initPage(pBt, pNewPage, newPgno, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001395 if( rc ) return rc;
drh428ae8c2003-01-04 16:48:09 +00001396 assert( pCur->idx>=pCur->pPage->nCell
1397 || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) );
1398 assert( pCur->idx<pCur->pPage->nCell
1399 || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) );
1400 pNewPage->idxParent = pCur->idx;
1401 pCur->pPage->idxShift = 0;
drh72f82862001-05-24 21:06:34 +00001402 sqlitepager_unref(pCur->pPage);
1403 pCur->pPage = pNewPage;
1404 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001405 if( pNewPage->nCell<1 ){
1406 return SQLITE_CORRUPT;
1407 }
drh72f82862001-05-24 21:06:34 +00001408 return SQLITE_OK;
1409}
1410
1411/*
drh5e2f8b92001-05-28 00:41:15 +00001412** Move the cursor up to the parent page.
1413**
1414** pCur->idx is set to the cell index that contains the pointer
1415** to the page we are coming from. If we are coming from the
1416** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001417** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001418*/
drh8178a752003-01-05 21:41:40 +00001419static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001420 Pgno oldPgno;
1421 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001422 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001423 int idxParent;
drh8178a752003-01-05 21:41:40 +00001424 pPage = pCur->pPage;
1425 assert( pPage!=0 );
1426 pParent = pPage->pParent;
1427 assert( pParent!=0 );
1428 idxParent = pPage->idxParent;
drh72f82862001-05-24 21:06:34 +00001429 sqlitepager_ref(pParent);
drh8178a752003-01-05 21:41:40 +00001430 sqlitepager_unref(pPage);
drh72f82862001-05-24 21:06:34 +00001431 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001432 assert( pParent->idxShift==0 );
1433 if( pParent->idxShift==0 ){
1434 pCur->idx = idxParent;
1435#ifndef NDEBUG
1436 /* Verify that pCur->idx is the correct index to point back to the child
1437 ** page we just came from
1438 */
drh8178a752003-01-05 21:41:40 +00001439 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
drh428ae8c2003-01-04 16:48:09 +00001440 if( pCur->idx<pParent->nCell ){
1441 assert( pParent->apCell[idxParent]->h.leftChild==oldPgno );
1442 }else{
1443 assert( pParent->u.hdr.rightChild==oldPgno );
1444 }
1445#endif
1446 }else{
1447 /* The MemPage.idxShift flag indicates that cell indices might have
1448 ** changed since idxParent was set and hence idxParent might be out
1449 ** of date. So recompute the parent cell index by scanning all cells
1450 ** and locating the one that points to the child we just came from.
1451 */
1452 int i;
1453 pCur->idx = pParent->nCell;
drh8178a752003-01-05 21:41:40 +00001454 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
drh428ae8c2003-01-04 16:48:09 +00001455 for(i=0; i<pParent->nCell; i++){
1456 if( pParent->apCell[i]->h.leftChild==oldPgno ){
1457 pCur->idx = i;
1458 break;
1459 }
drh72f82862001-05-24 21:06:34 +00001460 }
1461 }
1462}
1463
1464/*
1465** Move the cursor to the root page
1466*/
drh5e2f8b92001-05-28 00:41:15 +00001467static int moveToRoot(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001468 MemPage *pNew;
drhbd03cae2001-06-02 02:40:57 +00001469 int rc;
drh0d316a42002-08-11 20:10:47 +00001470 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001471
drh0d316a42002-08-11 20:10:47 +00001472 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew);
drhbd03cae2001-06-02 02:40:57 +00001473 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00001474 rc = initPage(pBt, pNew, pCur->pgnoRoot, 0);
drh6019e162001-07-02 17:51:45 +00001475 if( rc ) return rc;
drh72f82862001-05-24 21:06:34 +00001476 sqlitepager_unref(pCur->pPage);
1477 pCur->pPage = pNew;
1478 pCur->idx = 0;
1479 return SQLITE_OK;
1480}
drh2af926b2001-05-15 00:39:25 +00001481
drh5e2f8b92001-05-28 00:41:15 +00001482/*
1483** Move the cursor down to the left-most leaf entry beneath the
1484** entry to which it is currently pointing.
1485*/
1486static int moveToLeftmost(BtCursor *pCur){
1487 Pgno pgno;
1488 int rc;
1489
1490 while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
drh8178a752003-01-05 21:41:40 +00001491 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001492 if( rc ) return rc;
1493 }
1494 return SQLITE_OK;
1495}
1496
drh2dcc9aa2002-12-04 13:40:25 +00001497/*
1498** Move the cursor down to the right-most leaf entry beneath the
1499** page to which it is currently pointing. Notice the difference
1500** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1501** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1502** finds the right-most entry beneath the *page*.
1503*/
1504static int moveToRightmost(BtCursor *pCur){
1505 Pgno pgno;
1506 int rc;
1507
1508 while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){
drh428ae8c2003-01-04 16:48:09 +00001509 pCur->idx = pCur->pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001510 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001511 if( rc ) return rc;
1512 }
1513 pCur->idx = pCur->pPage->nCell - 1;
1514 return SQLITE_OK;
1515}
1516
drh5e00f6c2001-09-13 13:46:56 +00001517/* Move the cursor to the first entry in the table. Return SQLITE_OK
1518** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001519** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001520*/
drh144f9ea2003-04-16 01:28:16 +00001521static int fileBtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001522 int rc;
drhecdc7532001-09-23 02:35:53 +00001523 if( pCur->pPage==0 ) return SQLITE_ABORT;
drh5e00f6c2001-09-13 13:46:56 +00001524 rc = moveToRoot(pCur);
1525 if( rc ) return rc;
1526 if( pCur->pPage->nCell==0 ){
1527 *pRes = 1;
1528 return SQLITE_OK;
1529 }
1530 *pRes = 0;
1531 rc = moveToLeftmost(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00001532 pCur->eSkip = SKIP_NONE;
drh5e00f6c2001-09-13 13:46:56 +00001533 return rc;
1534}
drh5e2f8b92001-05-28 00:41:15 +00001535
drh9562b552002-02-19 15:00:07 +00001536/* Move the cursor to the last entry in the table. Return SQLITE_OK
1537** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001538** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001539*/
drh144f9ea2003-04-16 01:28:16 +00001540static int fileBtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001541 int rc;
drh9562b552002-02-19 15:00:07 +00001542 if( pCur->pPage==0 ) return SQLITE_ABORT;
1543 rc = moveToRoot(pCur);
1544 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00001545 assert( pCur->pPage->isInit );
drh9562b552002-02-19 15:00:07 +00001546 if( pCur->pPage->nCell==0 ){
1547 *pRes = 1;
1548 return SQLITE_OK;
1549 }
1550 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001551 rc = moveToRightmost(pCur);
1552 pCur->eSkip = SKIP_NONE;
drh9562b552002-02-19 15:00:07 +00001553 return rc;
1554}
1555
drha059ad02001-04-17 20:09:11 +00001556/* Move the cursor so that it points to an entry near pKey.
drh72f82862001-05-24 21:06:34 +00001557** Return a success code.
1558**
drh5e2f8b92001-05-28 00:41:15 +00001559** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001560** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001561** were present. The cursor might point to an entry that comes
1562** before or after the key.
1563**
drhbd03cae2001-06-02 02:40:57 +00001564** The result of comparing the key with the entry to which the
1565** cursor is left pointing is stored in pCur->iMatch. The same
1566** value is also written to *pRes if pRes!=NULL. The meaning of
1567** this value is as follows:
1568**
1569** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001570** is smaller than pKey or if the table is empty
1571** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001572**
1573** *pRes==0 The cursor is left pointing at an entry that
1574** exactly matches pKey.
1575**
1576** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001577** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001578*/
drh144f9ea2003-04-16 01:28:16 +00001579static
1580int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001581 int rc;
drhecdc7532001-09-23 02:35:53 +00001582 if( pCur->pPage==0 ) return SQLITE_ABORT;
drh2dcc9aa2002-12-04 13:40:25 +00001583 pCur->eSkip = SKIP_NONE;
drh5e2f8b92001-05-28 00:41:15 +00001584 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001585 if( rc ) return rc;
1586 for(;;){
1587 int lwr, upr;
1588 Pgno chldPg;
1589 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001590 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001591 lwr = 0;
1592 upr = pPage->nCell-1;
1593 while( lwr<=upr ){
drh72f82862001-05-24 21:06:34 +00001594 pCur->idx = (lwr+upr)/2;
drh144f9ea2003-04-16 01:28:16 +00001595 rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c);
drh72f82862001-05-24 21:06:34 +00001596 if( rc ) return rc;
1597 if( c==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001598 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001599 if( pRes ) *pRes = 0;
1600 return SQLITE_OK;
1601 }
1602 if( c<0 ){
1603 lwr = pCur->idx+1;
1604 }else{
1605 upr = pCur->idx-1;
1606 }
1607 }
1608 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00001609 assert( pPage->isInit );
drh72f82862001-05-24 21:06:34 +00001610 if( lwr>=pPage->nCell ){
drh14acc042001-06-10 19:56:58 +00001611 chldPg = pPage->u.hdr.rightChild;
drh72f82862001-05-24 21:06:34 +00001612 }else{
drh5e2f8b92001-05-28 00:41:15 +00001613 chldPg = pPage->apCell[lwr]->h.leftChild;
drh72f82862001-05-24 21:06:34 +00001614 }
1615 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001616 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001617 if( pRes ) *pRes = c;
1618 return SQLITE_OK;
1619 }
drh428ae8c2003-01-04 16:48:09 +00001620 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00001621 rc = moveToChild(pCur, chldPg);
drh72f82862001-05-24 21:06:34 +00001622 if( rc ) return rc;
1623 }
drhbd03cae2001-06-02 02:40:57 +00001624 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00001625}
1626
1627/*
drhbd03cae2001-06-02 02:40:57 +00001628** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00001629** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00001630** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00001631** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00001632*/
drh144f9ea2003-04-16 01:28:16 +00001633static int fileBtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00001634 int rc;
drh8178a752003-01-05 21:41:40 +00001635 MemPage *pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001636 assert( pRes!=0 );
drh8178a752003-01-05 21:41:40 +00001637 if( pPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00001638 *pRes = 1;
drhecdc7532001-09-23 02:35:53 +00001639 return SQLITE_ABORT;
1640 }
drh8178a752003-01-05 21:41:40 +00001641 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00001642 assert( pCur->eSkip!=SKIP_INVALID );
drh8178a752003-01-05 21:41:40 +00001643 if( pPage->nCell==0 ){
drh8c1238a2003-01-02 14:43:55 +00001644 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001645 return SQLITE_OK;
1646 }
drh8178a752003-01-05 21:41:40 +00001647 assert( pCur->idx<pPage->nCell );
drh2dcc9aa2002-12-04 13:40:25 +00001648 if( pCur->eSkip==SKIP_NEXT ){
1649 pCur->eSkip = SKIP_NONE;
drh8c1238a2003-01-02 14:43:55 +00001650 *pRes = 0;
drh72f82862001-05-24 21:06:34 +00001651 return SQLITE_OK;
1652 }
drh2dcc9aa2002-12-04 13:40:25 +00001653 pCur->eSkip = SKIP_NONE;
drh72f82862001-05-24 21:06:34 +00001654 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00001655 if( pCur->idx>=pPage->nCell ){
1656 if( pPage->u.hdr.rightChild ){
1657 rc = moveToChild(pCur, pPage->u.hdr.rightChild);
drh5e2f8b92001-05-28 00:41:15 +00001658 if( rc ) return rc;
1659 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001660 *pRes = 0;
1661 return rc;
drh72f82862001-05-24 21:06:34 +00001662 }
drh5e2f8b92001-05-28 00:41:15 +00001663 do{
drh8178a752003-01-05 21:41:40 +00001664 if( pPage->pParent==0 ){
drh8c1238a2003-01-02 14:43:55 +00001665 *pRes = 1;
drh5e2f8b92001-05-28 00:41:15 +00001666 return SQLITE_OK;
1667 }
drh8178a752003-01-05 21:41:40 +00001668 moveToParent(pCur);
1669 pPage = pCur->pPage;
1670 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00001671 *pRes = 0;
drh8178a752003-01-05 21:41:40 +00001672 return SQLITE_OK;
1673 }
1674 *pRes = 0;
1675 if( pPage->u.hdr.rightChild==0 ){
1676 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00001677 }
drh5e2f8b92001-05-28 00:41:15 +00001678 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001679 return rc;
drh72f82862001-05-24 21:06:34 +00001680}
1681
drh3b7511c2001-05-26 13:15:44 +00001682/*
drh2dcc9aa2002-12-04 13:40:25 +00001683** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00001684** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00001685** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00001686** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00001687*/
drh144f9ea2003-04-16 01:28:16 +00001688static int fileBtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00001689 int rc;
1690 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00001691 MemPage *pPage;
1692 pPage = pCur->pPage;
1693 if( pPage==0 ){
1694 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001695 return SQLITE_ABORT;
1696 }
drh8178a752003-01-05 21:41:40 +00001697 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00001698 assert( pCur->eSkip!=SKIP_INVALID );
drh8178a752003-01-05 21:41:40 +00001699 if( pPage->nCell==0 ){
1700 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001701 return SQLITE_OK;
1702 }
1703 if( pCur->eSkip==SKIP_PREV ){
1704 pCur->eSkip = SKIP_NONE;
drh8178a752003-01-05 21:41:40 +00001705 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001706 return SQLITE_OK;
1707 }
1708 pCur->eSkip = SKIP_NONE;
1709 assert( pCur->idx>=0 );
drh8178a752003-01-05 21:41:40 +00001710 if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1711 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001712 if( rc ) return rc;
1713 rc = moveToRightmost(pCur);
1714 }else{
1715 while( pCur->idx==0 ){
drh8178a752003-01-05 21:41:40 +00001716 if( pPage->pParent==0 ){
drh2dcc9aa2002-12-04 13:40:25 +00001717 if( pRes ) *pRes = 1;
1718 return SQLITE_OK;
1719 }
drh8178a752003-01-05 21:41:40 +00001720 moveToParent(pCur);
1721 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001722 }
1723 pCur->idx--;
1724 rc = SQLITE_OK;
1725 }
drh8178a752003-01-05 21:41:40 +00001726 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001727 return rc;
1728}
1729
1730/*
drh3b7511c2001-05-26 13:15:44 +00001731** Allocate a new page from the database file.
1732**
1733** The new page is marked as dirty. (In other words, sqlitepager_write()
1734** has already been called on the new page.) The new page has also
1735** been referenced and the calling routine is responsible for calling
1736** sqlitepager_unref() on the new page when it is done.
1737**
1738** SQLITE_OK is returned on success. Any other return value indicates
1739** an error. *ppPage and *pPgno are undefined in the event of an error.
1740** Do not invoke sqlitepager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00001741**
drh199e3cf2002-07-18 11:01:47 +00001742** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1743** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00001744** attempt to keep related pages close to each other in the database file,
1745** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00001746*/
drh199e3cf2002-07-18 11:01:47 +00001747static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drhbd03cae2001-06-02 02:40:57 +00001748 PageOne *pPage1 = pBt->page1;
drh8c42ca92001-06-22 19:15:00 +00001749 int rc;
drh3b7511c2001-05-26 13:15:44 +00001750 if( pPage1->freeList ){
1751 OverflowPage *pOvfl;
drh30e58752002-03-02 20:41:57 +00001752 FreelistInfo *pInfo;
1753
drh3b7511c2001-05-26 13:15:44 +00001754 rc = sqlitepager_write(pPage1);
1755 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00001756 SWAB_ADD(pBt, pPage1->nFree, -1);
1757 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1758 (void**)&pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001759 if( rc ) return rc;
1760 rc = sqlitepager_write(pOvfl);
1761 if( rc ){
1762 sqlitepager_unref(pOvfl);
1763 return rc;
1764 }
drh30e58752002-03-02 20:41:57 +00001765 pInfo = (FreelistInfo*)pOvfl->aPayload;
1766 if( pInfo->nFree==0 ){
drh0d316a42002-08-11 20:10:47 +00001767 *pPgno = SWAB32(pBt, pPage1->freeList);
drh30e58752002-03-02 20:41:57 +00001768 pPage1->freeList = pOvfl->iNext;
1769 *ppPage = (MemPage*)pOvfl;
1770 }else{
drh0d316a42002-08-11 20:10:47 +00001771 int closest, n;
1772 n = SWAB32(pBt, pInfo->nFree);
1773 if( n>1 && nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00001774 int i, dist;
1775 closest = 0;
drh0d316a42002-08-11 20:10:47 +00001776 dist = SWAB32(pBt, pInfo->aFree[0]) - nearby;
drhbea00b92002-07-08 10:59:50 +00001777 if( dist<0 ) dist = -dist;
drh0d316a42002-08-11 20:10:47 +00001778 for(i=1; i<n; i++){
1779 int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby;
drhbea00b92002-07-08 10:59:50 +00001780 if( d2<0 ) d2 = -d2;
1781 if( d2<dist ) closest = i;
1782 }
1783 }else{
1784 closest = 0;
1785 }
drh0d316a42002-08-11 20:10:47 +00001786 SWAB_ADD(pBt, pInfo->nFree, -1);
1787 *pPgno = SWAB32(pBt, pInfo->aFree[closest]);
1788 pInfo->aFree[closest] = pInfo->aFree[n-1];
drh30e58752002-03-02 20:41:57 +00001789 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1790 sqlitepager_unref(pOvfl);
1791 if( rc==SQLITE_OK ){
1792 sqlitepager_dont_rollback(*ppPage);
1793 rc = sqlitepager_write(*ppPage);
1794 }
1795 }
drh3b7511c2001-05-26 13:15:44 +00001796 }else{
drh2aa679f2001-06-25 02:11:07 +00001797 *pPgno = sqlitepager_pagecount(pBt->pPager) + 1;
drh8c42ca92001-06-22 19:15:00 +00001798 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
drh3b7511c2001-05-26 13:15:44 +00001799 if( rc ) return rc;
1800 rc = sqlitepager_write(*ppPage);
1801 }
1802 return rc;
1803}
1804
1805/*
1806** Add a page of the database file to the freelist. Either pgno or
1807** pPage but not both may be 0.
drh5e2f8b92001-05-28 00:41:15 +00001808**
drhdd793422001-06-28 01:54:48 +00001809** sqlitepager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00001810*/
1811static int freePage(Btree *pBt, void *pPage, Pgno pgno){
drhbd03cae2001-06-02 02:40:57 +00001812 PageOne *pPage1 = pBt->page1;
drh3b7511c2001-05-26 13:15:44 +00001813 OverflowPage *pOvfl = (OverflowPage*)pPage;
1814 int rc;
drhdd793422001-06-28 01:54:48 +00001815 int needUnref = 0;
1816 MemPage *pMemPage;
drh8b2f49b2001-06-08 00:21:52 +00001817
drh3b7511c2001-05-26 13:15:44 +00001818 if( pgno==0 ){
1819 assert( pOvfl!=0 );
1820 pgno = sqlitepager_pagenumber(pOvfl);
1821 }
drh2aa679f2001-06-25 02:11:07 +00001822 assert( pgno>2 );
drhda47d772002-12-02 04:25:19 +00001823 assert( sqlitepager_pagenumber(pOvfl)==pgno );
drh193a6b42002-07-07 16:52:46 +00001824 pMemPage = (MemPage*)pPage;
1825 pMemPage->isInit = 0;
1826 if( pMemPage->pParent ){
1827 sqlitepager_unref(pMemPage->pParent);
1828 pMemPage->pParent = 0;
1829 }
drh3b7511c2001-05-26 13:15:44 +00001830 rc = sqlitepager_write(pPage1);
1831 if( rc ){
1832 return rc;
1833 }
drh0d316a42002-08-11 20:10:47 +00001834 SWAB_ADD(pBt, pPage1->nFree, 1);
1835 if( pPage1->nFree!=0 && pPage1->freeList!=0 ){
drh30e58752002-03-02 20:41:57 +00001836 OverflowPage *pFreeIdx;
drh0d316a42002-08-11 20:10:47 +00001837 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1838 (void**)&pFreeIdx);
drh30e58752002-03-02 20:41:57 +00001839 if( rc==SQLITE_OK ){
1840 FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload;
drh0d316a42002-08-11 20:10:47 +00001841 int n = SWAB32(pBt, pInfo->nFree);
1842 if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){
drh30e58752002-03-02 20:41:57 +00001843 rc = sqlitepager_write(pFreeIdx);
1844 if( rc==SQLITE_OK ){
drh0d316a42002-08-11 20:10:47 +00001845 pInfo->aFree[n] = SWAB32(pBt, pgno);
1846 SWAB_ADD(pBt, pInfo->nFree, 1);
drh30e58752002-03-02 20:41:57 +00001847 sqlitepager_unref(pFreeIdx);
1848 sqlitepager_dont_write(pBt->pPager, pgno);
1849 return rc;
1850 }
1851 }
1852 sqlitepager_unref(pFreeIdx);
1853 }
1854 }
drh3b7511c2001-05-26 13:15:44 +00001855 if( pOvfl==0 ){
1856 assert( pgno>0 );
drh8c42ca92001-06-22 19:15:00 +00001857 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001858 if( rc ) return rc;
drhdd793422001-06-28 01:54:48 +00001859 needUnref = 1;
drh3b7511c2001-05-26 13:15:44 +00001860 }
1861 rc = sqlitepager_write(pOvfl);
1862 if( rc ){
drhdd793422001-06-28 01:54:48 +00001863 if( needUnref ) sqlitepager_unref(pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001864 return rc;
1865 }
drh14acc042001-06-10 19:56:58 +00001866 pOvfl->iNext = pPage1->freeList;
drh0d316a42002-08-11 20:10:47 +00001867 pPage1->freeList = SWAB32(pBt, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001868 memset(pOvfl->aPayload, 0, OVERFLOW_SIZE);
drhdd793422001-06-28 01:54:48 +00001869 if( needUnref ) rc = sqlitepager_unref(pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001870 return rc;
1871}
1872
1873/*
1874** Erase all the data out of a cell. This involves returning overflow
1875** pages back the freelist.
1876*/
1877static int clearCell(Btree *pBt, Cell *pCell){
1878 Pager *pPager = pBt->pPager;
1879 OverflowPage *pOvfl;
drh3b7511c2001-05-26 13:15:44 +00001880 Pgno ovfl, nextOvfl;
1881 int rc;
1882
drh0d316a42002-08-11 20:10:47 +00001883 if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){
drh5e2f8b92001-05-28 00:41:15 +00001884 return SQLITE_OK;
1885 }
drh0d316a42002-08-11 20:10:47 +00001886 ovfl = SWAB32(pBt, pCell->ovfl);
drh3b7511c2001-05-26 13:15:44 +00001887 pCell->ovfl = 0;
1888 while( ovfl ){
drh8c42ca92001-06-22 19:15:00 +00001889 rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001890 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00001891 nextOvfl = SWAB32(pBt, pOvfl->iNext);
drhbd03cae2001-06-02 02:40:57 +00001892 rc = freePage(pBt, pOvfl, ovfl);
1893 if( rc ) return rc;
drhdd793422001-06-28 01:54:48 +00001894 sqlitepager_unref(pOvfl);
drh3b7511c2001-05-26 13:15:44 +00001895 ovfl = nextOvfl;
drh3b7511c2001-05-26 13:15:44 +00001896 }
drh5e2f8b92001-05-28 00:41:15 +00001897 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00001898}
1899
1900/*
1901** Create a new cell from key and data. Overflow pages are allocated as
1902** necessary and linked to this cell.
1903*/
1904static int fillInCell(
1905 Btree *pBt, /* The whole Btree. Needed to allocate pages */
1906 Cell *pCell, /* Populate this Cell structure */
drh5c4d9702001-08-20 00:33:58 +00001907 const void *pKey, int nKey, /* The key */
1908 const void *pData,int nData /* The data */
drh3b7511c2001-05-26 13:15:44 +00001909){
drhdd793422001-06-28 01:54:48 +00001910 OverflowPage *pOvfl, *pPrior;
drh3b7511c2001-05-26 13:15:44 +00001911 Pgno *pNext;
1912 int spaceLeft;
drh8c42ca92001-06-22 19:15:00 +00001913 int n, rc;
drh3b7511c2001-05-26 13:15:44 +00001914 int nPayload;
drh5c4d9702001-08-20 00:33:58 +00001915 const char *pPayload;
drh3b7511c2001-05-26 13:15:44 +00001916 char *pSpace;
drh199e3cf2002-07-18 11:01:47 +00001917 Pgno nearby = 0;
drh3b7511c2001-05-26 13:15:44 +00001918
drh5e2f8b92001-05-28 00:41:15 +00001919 pCell->h.leftChild = 0;
drh0d316a42002-08-11 20:10:47 +00001920 pCell->h.nKey = SWAB16(pBt, nKey & 0xffff);
drh80ff32f2001-11-04 18:32:46 +00001921 pCell->h.nKeyHi = nKey >> 16;
drh0d316a42002-08-11 20:10:47 +00001922 pCell->h.nData = SWAB16(pBt, nData & 0xffff);
drh80ff32f2001-11-04 18:32:46 +00001923 pCell->h.nDataHi = nData >> 16;
drh3b7511c2001-05-26 13:15:44 +00001924 pCell->h.iNext = 0;
1925
1926 pNext = &pCell->ovfl;
drh5e2f8b92001-05-28 00:41:15 +00001927 pSpace = pCell->aPayload;
drh3b7511c2001-05-26 13:15:44 +00001928 spaceLeft = MX_LOCAL_PAYLOAD;
1929 pPayload = pKey;
1930 pKey = 0;
1931 nPayload = nKey;
drhdd793422001-06-28 01:54:48 +00001932 pPrior = 0;
drh3b7511c2001-05-26 13:15:44 +00001933 while( nPayload>0 ){
1934 if( spaceLeft==0 ){
drh199e3cf2002-07-18 11:01:47 +00001935 rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby);
drh3b7511c2001-05-26 13:15:44 +00001936 if( rc ){
1937 *pNext = 0;
drhbea00b92002-07-08 10:59:50 +00001938 }else{
drh199e3cf2002-07-18 11:01:47 +00001939 nearby = *pNext;
drhdd793422001-06-28 01:54:48 +00001940 }
1941 if( pPrior ) sqlitepager_unref(pPrior);
1942 if( rc ){
drh5e2f8b92001-05-28 00:41:15 +00001943 clearCell(pBt, pCell);
drh3b7511c2001-05-26 13:15:44 +00001944 return rc;
1945 }
drh0d316a42002-08-11 20:10:47 +00001946 if( pBt->needSwab ) *pNext = swab32(*pNext);
drhdd793422001-06-28 01:54:48 +00001947 pPrior = pOvfl;
drh3b7511c2001-05-26 13:15:44 +00001948 spaceLeft = OVERFLOW_SIZE;
drh5e2f8b92001-05-28 00:41:15 +00001949 pSpace = pOvfl->aPayload;
drh8c42ca92001-06-22 19:15:00 +00001950 pNext = &pOvfl->iNext;
drh3b7511c2001-05-26 13:15:44 +00001951 }
1952 n = nPayload;
1953 if( n>spaceLeft ) n = spaceLeft;
1954 memcpy(pSpace, pPayload, n);
1955 nPayload -= n;
1956 if( nPayload==0 && pData ){
1957 pPayload = pData;
1958 nPayload = nData;
1959 pData = 0;
1960 }else{
1961 pPayload += n;
1962 }
1963 spaceLeft -= n;
1964 pSpace += n;
1965 }
drhdd793422001-06-28 01:54:48 +00001966 *pNext = 0;
1967 if( pPrior ){
1968 sqlitepager_unref(pPrior);
1969 }
drh3b7511c2001-05-26 13:15:44 +00001970 return SQLITE_OK;
1971}
1972
1973/*
drhbd03cae2001-06-02 02:40:57 +00001974** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00001975** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00001976** pointer in the third argument.
1977*/
drh428ae8c2003-01-04 16:48:09 +00001978static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){
drhbd03cae2001-06-02 02:40:57 +00001979 MemPage *pThis;
1980
drhdd793422001-06-28 01:54:48 +00001981 if( pgno==0 ) return;
1982 assert( pPager!=0 );
drhbd03cae2001-06-02 02:40:57 +00001983 pThis = sqlitepager_lookup(pPager, pgno);
drh6019e162001-07-02 17:51:45 +00001984 if( pThis && pThis->isInit ){
drhdd793422001-06-28 01:54:48 +00001985 if( pThis->pParent!=pNewParent ){
1986 if( pThis->pParent ) sqlitepager_unref(pThis->pParent);
1987 pThis->pParent = pNewParent;
1988 if( pNewParent ) sqlitepager_ref(pNewParent);
1989 }
drh428ae8c2003-01-04 16:48:09 +00001990 pThis->idxParent = idx;
drhdd793422001-06-28 01:54:48 +00001991 sqlitepager_unref(pThis);
drhbd03cae2001-06-02 02:40:57 +00001992 }
1993}
1994
1995/*
1996** Reparent all children of the given page to be the given page.
1997** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00001998** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00001999**
2000** This routine gets called after you memcpy() one page into
2001** another.
2002*/
drh0d316a42002-08-11 20:10:47 +00002003static void reparentChildPages(Btree *pBt, MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002004 int i;
drh0d316a42002-08-11 20:10:47 +00002005 Pager *pPager = pBt->pPager;
drhbd03cae2001-06-02 02:40:57 +00002006 for(i=0; i<pPage->nCell; i++){
drh428ae8c2003-01-04 16:48:09 +00002007 reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002008 }
drh428ae8c2003-01-04 16:48:09 +00002009 reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i);
2010 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002011}
2012
2013/*
2014** Remove the i-th cell from pPage. This routine effects pPage only.
2015** The cell content is not freed or deallocated. It is assumed that
2016** the cell content has been copied someplace else. This routine just
2017** removes the reference to the cell from pPage.
2018**
2019** "sz" must be the number of bytes in the cell.
2020**
2021** Do not bother maintaining the integrity of the linked list of Cells.
drh8c42ca92001-06-22 19:15:00 +00002022** Only the pPage->apCell[] array is important. The relinkCellList()
2023** routine will be called soon after this routine in order to rebuild
2024** the linked list.
drh14acc042001-06-10 19:56:58 +00002025*/
drh0d316a42002-08-11 20:10:47 +00002026static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){
drh14acc042001-06-10 19:56:58 +00002027 int j;
drh8c42ca92001-06-22 19:15:00 +00002028 assert( idx>=0 && idx<pPage->nCell );
drh0d316a42002-08-11 20:10:47 +00002029 assert( sz==cellSize(pBt, pPage->apCell[idx]) );
drh6019e162001-07-02 17:51:45 +00002030 assert( sqlitepager_iswriteable(pPage) );
drh0d316a42002-08-11 20:10:47 +00002031 freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz);
drh7c717f72001-06-24 20:39:41 +00002032 for(j=idx; j<pPage->nCell-1; j++){
drh14acc042001-06-10 19:56:58 +00002033 pPage->apCell[j] = pPage->apCell[j+1];
2034 }
2035 pPage->nCell--;
drh428ae8c2003-01-04 16:48:09 +00002036 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002037}
2038
2039/*
2040** Insert a new cell on pPage at cell index "i". pCell points to the
2041** content of the cell.
2042**
2043** If the cell content will fit on the page, then put it there. If it
2044** will not fit, then just make pPage->apCell[i] point to the content
2045** and set pPage->isOverfull.
2046**
2047** Do not bother maintaining the integrity of the linked list of Cells.
drh8c42ca92001-06-22 19:15:00 +00002048** Only the pPage->apCell[] array is important. The relinkCellList()
2049** routine will be called soon after this routine in order to rebuild
2050** the linked list.
drh14acc042001-06-10 19:56:58 +00002051*/
drh0d316a42002-08-11 20:10:47 +00002052static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){
drh14acc042001-06-10 19:56:58 +00002053 int idx, j;
2054 assert( i>=0 && i<=pPage->nCell );
drh0d316a42002-08-11 20:10:47 +00002055 assert( sz==cellSize(pBt, pCell) );
drh6019e162001-07-02 17:51:45 +00002056 assert( sqlitepager_iswriteable(pPage) );
drh0d316a42002-08-11 20:10:47 +00002057 idx = allocateSpace(pBt, pPage, sz);
drh14acc042001-06-10 19:56:58 +00002058 for(j=pPage->nCell; j>i; j--){
2059 pPage->apCell[j] = pPage->apCell[j-1];
2060 }
2061 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002062 if( idx<=0 ){
2063 pPage->isOverfull = 1;
2064 pPage->apCell[i] = pCell;
2065 }else{
2066 memcpy(&pPage->u.aDisk[idx], pCell, sz);
drh8c42ca92001-06-22 19:15:00 +00002067 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx];
drh14acc042001-06-10 19:56:58 +00002068 }
drh428ae8c2003-01-04 16:48:09 +00002069 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002070}
2071
2072/*
2073** Rebuild the linked list of cells on a page so that the cells
drh8c42ca92001-06-22 19:15:00 +00002074** occur in the order specified by the pPage->apCell[] array.
2075** Invoke this routine once to repair damage after one or more
2076** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002077*/
drh0d316a42002-08-11 20:10:47 +00002078static void relinkCellList(Btree *pBt, MemPage *pPage){
drh14acc042001-06-10 19:56:58 +00002079 int i;
2080 u16 *pIdx;
drh6019e162001-07-02 17:51:45 +00002081 assert( sqlitepager_iswriteable(pPage) );
drh14acc042001-06-10 19:56:58 +00002082 pIdx = &pPage->u.hdr.firstCell;
2083 for(i=0; i<pPage->nCell; i++){
drh7c717f72001-06-24 20:39:41 +00002084 int idx = Addr(pPage->apCell[i]) - Addr(pPage);
drh8c42ca92001-06-22 19:15:00 +00002085 assert( idx>0 && idx<SQLITE_PAGE_SIZE );
drh0d316a42002-08-11 20:10:47 +00002086 *pIdx = SWAB16(pBt, idx);
drh14acc042001-06-10 19:56:58 +00002087 pIdx = &pPage->apCell[i]->h.iNext;
2088 }
2089 *pIdx = 0;
2090}
2091
2092/*
2093** Make a copy of the contents of pFrom into pTo. The pFrom->apCell[]
drh5e00f6c2001-09-13 13:46:56 +00002094** pointers that point into pFrom->u.aDisk[] must be adjusted to point
drhdd793422001-06-28 01:54:48 +00002095** into pTo->u.aDisk[] instead. But some pFrom->apCell[] entries might
drh14acc042001-06-10 19:56:58 +00002096** not point to pFrom->u.aDisk[]. Those are unchanged.
2097*/
2098static void copyPage(MemPage *pTo, MemPage *pFrom){
2099 uptr from, to;
2100 int i;
2101 memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_PAGE_SIZE);
drhdd793422001-06-28 01:54:48 +00002102 pTo->pParent = 0;
drh14acc042001-06-10 19:56:58 +00002103 pTo->isInit = 1;
2104 pTo->nCell = pFrom->nCell;
2105 pTo->nFree = pFrom->nFree;
2106 pTo->isOverfull = pFrom->isOverfull;
drh7c717f72001-06-24 20:39:41 +00002107 to = Addr(pTo);
2108 from = Addr(pFrom);
drh14acc042001-06-10 19:56:58 +00002109 for(i=0; i<pTo->nCell; i++){
drh7c717f72001-06-24 20:39:41 +00002110 uptr x = Addr(pFrom->apCell[i]);
drh8c42ca92001-06-22 19:15:00 +00002111 if( x>from && x<from+SQLITE_PAGE_SIZE ){
2112 *((uptr*)&pTo->apCell[i]) = x + to - from;
drhdd793422001-06-28 01:54:48 +00002113 }else{
2114 pTo->apCell[i] = pFrom->apCell[i];
drh14acc042001-06-10 19:56:58 +00002115 }
2116 }
drhbd03cae2001-06-02 02:40:57 +00002117}
2118
2119/*
drhc3b70572003-01-04 19:44:07 +00002120** The following parameters determine how many adjacent pages get involved
2121** in a balancing operation. NN is the number of neighbors on either side
2122** of the page that participate in the balancing operation. NB is the
2123** total number of pages that participate, including the target page and
2124** NN neighbors on either side.
2125**
2126** The minimum value of NN is 1 (of course). Increasing NN above 1
2127** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2128** in exchange for a larger degradation in INSERT and UPDATE performance.
2129** The value of NN appears to give the best results overall.
2130*/
2131#define NN 1 /* Number of neighbors on either side of pPage */
2132#define NB (NN*2+1) /* Total pages involved in the balance */
2133
2134/*
drh8b2f49b2001-06-08 00:21:52 +00002135** This routine redistributes Cells on pPage and up to two siblings
2136** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002137** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002138** though both siblings might come from one side if pPage is the first
2139** or last child of its parent. If pPage has fewer than two siblings
2140** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002141** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002142**
2143** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002144** one in an effort to keep pages between 66% and 100% full. The root page
2145** is special and is allowed to be less than 66% full. If pPage is
2146** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002147** or decreased by one, as necessary, to keep the root page from being
2148** overfull or empty.
2149**
drh14acc042001-06-10 19:56:58 +00002150** This routine calls relinkCellList() on its input page regardless of
2151** whether or not it does any real balancing. Client routines will typically
2152** invoke insertCell() or dropCell() before calling this routine, so we
2153** need to call relinkCellList() to clean up the mess that those other
2154** routines left behind.
2155**
2156** pCur is left pointing to the same cell as when this routine was called
drh8c42ca92001-06-22 19:15:00 +00002157** even if that cell gets moved to a different page. pCur may be NULL.
2158** Set the pCur parameter to NULL if you do not care about keeping track
2159** of a cell as that will save this routine the work of keeping track of it.
drh14acc042001-06-10 19:56:58 +00002160**
drh8b2f49b2001-06-08 00:21:52 +00002161** Note that when this routine is called, some of the Cells on pPage
drh14acc042001-06-10 19:56:58 +00002162** might not actually be stored in pPage->u.aDisk[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002163** if the page is overfull. Part of the job of this routine is to
drh14acc042001-06-10 19:56:58 +00002164** make sure all Cells for pPage once again fit in pPage->u.aDisk[].
2165**
drh8c42ca92001-06-22 19:15:00 +00002166** In the course of balancing the siblings of pPage, the parent of pPage
2167** might become overfull or underfull. If that happens, then this routine
2168** is called recursively on the parent.
2169**
drh5e00f6c2001-09-13 13:46:56 +00002170** If this routine fails for any reason, it might leave the database
2171** in a corrupted state. So if this routine fails, the database should
2172** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002173*/
drh14acc042001-06-10 19:56:58 +00002174static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){
drh8b2f49b2001-06-08 00:21:52 +00002175 MemPage *pParent; /* The parent of pPage */
drh8b2f49b2001-06-08 00:21:52 +00002176 int nCell; /* Number of cells in apCell[] */
2177 int nOld; /* Number of pages in apOld[] */
2178 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002179 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002180 int i, j, k; /* Loop counters */
2181 int idx; /* Index of pPage in pParent->apCell[] */
2182 int nxDiv; /* Next divider slot in pParent->apCell[] */
2183 int rc; /* The return code */
2184 int iCur; /* apCell[iCur] is the cell of the cursor */
drh5edc3122001-09-13 21:53:09 +00002185 MemPage *pOldCurPage; /* The cursor originally points to this page */
drh6019e162001-07-02 17:51:45 +00002186 int subtotal; /* Subtotal of bytes in cells on one page */
drh9ca7d3b2001-06-28 11:50:21 +00002187 MemPage *extraUnref = 0; /* A page that needs to be unref-ed */
drhc3b70572003-01-04 19:44:07 +00002188 MemPage *apOld[NB]; /* pPage and up to two siblings */
2189 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
2190 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2191 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2192 int idxDiv[NB]; /* Indices of divider cells in pParent */
2193 Cell *apDiv[NB]; /* Divider cells in pParent */
2194 Cell aTemp[NB]; /* Temporary holding area for apDiv[] */
2195 int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */
2196 int szNew[NB+1]; /* Combined size of cells place on i-th page */
2197 MemPage aOld[NB]; /* Temporary copies of pPage and its siblings */
2198 Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
2199 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh8b2f49b2001-06-08 00:21:52 +00002200
drh14acc042001-06-10 19:56:58 +00002201 /*
2202 ** Return without doing any work if pPage is neither overfull nor
2203 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002204 */
drh6019e162001-07-02 17:51:45 +00002205 assert( sqlitepager_iswriteable(pPage) );
drha1b351a2001-09-14 16:42:12 +00002206 if( !pPage->isOverfull && pPage->nFree<SQLITE_PAGE_SIZE/2
2207 && pPage->nCell>=2){
drh0d316a42002-08-11 20:10:47 +00002208 relinkCellList(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00002209 return SQLITE_OK;
2210 }
2211
2212 /*
drh14acc042001-06-10 19:56:58 +00002213 ** Find the parent of the page to be balanceed.
2214 ** If there is no parent, it means this page is the root page and
drh8b2f49b2001-06-08 00:21:52 +00002215 ** special rules apply.
2216 */
drh14acc042001-06-10 19:56:58 +00002217 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002218 if( pParent==0 ){
2219 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002220 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002221 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002222 if( pPage->nCell==0 ){
drh14acc042001-06-10 19:56:58 +00002223 if( pPage->u.hdr.rightChild ){
2224 /*
2225 ** The root page is empty. Copy the one child page
drh8b2f49b2001-06-08 00:21:52 +00002226 ** into the root page and return. This reduces the depth
2227 ** of the BTree by one.
2228 */
drh0d316a42002-08-11 20:10:47 +00002229 pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild);
drh8c42ca92001-06-22 19:15:00 +00002230 rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild);
drh8b2f49b2001-06-08 00:21:52 +00002231 if( rc ) return rc;
2232 memcpy(pPage, pChild, SQLITE_PAGE_SIZE);
2233 pPage->isInit = 0;
drh0d316a42002-08-11 20:10:47 +00002234 rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0);
drh6019e162001-07-02 17:51:45 +00002235 assert( rc==SQLITE_OK );
drh0d316a42002-08-11 20:10:47 +00002236 reparentChildPages(pBt, pPage);
drh5edc3122001-09-13 21:53:09 +00002237 if( pCur && pCur->pPage==pChild ){
2238 sqlitepager_unref(pChild);
2239 pCur->pPage = pPage;
2240 sqlitepager_ref(pPage);
2241 }
drh8b2f49b2001-06-08 00:21:52 +00002242 freePage(pBt, pChild, pgnoChild);
2243 sqlitepager_unref(pChild);
drhefc251d2001-07-01 22:12:01 +00002244 }else{
drh0d316a42002-08-11 20:10:47 +00002245 relinkCellList(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00002246 }
2247 return SQLITE_OK;
2248 }
drh14acc042001-06-10 19:56:58 +00002249 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002250 /* It is OK for the root page to be less than half full.
2251 */
drh0d316a42002-08-11 20:10:47 +00002252 relinkCellList(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00002253 return SQLITE_OK;
2254 }
drh14acc042001-06-10 19:56:58 +00002255 /*
2256 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002257 ** When this happens, Create a new child page and copy the
2258 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002259 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002260 ** child. Then fall thru to the code below which will cause
2261 ** the overfull child page to be split.
2262 */
drh14acc042001-06-10 19:56:58 +00002263 rc = sqlitepager_write(pPage);
2264 if( rc ) return rc;
drhbea00b92002-07-08 10:59:50 +00002265 rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage));
drh8b2f49b2001-06-08 00:21:52 +00002266 if( rc ) return rc;
drh6019e162001-07-02 17:51:45 +00002267 assert( sqlitepager_iswriteable(pChild) );
drh14acc042001-06-10 19:56:58 +00002268 copyPage(pChild, pPage);
2269 pChild->pParent = pPage;
drhbb49aba2003-01-04 18:53:27 +00002270 pChild->idxParent = 0;
drhdd793422001-06-28 01:54:48 +00002271 sqlitepager_ref(pPage);
drh14acc042001-06-10 19:56:58 +00002272 pChild->isOverfull = 1;
drh5edc3122001-09-13 21:53:09 +00002273 if( pCur && pCur->pPage==pPage ){
2274 sqlitepager_unref(pPage);
drh14acc042001-06-10 19:56:58 +00002275 pCur->pPage = pChild;
drh9ca7d3b2001-06-28 11:50:21 +00002276 }else{
2277 extraUnref = pChild;
drh8b2f49b2001-06-08 00:21:52 +00002278 }
drh0d316a42002-08-11 20:10:47 +00002279 zeroPage(pBt, pPage);
2280 pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild);
drh8b2f49b2001-06-08 00:21:52 +00002281 pParent = pPage;
2282 pPage = pChild;
drh8b2f49b2001-06-08 00:21:52 +00002283 }
drh6019e162001-07-02 17:51:45 +00002284 rc = sqlitepager_write(pParent);
2285 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002286 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002287
drh8b2f49b2001-06-08 00:21:52 +00002288 /*
drh14acc042001-06-10 19:56:58 +00002289 ** Find the Cell in the parent page whose h.leftChild points back
2290 ** to pPage. The "idx" variable is the index of that cell. If pPage
2291 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002292 */
drhbb49aba2003-01-04 18:53:27 +00002293 if( pParent->idxShift ){
2294 Pgno pgno, swabPgno;
2295 pgno = sqlitepager_pagenumber(pPage);
2296 swabPgno = SWAB32(pBt, pgno);
2297 for(idx=0; idx<pParent->nCell; idx++){
2298 if( pParent->apCell[idx]->h.leftChild==swabPgno ){
2299 break;
2300 }
drh8b2f49b2001-06-08 00:21:52 +00002301 }
drhbb49aba2003-01-04 18:53:27 +00002302 assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno );
2303 }else{
2304 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002305 }
drh8b2f49b2001-06-08 00:21:52 +00002306
2307 /*
drh14acc042001-06-10 19:56:58 +00002308 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002309 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002310 */
drh14acc042001-06-10 19:56:58 +00002311 nOld = nNew = 0;
2312 sqlitepager_ref(pParent);
2313
2314 /*
2315 ** Find sibling pages to pPage and the Cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002316 ** the siblings. An attempt is made to find NN siblings on either
2317 ** side of pPage. More siblings are taken from one side, however, if
2318 ** pPage there are fewer than NN siblings on the other side. If pParent
2319 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002320 */
drhc3b70572003-01-04 19:44:07 +00002321 nxDiv = idx - NN;
2322 if( nxDiv + NB > pParent->nCell ){
2323 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002324 }
drhc3b70572003-01-04 19:44:07 +00002325 if( nxDiv<0 ){
2326 nxDiv = 0;
2327 }
drh8b2f49b2001-06-08 00:21:52 +00002328 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002329 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002330 if( k<pParent->nCell ){
2331 idxDiv[i] = k;
2332 apDiv[i] = pParent->apCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002333 nDiv++;
drh0d316a42002-08-11 20:10:47 +00002334 pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild);
drh14acc042001-06-10 19:56:58 +00002335 }else if( k==pParent->nCell ){
drh0d316a42002-08-11 20:10:47 +00002336 pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild);
drh14acc042001-06-10 19:56:58 +00002337 }else{
2338 break;
drh8b2f49b2001-06-08 00:21:52 +00002339 }
drh8c42ca92001-06-22 19:15:00 +00002340 rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]);
drh14acc042001-06-10 19:56:58 +00002341 if( rc ) goto balance_cleanup;
drh0d316a42002-08-11 20:10:47 +00002342 rc = initPage(pBt, apOld[i], pgnoOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002343 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002344 apOld[i]->idxParent = k;
drh14acc042001-06-10 19:56:58 +00002345 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002346 }
2347
2348 /*
drh14acc042001-06-10 19:56:58 +00002349 ** Set iCur to be the index in apCell[] of the cell that the cursor
2350 ** is pointing to. We will need this later on in order to keep the
drh5edc3122001-09-13 21:53:09 +00002351 ** cursor pointing at the same cell. If pCur points to a page that
2352 ** has no involvement with this rebalancing, then set iCur to a large
2353 ** number so that the iCur==j tests always fail in the main cell
2354 ** distribution loop below.
drh14acc042001-06-10 19:56:58 +00002355 */
2356 if( pCur ){
drh5edc3122001-09-13 21:53:09 +00002357 iCur = 0;
2358 for(i=0; i<nOld; i++){
2359 if( pCur->pPage==apOld[i] ){
2360 iCur += pCur->idx;
2361 break;
2362 }
2363 iCur += apOld[i]->nCell;
2364 if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){
2365 break;
2366 }
2367 iCur++;
drh14acc042001-06-10 19:56:58 +00002368 }
drh5edc3122001-09-13 21:53:09 +00002369 pOldCurPage = pCur->pPage;
drh14acc042001-06-10 19:56:58 +00002370 }
2371
2372 /*
2373 ** Make copies of the content of pPage and its siblings into aOld[].
2374 ** The rest of this function will use data from the copies rather
2375 ** that the original pages since the original pages will be in the
2376 ** process of being overwritten.
2377 */
2378 for(i=0; i<nOld; i++){
2379 copyPage(&aOld[i], apOld[i]);
drh14acc042001-06-10 19:56:58 +00002380 }
2381
2382 /*
2383 ** Load pointers to all cells on sibling pages and the divider cells
2384 ** into the local apCell[] array. Make copies of the divider cells
2385 ** into aTemp[] and remove the the divider Cells from pParent.
drh8b2f49b2001-06-08 00:21:52 +00002386 */
2387 nCell = 0;
2388 for(i=0; i<nOld; i++){
drh6b308672002-07-08 02:16:37 +00002389 MemPage *pOld = &aOld[i];
drh8b2f49b2001-06-08 00:21:52 +00002390 for(j=0; j<pOld->nCell; j++){
drh14acc042001-06-10 19:56:58 +00002391 apCell[nCell] = pOld->apCell[j];
drh0d316a42002-08-11 20:10:47 +00002392 szCell[nCell] = cellSize(pBt, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002393 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002394 }
2395 if( i<nOld-1 ){
drh0d316a42002-08-11 20:10:47 +00002396 szCell[nCell] = cellSize(pBt, apDiv[i]);
drh8c42ca92001-06-22 19:15:00 +00002397 memcpy(&aTemp[i], apDiv[i], szCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002398 apCell[nCell] = &aTemp[i];
drh0d316a42002-08-11 20:10:47 +00002399 dropCell(pBt, pParent, nxDiv, szCell[nCell]);
2400 assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] );
drh14acc042001-06-10 19:56:58 +00002401 apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild;
2402 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002403 }
2404 }
2405
2406 /*
drh6019e162001-07-02 17:51:45 +00002407 ** Figure out the number of pages needed to hold all nCell cells.
2408 ** Store this number in "k". Also compute szNew[] which is the total
2409 ** size of all cells on the i-th page and cntNew[] which is the index
2410 ** in apCell[] of the cell that divides path i from path i+1.
2411 ** cntNew[k] should equal nCell.
2412 **
2413 ** This little patch of code is critical for keeping the tree
2414 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00002415 */
drh6019e162001-07-02 17:51:45 +00002416 for(subtotal=k=i=0; i<nCell; i++){
2417 subtotal += szCell[i];
2418 if( subtotal > USABLE_SPACE ){
2419 szNew[k] = subtotal - szCell[i];
2420 cntNew[k] = i;
2421 subtotal = 0;
2422 k++;
2423 }
2424 }
2425 szNew[k] = subtotal;
2426 cntNew[k] = nCell;
2427 k++;
2428 for(i=k-1; i>0; i--){
2429 while( szNew[i]<USABLE_SPACE/2 ){
2430 cntNew[i-1]--;
2431 assert( cntNew[i-1]>0 );
2432 szNew[i] += szCell[cntNew[i-1]];
2433 szNew[i-1] -= szCell[cntNew[i-1]-1];
2434 }
2435 }
2436 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00002437
2438 /*
drh6b308672002-07-08 02:16:37 +00002439 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00002440 */
drh14acc042001-06-10 19:56:58 +00002441 for(i=0; i<k; i++){
drh6b308672002-07-08 02:16:37 +00002442 if( i<nOld ){
2443 apNew[i] = apOld[i];
2444 pgnoNew[i] = pgnoOld[i];
2445 apOld[i] = 0;
2446 sqlitepager_write(apNew[i]);
2447 }else{
drhbea00b92002-07-08 10:59:50 +00002448 rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00002449 if( rc ) goto balance_cleanup;
2450 }
drh14acc042001-06-10 19:56:58 +00002451 nNew++;
drh0d316a42002-08-11 20:10:47 +00002452 zeroPage(pBt, apNew[i]);
drh6019e162001-07-02 17:51:45 +00002453 apNew[i]->isInit = 1;
drh8b2f49b2001-06-08 00:21:52 +00002454 }
2455
drh6b308672002-07-08 02:16:37 +00002456 /* Free any old pages that were not reused as new pages.
2457 */
2458 while( i<nOld ){
2459 rc = freePage(pBt, apOld[i], pgnoOld[i]);
2460 if( rc ) goto balance_cleanup;
2461 sqlitepager_unref(apOld[i]);
2462 apOld[i] = 0;
2463 i++;
2464 }
2465
drh8b2f49b2001-06-08 00:21:52 +00002466 /*
drhf9ffac92002-03-02 19:00:31 +00002467 ** Put the new pages in accending order. This helps to
2468 ** keep entries in the disk file in order so that a scan
2469 ** of the table is a linear scan through the file. That
2470 ** in turn helps the operating system to deliver pages
2471 ** from the disk more rapidly.
2472 **
2473 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00002474 ** n is never more than NB (a small constant), that should
2475 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00002476 **
drhc3b70572003-01-04 19:44:07 +00002477 ** When NB==3, this one optimization makes the database
2478 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00002479 */
2480 for(i=0; i<k-1; i++){
2481 int minV = pgnoNew[i];
2482 int minI = i;
2483 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00002484 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00002485 minI = j;
2486 minV = pgnoNew[j];
2487 }
2488 }
2489 if( minI>i ){
2490 int t;
2491 MemPage *pT;
2492 t = pgnoNew[i];
2493 pT = apNew[i];
2494 pgnoNew[i] = pgnoNew[minI];
2495 apNew[i] = apNew[minI];
2496 pgnoNew[minI] = t;
2497 apNew[minI] = pT;
2498 }
2499 }
2500
2501 /*
drh14acc042001-06-10 19:56:58 +00002502 ** Evenly distribute the data in apCell[] across the new pages.
2503 ** Insert divider cells into pParent as necessary.
2504 */
2505 j = 0;
2506 for(i=0; i<nNew; i++){
2507 MemPage *pNew = apNew[i];
drh6019e162001-07-02 17:51:45 +00002508 while( j<cntNew[i] ){
2509 assert( pNew->nFree>=szCell[j] );
drh14acc042001-06-10 19:56:58 +00002510 if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; }
drh0d316a42002-08-11 20:10:47 +00002511 insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]);
drh14acc042001-06-10 19:56:58 +00002512 j++;
2513 }
drh6019e162001-07-02 17:51:45 +00002514 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00002515 assert( !pNew->isOverfull );
drh0d316a42002-08-11 20:10:47 +00002516 relinkCellList(pBt, pNew);
drh14acc042001-06-10 19:56:58 +00002517 if( i<nNew-1 && j<nCell ){
2518 pNew->u.hdr.rightChild = apCell[j]->h.leftChild;
drh0d316a42002-08-11 20:10:47 +00002519 apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]);
drh14acc042001-06-10 19:56:58 +00002520 if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; }
drh0d316a42002-08-11 20:10:47 +00002521 insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]);
drh14acc042001-06-10 19:56:58 +00002522 j++;
2523 nxDiv++;
2524 }
2525 }
drh6019e162001-07-02 17:51:45 +00002526 assert( j==nCell );
drh6b308672002-07-08 02:16:37 +00002527 apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild;
drh14acc042001-06-10 19:56:58 +00002528 if( nxDiv==pParent->nCell ){
drh0d316a42002-08-11 20:10:47 +00002529 pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00002530 }else{
drh0d316a42002-08-11 20:10:47 +00002531 pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00002532 }
2533 if( pCur ){
drh3fc190c2001-09-14 03:24:23 +00002534 if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){
2535 assert( pCur->pPage==pOldCurPage );
2536 pCur->idx += nNew - nOld;
2537 }else{
2538 assert( pOldCurPage!=0 );
2539 sqlitepager_ref(pCur->pPage);
2540 sqlitepager_unref(pOldCurPage);
2541 }
drh14acc042001-06-10 19:56:58 +00002542 }
2543
2544 /*
2545 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00002546 */
2547 for(i=0; i<nNew; i++){
drh0d316a42002-08-11 20:10:47 +00002548 reparentChildPages(pBt, apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00002549 }
drh0d316a42002-08-11 20:10:47 +00002550 reparentChildPages(pBt, pParent);
drh8b2f49b2001-06-08 00:21:52 +00002551
2552 /*
drh14acc042001-06-10 19:56:58 +00002553 ** balance the parent page.
drh8b2f49b2001-06-08 00:21:52 +00002554 */
drh5edc3122001-09-13 21:53:09 +00002555 rc = balance(pBt, pParent, pCur);
drh8b2f49b2001-06-08 00:21:52 +00002556
2557 /*
drh14acc042001-06-10 19:56:58 +00002558 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00002559 */
drh14acc042001-06-10 19:56:58 +00002560balance_cleanup:
drh9ca7d3b2001-06-28 11:50:21 +00002561 if( extraUnref ){
2562 sqlitepager_unref(extraUnref);
2563 }
drh8b2f49b2001-06-08 00:21:52 +00002564 for(i=0; i<nOld; i++){
drh6b308672002-07-08 02:16:37 +00002565 if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00002566 }
drh14acc042001-06-10 19:56:58 +00002567 for(i=0; i<nNew; i++){
2568 sqlitepager_unref(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00002569 }
drh14acc042001-06-10 19:56:58 +00002570 if( pCur && pCur->pPage==0 ){
2571 pCur->pPage = pParent;
2572 pCur->idx = 0;
2573 }else{
2574 sqlitepager_unref(pParent);
drh8b2f49b2001-06-08 00:21:52 +00002575 }
2576 return rc;
2577}
2578
2579/*
drhf74b8d92002-09-01 23:20:45 +00002580** This routine checks all cursors that point to the same table
2581** as pCur points to. If any of those cursors were opened with
2582** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
2583** cursors point to the same table were opened with wrFlag==1
2584** then this routine returns SQLITE_OK.
2585**
2586** In addition to checking for read-locks (where a read-lock
2587** means a cursor opened with wrFlag==0) this routine also moves
2588** all cursors other than pCur so that they are pointing to the
2589** first Cell on root page. This is necessary because an insert
2590** or delete might change the number of cells on a page or delete
2591** a page entirely and we do not want to leave any cursors
2592** pointing to non-existant pages or cells.
2593*/
2594static int checkReadLocks(BtCursor *pCur){
2595 BtCursor *p;
2596 assert( pCur->wrFlag );
2597 for(p=pCur->pShared; p!=pCur; p=p->pShared){
2598 assert( p );
2599 assert( p->pgnoRoot==pCur->pgnoRoot );
2600 if( p->wrFlag==0 ) return SQLITE_LOCKED;
2601 if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){
2602 moveToRoot(p);
2603 }
2604 }
2605 return SQLITE_OK;
2606}
2607
2608/*
drh3b7511c2001-05-26 13:15:44 +00002609** Insert a new record into the BTree. The key is given by (pKey,nKey)
2610** and the data is given by (pData,nData). The cursor is used only to
2611** define what database the record should be inserted into. The cursor
drh14acc042001-06-10 19:56:58 +00002612** is left pointing at the new record.
drh3b7511c2001-05-26 13:15:44 +00002613*/
drh144f9ea2003-04-16 01:28:16 +00002614static int fileBtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00002615 BtCursor *pCur, /* Insert data into the table of this cursor */
drhbe0072d2001-09-13 14:46:09 +00002616 const void *pKey, int nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00002617 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00002618){
2619 Cell newCell;
2620 int rc;
2621 int loc;
drh14acc042001-06-10 19:56:58 +00002622 int szNew;
drh3b7511c2001-05-26 13:15:44 +00002623 MemPage *pPage;
2624 Btree *pBt = pCur->pBt;
2625
drhecdc7532001-09-23 02:35:53 +00002626 if( pCur->pPage==0 ){
2627 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2628 }
drhf74b8d92002-09-01 23:20:45 +00002629 if( !pBt->inTrans || nKey+nData==0 ){
2630 /* Must start a transaction before doing an insert */
2631 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002632 }
drhf74b8d92002-09-01 23:20:45 +00002633 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00002634 if( !pCur->wrFlag ){
2635 return SQLITE_PERM; /* Cursor not open for writing */
2636 }
drhf74b8d92002-09-01 23:20:45 +00002637 if( checkReadLocks(pCur) ){
2638 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2639 }
drh144f9ea2003-04-16 01:28:16 +00002640 rc = fileBtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00002641 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00002642 pPage = pCur->pPage;
drh7aa128d2002-06-21 13:09:16 +00002643 assert( pPage->isInit );
drh14acc042001-06-10 19:56:58 +00002644 rc = sqlitepager_write(pPage);
drhbd03cae2001-06-02 02:40:57 +00002645 if( rc ) return rc;
drh3b7511c2001-05-26 13:15:44 +00002646 rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData);
2647 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002648 szNew = cellSize(pBt, &newCell);
drh3b7511c2001-05-26 13:15:44 +00002649 if( loc==0 ){
drh14acc042001-06-10 19:56:58 +00002650 newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild;
2651 rc = clearCell(pBt, pPage->apCell[pCur->idx]);
drh5e2f8b92001-05-28 00:41:15 +00002652 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002653 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx]));
drh7c717f72001-06-24 20:39:41 +00002654 }else if( loc<0 && pPage->nCell>0 ){
drh14acc042001-06-10 19:56:58 +00002655 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */
2656 pCur->idx++;
2657 }else{
2658 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */
drh3b7511c2001-05-26 13:15:44 +00002659 }
drh0d316a42002-08-11 20:10:47 +00002660 insertCell(pBt, pPage, pCur->idx, &newCell, szNew);
drh14acc042001-06-10 19:56:58 +00002661 rc = balance(pCur->pBt, pPage, pCur);
drh3fc190c2001-09-14 03:24:23 +00002662 /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
2663 /* fflush(stdout); */
drh2dcc9aa2002-12-04 13:40:25 +00002664 pCur->eSkip = SKIP_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00002665 return rc;
2666}
2667
2668/*
drhbd03cae2001-06-02 02:40:57 +00002669** Delete the entry that the cursor is pointing to.
drh5e2f8b92001-05-28 00:41:15 +00002670**
drhbd03cae2001-06-02 02:40:57 +00002671** The cursor is left pointing at either the next or the previous
2672** entry. If the cursor is left pointing to the next entry, then
drh2dcc9aa2002-12-04 13:40:25 +00002673** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to
drhbd03cae2001-06-02 02:40:57 +00002674** sqliteBtreeNext() to be a no-op. That way, you can always call
2675** sqliteBtreeNext() after a delete and the cursor will be left
drh2dcc9aa2002-12-04 13:40:25 +00002676** pointing to the first entry after the deleted entry. Similarly,
2677** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to
2678** the entry prior to the deleted entry so that a subsequent call to
2679** sqliteBtreePrevious() will always leave the cursor pointing at the
2680** entry immediately before the one that was deleted.
drh3b7511c2001-05-26 13:15:44 +00002681*/
drh144f9ea2003-04-16 01:28:16 +00002682static int fileBtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00002683 MemPage *pPage = pCur->pPage;
2684 Cell *pCell;
2685 int rc;
drh8c42ca92001-06-22 19:15:00 +00002686 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00002687 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00002688
drh7aa128d2002-06-21 13:09:16 +00002689 assert( pPage->isInit );
drhecdc7532001-09-23 02:35:53 +00002690 if( pCur->pPage==0 ){
2691 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2692 }
drhf74b8d92002-09-01 23:20:45 +00002693 if( !pBt->inTrans ){
2694 /* Must start a transaction before doing a delete */
2695 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002696 }
drhf74b8d92002-09-01 23:20:45 +00002697 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00002698 if( pCur->idx >= pPage->nCell ){
2699 return SQLITE_ERROR; /* The cursor is not pointing to anything */
2700 }
drhecdc7532001-09-23 02:35:53 +00002701 if( !pCur->wrFlag ){
2702 return SQLITE_PERM; /* Did not open this cursor for writing */
2703 }
drhf74b8d92002-09-01 23:20:45 +00002704 if( checkReadLocks(pCur) ){
2705 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2706 }
drhbd03cae2001-06-02 02:40:57 +00002707 rc = sqlitepager_write(pPage);
2708 if( rc ) return rc;
drh5e2f8b92001-05-28 00:41:15 +00002709 pCell = pPage->apCell[pCur->idx];
drh0d316a42002-08-11 20:10:47 +00002710 pgnoChild = SWAB32(pBt, pCell->h.leftChild);
2711 clearCell(pBt, pCell);
drh14acc042001-06-10 19:56:58 +00002712 if( pgnoChild ){
2713 /*
drh5e00f6c2001-09-13 13:46:56 +00002714 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00002715 ** do something we will leave a hole on an internal page.
2716 ** We have to fill the hole by moving in a cell from a leaf. The
2717 ** next Cell after the one to be deleted is guaranteed to exist and
2718 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00002719 */
drh14acc042001-06-10 19:56:58 +00002720 BtCursor leafCur;
2721 Cell *pNext;
2722 int szNext;
drh8c1238a2003-01-02 14:43:55 +00002723 int notUsed;
drh14acc042001-06-10 19:56:58 +00002724 getTempCursor(pCur, &leafCur);
drh144f9ea2003-04-16 01:28:16 +00002725 rc = fileBtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00002726 if( rc!=SQLITE_OK ){
2727 return SQLITE_CORRUPT;
drh5e2f8b92001-05-28 00:41:15 +00002728 }
drh6019e162001-07-02 17:51:45 +00002729 rc = sqlitepager_write(leafCur.pPage);
2730 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002731 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
drh8c42ca92001-06-22 19:15:00 +00002732 pNext = leafCur.pPage->apCell[leafCur.idx];
drh0d316a42002-08-11 20:10:47 +00002733 szNext = cellSize(pBt, pNext);
2734 pNext->h.leftChild = SWAB32(pBt, pgnoChild);
2735 insertCell(pBt, pPage, pCur->idx, pNext, szNext);
2736 rc = balance(pBt, pPage, pCur);
drh5e2f8b92001-05-28 00:41:15 +00002737 if( rc ) return rc;
drh2dcc9aa2002-12-04 13:40:25 +00002738 pCur->eSkip = SKIP_NEXT;
drh0d316a42002-08-11 20:10:47 +00002739 dropCell(pBt, leafCur.pPage, leafCur.idx, szNext);
2740 rc = balance(pBt, leafCur.pPage, pCur);
drh8c42ca92001-06-22 19:15:00 +00002741 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00002742 }else{
drh0d316a42002-08-11 20:10:47 +00002743 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
drh5edc3122001-09-13 21:53:09 +00002744 if( pCur->idx>=pPage->nCell ){
2745 pCur->idx = pPage->nCell-1;
drhf5bf0a72001-11-23 00:24:12 +00002746 if( pCur->idx<0 ){
2747 pCur->idx = 0;
drh2dcc9aa2002-12-04 13:40:25 +00002748 pCur->eSkip = SKIP_NEXT;
drhf5bf0a72001-11-23 00:24:12 +00002749 }else{
drh2dcc9aa2002-12-04 13:40:25 +00002750 pCur->eSkip = SKIP_PREV;
drhf5bf0a72001-11-23 00:24:12 +00002751 }
drh6019e162001-07-02 17:51:45 +00002752 }else{
drh2dcc9aa2002-12-04 13:40:25 +00002753 pCur->eSkip = SKIP_NEXT;
drh6019e162001-07-02 17:51:45 +00002754 }
drh0d316a42002-08-11 20:10:47 +00002755 rc = balance(pBt, pPage, pCur);
drh5e2f8b92001-05-28 00:41:15 +00002756 }
drh5e2f8b92001-05-28 00:41:15 +00002757 return rc;
drh3b7511c2001-05-26 13:15:44 +00002758}
drh8b2f49b2001-06-08 00:21:52 +00002759
2760/*
drhc6b52df2002-01-04 03:09:29 +00002761** Create a new BTree table. Write into *piTable the page
2762** number for the root page of the new table.
2763**
2764** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00002765** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00002766** are restricted to having a 4-byte integer key and arbitrary data and
2767** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00002768** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00002769*/
drh144f9ea2003-04-16 01:28:16 +00002770static int fileBtreeCreateTable(Btree *pBt, int *piTable){
drh8b2f49b2001-06-08 00:21:52 +00002771 MemPage *pRoot;
2772 Pgno pgnoRoot;
2773 int rc;
2774 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00002775 /* Must start a transaction first */
2776 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002777 }
drh5df72a52002-06-06 23:16:05 +00002778 if( pBt->readOnly ){
2779 return SQLITE_READONLY;
2780 }
drhbea00b92002-07-08 10:59:50 +00002781 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
drh8b2f49b2001-06-08 00:21:52 +00002782 if( rc ) return rc;
drh6019e162001-07-02 17:51:45 +00002783 assert( sqlitepager_iswriteable(pRoot) );
drh0d316a42002-08-11 20:10:47 +00002784 zeroPage(pBt, pRoot);
drh8b2f49b2001-06-08 00:21:52 +00002785 sqlitepager_unref(pRoot);
2786 *piTable = (int)pgnoRoot;
2787 return SQLITE_OK;
2788}
2789
2790/*
2791** Erase the given database page and all its children. Return
2792** the page to the freelist.
2793*/
drh2aa679f2001-06-25 02:11:07 +00002794static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){
drh8b2f49b2001-06-08 00:21:52 +00002795 MemPage *pPage;
2796 int rc;
drh8b2f49b2001-06-08 00:21:52 +00002797 Cell *pCell;
2798 int idx;
2799
drh8c42ca92001-06-22 19:15:00 +00002800 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage);
drh8b2f49b2001-06-08 00:21:52 +00002801 if( rc ) return rc;
drh6019e162001-07-02 17:51:45 +00002802 rc = sqlitepager_write(pPage);
2803 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002804 rc = initPage(pBt, pPage, pgno, 0);
drh7aa128d2002-06-21 13:09:16 +00002805 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002806 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
drh8b2f49b2001-06-08 00:21:52 +00002807 while( idx>0 ){
drh14acc042001-06-10 19:56:58 +00002808 pCell = (Cell*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +00002809 idx = SWAB16(pBt, pCell->h.iNext);
drh8b2f49b2001-06-08 00:21:52 +00002810 if( pCell->h.leftChild ){
drh0d316a42002-08-11 20:10:47 +00002811 rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
drh8b2f49b2001-06-08 00:21:52 +00002812 if( rc ) return rc;
2813 }
drh8c42ca92001-06-22 19:15:00 +00002814 rc = clearCell(pBt, pCell);
drh8b2f49b2001-06-08 00:21:52 +00002815 if( rc ) return rc;
2816 }
drh2aa679f2001-06-25 02:11:07 +00002817 if( pPage->u.hdr.rightChild ){
drh0d316a42002-08-11 20:10:47 +00002818 rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
drh2aa679f2001-06-25 02:11:07 +00002819 if( rc ) return rc;
2820 }
2821 if( freePageFlag ){
2822 rc = freePage(pBt, pPage, pgno);
2823 }else{
drh0d316a42002-08-11 20:10:47 +00002824 zeroPage(pBt, pPage);
drh2aa679f2001-06-25 02:11:07 +00002825 }
drhdd793422001-06-28 01:54:48 +00002826 sqlitepager_unref(pPage);
drh2aa679f2001-06-25 02:11:07 +00002827 return rc;
drh8b2f49b2001-06-08 00:21:52 +00002828}
2829
2830/*
2831** Delete all information from a single table in the database.
2832*/
drh144f9ea2003-04-16 01:28:16 +00002833static int fileBtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00002834 int rc;
drhf74b8d92002-09-01 23:20:45 +00002835 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00002836 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00002837 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002838 }
drhf74b8d92002-09-01 23:20:45 +00002839 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2840 if( pCur->pgnoRoot==(Pgno)iTable ){
2841 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
2842 moveToRoot(pCur);
2843 }
drhecdc7532001-09-23 02:35:53 +00002844 }
drh2aa679f2001-06-25 02:11:07 +00002845 rc = clearDatabasePage(pBt, (Pgno)iTable, 0);
drh8b2f49b2001-06-08 00:21:52 +00002846 if( rc ){
drh144f9ea2003-04-16 01:28:16 +00002847 fileBtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00002848 }
drh8c42ca92001-06-22 19:15:00 +00002849 return rc;
drh8b2f49b2001-06-08 00:21:52 +00002850}
2851
2852/*
2853** Erase all information in a table and add the root of the table to
2854** the freelist. Except, the root of the principle table (the one on
2855** page 2) is never added to the freelist.
2856*/
drh144f9ea2003-04-16 01:28:16 +00002857static int fileBtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00002858 int rc;
2859 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00002860 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00002861 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00002862 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002863 }
drhf74b8d92002-09-01 23:20:45 +00002864 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2865 if( pCur->pgnoRoot==(Pgno)iTable ){
2866 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
2867 }
drh5df72a52002-06-06 23:16:05 +00002868 }
drh8c42ca92001-06-22 19:15:00 +00002869 rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage);
drh2aa679f2001-06-25 02:11:07 +00002870 if( rc ) return rc;
drh144f9ea2003-04-16 01:28:16 +00002871 rc = fileBtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00002872 if( rc ) return rc;
2873 if( iTable>2 ){
2874 rc = freePage(pBt, pPage, iTable);
2875 }else{
drh0d316a42002-08-11 20:10:47 +00002876 zeroPage(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00002877 }
drhdd793422001-06-28 01:54:48 +00002878 sqlitepager_unref(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002879 return rc;
2880}
2881
drh001bbcb2003-03-19 03:14:00 +00002882#if 0 /* UNTESTED */
2883/*
2884** Copy all cell data from one database file into another.
2885** pages back the freelist.
2886*/
2887static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){
2888 Pager *pFromPager = pBtFrom->pPager;
2889 OverflowPage *pOvfl;
2890 Pgno ovfl, nextOvfl;
2891 Pgno *pPrev;
2892 int rc = SQLITE_OK;
2893 MemPage *pNew, *pPrevPg;
2894 Pgno new;
2895
2896 if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){
2897 return SQLITE_OK;
2898 }
2899 pPrev = &pCell->ovfl;
2900 pPrevPg = 0;
2901 ovfl = SWAB32(pBtTo, pCell->ovfl);
2902 while( ovfl && rc==SQLITE_OK ){
2903 rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl);
2904 if( rc ) return rc;
2905 nextOvfl = SWAB32(pBtFrom, pOvfl->iNext);
2906 rc = allocatePage(pBtTo, &pNew, &new, 0);
2907 if( rc==SQLITE_OK ){
2908 rc = sqlitepager_write(pNew);
2909 if( rc==SQLITE_OK ){
2910 memcpy(pNew, pOvfl, SQLITE_PAGE_SIZE);
2911 *pPrev = SWAB32(pBtTo, new);
2912 if( pPrevPg ){
2913 sqlitepager_unref(pPrevPg);
2914 }
2915 pPrev = &pOvfl->iNext;
2916 pPrevPg = pNew;
2917 }
2918 }
2919 sqlitepager_unref(pOvfl);
2920 ovfl = nextOvfl;
2921 }
2922 if( pPrevPg ){
2923 sqlitepager_unref(pPrevPg);
2924 }
2925 return rc;
2926}
2927#endif
2928
2929
2930#if 0 /* UNTESTED */
2931/*
2932** Copy a page of data from one database over to another.
2933*/
2934static int copyDatabasePage(
2935 Btree *pBtFrom,
2936 Pgno pgnoFrom,
2937 Btree *pBtTo,
2938 Pgno *pTo
2939){
2940 MemPage *pPageFrom, *pPage;
2941 Pgno to;
2942 int rc;
2943 Cell *pCell;
2944 int idx;
2945
2946 rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom);
2947 if( rc ) return rc;
2948 rc = allocatePage(pBt, &pPage, pTo, 0);
2949 if( rc==SQLITE_OK ){
2950 rc = sqlitepager_write(pPage);
2951 }
2952 if( rc==SQLITE_OK ){
2953 memcpy(pPage, pPageFrom, SQLITE_PAGE_SIZE);
2954 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2955 while( idx>0 ){
2956 pCell = (Cell*)&pPage->u.aDisk[idx];
2957 idx = SWAB16(pBt, pCell->h.iNext);
2958 if( pCell->h.leftChild ){
2959 Pgno newChld;
2960 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild),
2961 pBtTo, &newChld);
2962 if( rc ) return rc;
2963 pCell->h.leftChild = SWAB32(pBtFrom, newChld);
2964 }
2965 rc = copyCell(pBtFrom, pBtTo, pCell);
2966 if( rc ) return rc;
2967 }
2968 if( pPage->u.hdr.rightChild ){
2969 Pgno newChld;
2970 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild),
2971 pBtTo, &newChld);
2972 if( rc ) return rc;
2973 pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild);
2974 }
2975 }
2976 sqlitepager_unref(pPage);
2977 return rc;
2978}
2979#endif
2980
drh8b2f49b2001-06-08 00:21:52 +00002981/*
2982** Read the meta-information out of a database file.
2983*/
drh144f9ea2003-04-16 01:28:16 +00002984static int fileBtreeGetMeta(Btree *pBt, int *aMeta){
drh8b2f49b2001-06-08 00:21:52 +00002985 PageOne *pP1;
2986 int rc;
drh0d316a42002-08-11 20:10:47 +00002987 int i;
drh8b2f49b2001-06-08 00:21:52 +00002988
drh8c42ca92001-06-22 19:15:00 +00002989 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00002990 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00002991 aMeta[0] = SWAB32(pBt, pP1->nFree);
2992 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
2993 aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]);
2994 }
drh8b2f49b2001-06-08 00:21:52 +00002995 sqlitepager_unref(pP1);
2996 return SQLITE_OK;
2997}
2998
2999/*
3000** Write meta-information back into the database.
3001*/
drh144f9ea2003-04-16 01:28:16 +00003002static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){
drh8b2f49b2001-06-08 00:21:52 +00003003 PageOne *pP1;
drh0d316a42002-08-11 20:10:47 +00003004 int rc, i;
drh8b2f49b2001-06-08 00:21:52 +00003005 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003006 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003007 }
drh8b2f49b2001-06-08 00:21:52 +00003008 pP1 = pBt->page1;
3009 rc = sqlitepager_write(pP1);
drh9adf9ac2002-05-15 11:44:13 +00003010 if( rc ) return rc;
drh0d316a42002-08-11 20:10:47 +00003011 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
3012 pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]);
3013 }
drh8b2f49b2001-06-08 00:21:52 +00003014 return SQLITE_OK;
3015}
drh8c42ca92001-06-22 19:15:00 +00003016
drh5eddca62001-06-30 21:53:53 +00003017/******************************************************************************
3018** The complete implementation of the BTree subsystem is above this line.
3019** All the code the follows is for testing and troubleshooting the BTree
3020** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003021******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003022
drh8c42ca92001-06-22 19:15:00 +00003023/*
3024** Print a disassembly of the given page on standard output. This routine
3025** is used for debugging and testing only.
3026*/
drhaaab5722002-02-19 13:39:21 +00003027#ifdef SQLITE_TEST
drh144f9ea2003-04-16 01:28:16 +00003028static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003029 int rc;
3030 MemPage *pPage;
3031 int i, j;
3032 int nFree;
3033 u16 idx;
3034 char range[20];
3035 unsigned char payload[20];
3036 rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage);
3037 if( rc ){
3038 return rc;
3039 }
drh6019e162001-07-02 17:51:45 +00003040 if( recursive ) printf("PAGE %d:\n", pgno);
drh8c42ca92001-06-22 19:15:00 +00003041 i = 0;
drh0d316a42002-08-11 20:10:47 +00003042 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
drh8c42ca92001-06-22 19:15:00 +00003043 while( idx>0 && idx<=SQLITE_PAGE_SIZE-MIN_CELL_SIZE ){
3044 Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
drh0d316a42002-08-11 20:10:47 +00003045 int sz = cellSize(pBt, pCell);
drh8c42ca92001-06-22 19:15:00 +00003046 sprintf(range,"%d..%d", idx, idx+sz-1);
drh0d316a42002-08-11 20:10:47 +00003047 sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
drh8c42ca92001-06-22 19:15:00 +00003048 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3049 memcpy(payload, pCell->aPayload, sz);
3050 for(j=0; j<sz; j++){
3051 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3052 }
3053 payload[sz] = 0;
3054 printf(
drh6019e162001-07-02 17:51:45 +00003055 "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n",
drh0d316a42002-08-11 20:10:47 +00003056 i, range, (int)pCell->h.leftChild,
3057 NKEY(pBt, pCell->h), NDATA(pBt, pCell->h),
drh2aa679f2001-06-25 02:11:07 +00003058 payload
drh8c42ca92001-06-22 19:15:00 +00003059 );
drh6019e162001-07-02 17:51:45 +00003060 if( pPage->isInit && pPage->apCell[i]!=pCell ){
drh2aa679f2001-06-25 02:11:07 +00003061 printf("**** apCell[%d] does not match on prior entry ****\n", i);
3062 }
drh7c717f72001-06-24 20:39:41 +00003063 i++;
drh0d316a42002-08-11 20:10:47 +00003064 idx = SWAB16(pBt, pCell->h.iNext);
drh8c42ca92001-06-22 19:15:00 +00003065 }
3066 if( idx!=0 ){
3067 printf("ERROR: next cell index out of range: %d\n", idx);
3068 }
drh0d316a42002-08-11 20:10:47 +00003069 printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild));
drh8c42ca92001-06-22 19:15:00 +00003070 nFree = 0;
3071 i = 0;
drh0d316a42002-08-11 20:10:47 +00003072 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
drh8c42ca92001-06-22 19:15:00 +00003073 while( idx>0 && idx<SQLITE_PAGE_SIZE ){
3074 FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx];
3075 sprintf(range,"%d..%d", idx, idx+p->iSize-1);
drh0d316a42002-08-11 20:10:47 +00003076 nFree += SWAB16(pBt, p->iSize);
drh8c42ca92001-06-22 19:15:00 +00003077 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh0d316a42002-08-11 20:10:47 +00003078 i, range, SWAB16(pBt, p->iSize), nFree);
3079 idx = SWAB16(pBt, p->iNext);
drh2aa679f2001-06-25 02:11:07 +00003080 i++;
drh8c42ca92001-06-22 19:15:00 +00003081 }
3082 if( idx!=0 ){
3083 printf("ERROR: next freeblock index out of range: %d\n", idx);
3084 }
drh6019e162001-07-02 17:51:45 +00003085 if( recursive && pPage->u.hdr.rightChild!=0 ){
drh0d316a42002-08-11 20:10:47 +00003086 idx = SWAB16(pBt, pPage->u.hdr.firstCell);
drh6019e162001-07-02 17:51:45 +00003087 while( idx>0 && idx<SQLITE_PAGE_SIZE-MIN_CELL_SIZE ){
3088 Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
drh144f9ea2003-04-16 01:28:16 +00003089 fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
drh0d316a42002-08-11 20:10:47 +00003090 idx = SWAB16(pBt, pCell->h.iNext);
drh6019e162001-07-02 17:51:45 +00003091 }
drh144f9ea2003-04-16 01:28:16 +00003092 fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
drh6019e162001-07-02 17:51:45 +00003093 }
drh8c42ca92001-06-22 19:15:00 +00003094 sqlitepager_unref(pPage);
3095 return SQLITE_OK;
3096}
drhaaab5722002-02-19 13:39:21 +00003097#endif
drh8c42ca92001-06-22 19:15:00 +00003098
drhaaab5722002-02-19 13:39:21 +00003099#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003100/*
drh2aa679f2001-06-25 02:11:07 +00003101** Fill aResult[] with information about the entry and page that the
3102** cursor is pointing to.
3103**
3104** aResult[0] = The page number
3105** aResult[1] = The entry number
3106** aResult[2] = Total number of entries on this page
3107** aResult[3] = Size of this entry
3108** aResult[4] = Number of free bytes on this page
3109** aResult[5] = Number of free blocks on the page
3110** aResult[6] = Page number of the left child of this entry
3111** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003112**
3113** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003114*/
drh144f9ea2003-04-16 01:28:16 +00003115static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003116 int cnt, idx;
3117 MemPage *pPage = pCur->pPage;
drh0d316a42002-08-11 20:10:47 +00003118 Btree *pBt = pCur->pBt;
drh2aa679f2001-06-25 02:11:07 +00003119 aResult[0] = sqlitepager_pagenumber(pPage);
drh8c42ca92001-06-22 19:15:00 +00003120 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003121 aResult[2] = pPage->nCell;
3122 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh0d316a42002-08-11 20:10:47 +00003123 aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]);
3124 aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild);
drh2aa679f2001-06-25 02:11:07 +00003125 }else{
3126 aResult[3] = 0;
3127 aResult[6] = 0;
3128 }
3129 aResult[4] = pPage->nFree;
3130 cnt = 0;
drh0d316a42002-08-11 20:10:47 +00003131 idx = SWAB16(pBt, pPage->u.hdr.firstFree);
drh2aa679f2001-06-25 02:11:07 +00003132 while( idx>0 && idx<SQLITE_PAGE_SIZE ){
3133 cnt++;
drh0d316a42002-08-11 20:10:47 +00003134 idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext);
drh2aa679f2001-06-25 02:11:07 +00003135 }
3136 aResult[5] = cnt;
drh0d316a42002-08-11 20:10:47 +00003137 aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild);
drh8c42ca92001-06-22 19:15:00 +00003138 return SQLITE_OK;
3139}
drhaaab5722002-02-19 13:39:21 +00003140#endif
drhdd793422001-06-28 01:54:48 +00003141
drhaaab5722002-02-19 13:39:21 +00003142#ifdef SQLITE_TEST
drhdd793422001-06-28 01:54:48 +00003143/*
drh5eddca62001-06-30 21:53:53 +00003144** Return the pager associated with a BTree. This routine is used for
3145** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003146*/
drh144f9ea2003-04-16 01:28:16 +00003147static Pager *fileBtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003148 return pBt->pPager;
3149}
drhaaab5722002-02-19 13:39:21 +00003150#endif
drh5eddca62001-06-30 21:53:53 +00003151
3152/*
3153** This structure is passed around through all the sanity checking routines
3154** in order to keep track of some global state information.
3155*/
drhaaab5722002-02-19 13:39:21 +00003156typedef struct IntegrityCk IntegrityCk;
3157struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003158 Btree *pBt; /* The tree being checked out */
3159 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3160 int nPage; /* Number of pages in the database */
3161 int *anRef; /* Number of times each page is referenced */
3162 int nTreePage; /* Number of BTree pages */
3163 int nByte; /* Number of bytes of data stored on BTree pages */
3164 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003165};
3166
3167/*
3168** Append a message to the error message string.
3169*/
drhaaab5722002-02-19 13:39:21 +00003170static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003171 if( pCheck->zErrMsg ){
3172 char *zOld = pCheck->zErrMsg;
3173 pCheck->zErrMsg = 0;
drh41743982003-12-06 21:43:55 +00003174 sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003175 sqliteFree(zOld);
3176 }else{
drh41743982003-12-06 21:43:55 +00003177 sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003178 }
3179}
3180
3181/*
3182** Add 1 to the reference count for page iPage. If this is the second
3183** reference to the page, add an error message to pCheck->zErrMsg.
3184** Return 1 if there are 2 ore more references to the page and 0 if
3185** if this is the first reference to the page.
3186**
3187** Also check that the page number is in bounds.
3188*/
drhaaab5722002-02-19 13:39:21 +00003189static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003190 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003191 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003192 char zBuf[100];
3193 sprintf(zBuf, "invalid page number %d", iPage);
3194 checkAppendMsg(pCheck, zContext, zBuf);
3195 return 1;
3196 }
3197 if( pCheck->anRef[iPage]==1 ){
3198 char zBuf[100];
3199 sprintf(zBuf, "2nd reference to page %d", iPage);
3200 checkAppendMsg(pCheck, zContext, zBuf);
3201 return 1;
3202 }
3203 return (pCheck->anRef[iPage]++)>1;
3204}
3205
3206/*
3207** Check the integrity of the freelist or of an overflow page list.
3208** Verify that the number of pages on the list is N.
3209*/
drh30e58752002-03-02 20:41:57 +00003210static void checkList(
3211 IntegrityCk *pCheck, /* Integrity checking context */
3212 int isFreeList, /* True for a freelist. False for overflow page list */
3213 int iPage, /* Page number for first page in the list */
3214 int N, /* Expected number of pages in the list */
3215 char *zContext /* Context for error messages */
3216){
3217 int i;
drh5eddca62001-06-30 21:53:53 +00003218 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003219 while( N-- > 0 ){
drh5eddca62001-06-30 21:53:53 +00003220 OverflowPage *pOvfl;
3221 if( iPage<1 ){
3222 sprintf(zMsg, "%d pages missing from overflow list", N+1);
3223 checkAppendMsg(pCheck, zContext, zMsg);
3224 break;
3225 }
3226 if( checkRef(pCheck, iPage, zContext) ) break;
3227 if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3228 sprintf(zMsg, "failed to get page %d", iPage);
3229 checkAppendMsg(pCheck, zContext, zMsg);
3230 break;
3231 }
drh30e58752002-03-02 20:41:57 +00003232 if( isFreeList ){
3233 FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload;
drh0d316a42002-08-11 20:10:47 +00003234 int n = SWAB32(pCheck->pBt, pInfo->nFree);
3235 for(i=0; i<n; i++){
drh8bf8dc92003-05-17 17:35:10 +00003236 checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext);
drh30e58752002-03-02 20:41:57 +00003237 }
drh0d316a42002-08-11 20:10:47 +00003238 N -= n;
drh30e58752002-03-02 20:41:57 +00003239 }
drh0d316a42002-08-11 20:10:47 +00003240 iPage = SWAB32(pCheck->pBt, pOvfl->iNext);
drh5eddca62001-06-30 21:53:53 +00003241 sqlitepager_unref(pOvfl);
3242 }
3243}
3244
3245/*
drh1bffb9c2002-02-03 17:37:36 +00003246** Return negative if zKey1<zKey2.
3247** Return zero if zKey1==zKey2.
3248** Return positive if zKey1>zKey2.
3249*/
3250static int keyCompare(
3251 const char *zKey1, int nKey1,
3252 const char *zKey2, int nKey2
3253){
3254 int min = nKey1>nKey2 ? nKey2 : nKey1;
3255 int c = memcmp(zKey1, zKey2, min);
3256 if( c==0 ){
3257 c = nKey1 - nKey2;
3258 }
3259 return c;
3260}
3261
3262/*
drh5eddca62001-06-30 21:53:53 +00003263** Do various sanity checks on a single page of a tree. Return
3264** the tree depth. Root pages return 0. Parents of root pages
3265** return 1, and so forth.
3266**
3267** These checks are done:
3268**
3269** 1. Make sure that cells and freeblocks do not overlap
3270** but combine to completely cover the page.
3271** 2. Make sure cell keys are in order.
3272** 3. Make sure no key is less than or equal to zLowerBound.
3273** 4. Make sure no key is greater than or equal to zUpperBound.
3274** 5. Check the integrity of overflow pages.
3275** 6. Recursively call checkTreePage on all children.
3276** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003277** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003278** the root of the tree.
3279*/
3280static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003281 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003282 int iPage, /* Page number of the page to check */
3283 MemPage *pParent, /* Parent page */
3284 char *zParentContext, /* Parent context */
3285 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003286 int nLower, /* Number of characters in zLowerBound */
3287 char *zUpperBound, /* All keys should be less than this, if not NULL */
3288 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003289){
3290 MemPage *pPage;
3291 int i, rc, depth, d2, pgno;
3292 char *zKey1, *zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003293 int nKey1, nKey2;
drh5eddca62001-06-30 21:53:53 +00003294 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003295 Btree *pBt;
drh5eddca62001-06-30 21:53:53 +00003296 char zMsg[100];
3297 char zContext[100];
3298 char hit[SQLITE_PAGE_SIZE];
3299
3300 /* Check that the page exists
3301 */
drh0d316a42002-08-11 20:10:47 +00003302 cur.pBt = pBt = pCheck->pBt;
drh5eddca62001-06-30 21:53:53 +00003303 if( iPage==0 ) return 0;
3304 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3305 sprintf(zContext, "On tree page %d: ", iPage);
3306 if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){
3307 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3308 checkAppendMsg(pCheck, zContext, zMsg);
3309 return 0;
3310 }
drh0d316a42002-08-11 20:10:47 +00003311 if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003312 sprintf(zMsg, "initPage() returns error code %d", rc);
3313 checkAppendMsg(pCheck, zContext, zMsg);
3314 sqlitepager_unref(pPage);
3315 return 0;
3316 }
3317
3318 /* Check out all the cells.
3319 */
3320 depth = 0;
drh1bffb9c2002-02-03 17:37:36 +00003321 if( zLowerBound ){
3322 zKey1 = sqliteMalloc( nLower+1 );
3323 memcpy(zKey1, zLowerBound, nLower);
3324 zKey1[nLower] = 0;
3325 }else{
3326 zKey1 = 0;
3327 }
3328 nKey1 = nLower;
drh5eddca62001-06-30 21:53:53 +00003329 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003330 for(i=0; i<pPage->nCell; i++){
3331 Cell *pCell = pPage->apCell[i];
3332 int sz;
3333
3334 /* Check payload overflow pages
3335 */
drh0d316a42002-08-11 20:10:47 +00003336 nKey2 = NKEY(pBt, pCell->h);
3337 sz = nKey2 + NDATA(pBt, pCell->h);
drh5eddca62001-06-30 21:53:53 +00003338 sprintf(zContext, "On page %d cell %d: ", iPage, i);
3339 if( sz>MX_LOCAL_PAYLOAD ){
3340 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
drh0d316a42002-08-11 20:10:47 +00003341 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00003342 }
3343
3344 /* Check that keys are in the right order
3345 */
3346 cur.idx = i;
drh8c1238a2003-01-02 14:43:55 +00003347 zKey2 = sqliteMallocRaw( nKey2+1 );
drh1bffb9c2002-02-03 17:37:36 +00003348 getPayload(&cur, 0, nKey2, zKey2);
3349 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
drh5eddca62001-06-30 21:53:53 +00003350 checkAppendMsg(pCheck, zContext, "Key is out of order");
3351 }
3352
3353 /* Check sanity of left child page.
3354 */
drh0d316a42002-08-11 20:10:47 +00003355 pgno = SWAB32(pBt, pCell->h.leftChild);
drh1bffb9c2002-02-03 17:37:36 +00003356 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
drh5eddca62001-06-30 21:53:53 +00003357 if( i>0 && d2!=depth ){
3358 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3359 }
3360 depth = d2;
3361 sqliteFree(zKey1);
3362 zKey1 = zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003363 nKey1 = nKey2;
drh5eddca62001-06-30 21:53:53 +00003364 }
drh0d316a42002-08-11 20:10:47 +00003365 pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
drh5eddca62001-06-30 21:53:53 +00003366 sprintf(zContext, "On page %d at right child: ", iPage);
drh1bffb9c2002-02-03 17:37:36 +00003367 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
drh5eddca62001-06-30 21:53:53 +00003368 sqliteFree(zKey1);
3369
3370 /* Check for complete coverage of the page
3371 */
3372 memset(hit, 0, sizeof(hit));
3373 memset(hit, 1, sizeof(PageHdr));
drh0d316a42002-08-11 20:10:47 +00003374 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_PAGE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003375 Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3376 int j;
drh0d316a42002-08-11 20:10:47 +00003377 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3378 i = SWAB16(pBt, pCell->h.iNext);
drh5eddca62001-06-30 21:53:53 +00003379 }
drh0d316a42002-08-11 20:10:47 +00003380 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_PAGE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003381 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3382 int j;
drh0d316a42002-08-11 20:10:47 +00003383 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3384 i = SWAB16(pBt,pFBlk->iNext);
drh5eddca62001-06-30 21:53:53 +00003385 }
3386 for(i=0; i<SQLITE_PAGE_SIZE; i++){
3387 if( hit[i]==0 ){
3388 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3389 checkAppendMsg(pCheck, zMsg, 0);
3390 break;
3391 }else if( hit[i]>1 ){
3392 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3393 checkAppendMsg(pCheck, zMsg, 0);
3394 break;
3395 }
3396 }
3397
3398 /* Check that free space is kept to a minimum
3399 */
drh6019e162001-07-02 17:51:45 +00003400#if 0
3401 if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_PAGE_SIZE/4 ){
drh5eddca62001-06-30 21:53:53 +00003402 sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree,
3403 SQLITE_PAGE_SIZE/3);
3404 checkAppendMsg(pCheck, zContext, zMsg);
3405 }
drh6019e162001-07-02 17:51:45 +00003406#endif
3407
3408 /* Update freespace totals.
3409 */
3410 pCheck->nTreePage++;
3411 pCheck->nByte += USABLE_SPACE - pPage->nFree;
drh5eddca62001-06-30 21:53:53 +00003412
3413 sqlitepager_unref(pPage);
3414 return depth;
3415}
3416
3417/*
3418** This routine does a complete check of the given BTree file. aRoot[] is
3419** an array of pages numbers were each page number is the root page of
3420** a table. nRoot is the number of entries in aRoot.
3421**
3422** If everything checks out, this routine returns NULL. If something is
3423** amiss, an error message is written into memory obtained from malloc()
3424** and a pointer to that error message is returned. The calling function
3425** is responsible for freeing the error message when it is done.
3426*/
drh144f9ea2003-04-16 01:28:16 +00003427char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003428 int i;
3429 int nRef;
drhaaab5722002-02-19 13:39:21 +00003430 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003431
3432 nRef = *sqlitepager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003433 if( lockBtree(pBt)!=SQLITE_OK ){
3434 return sqliteStrDup("Unable to acquire a read lock on the database");
3435 }
drh5eddca62001-06-30 21:53:53 +00003436 sCheck.pBt = pBt;
3437 sCheck.pPager = pBt->pPager;
3438 sCheck.nPage = sqlitepager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003439 if( sCheck.nPage==0 ){
3440 unlockBtreeIfUnused(pBt);
3441 return 0;
3442 }
drh8c1238a2003-01-02 14:43:55 +00003443 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drh5eddca62001-06-30 21:53:53 +00003444 sCheck.anRef[1] = 1;
3445 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3446 sCheck.zErrMsg = 0;
3447
3448 /* Check the integrity of the freelist
3449 */
drh0d316a42002-08-11 20:10:47 +00003450 checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList),
3451 SWAB32(pBt, pBt->page1->nFree), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003452
3453 /* Check all the tables.
3454 */
3455 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00003456 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00003457 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00003458 }
3459
3460 /* Make sure every page in the file is referenced
3461 */
3462 for(i=1; i<=sCheck.nPage; i++){
3463 if( sCheck.anRef[i]==0 ){
3464 char zBuf[100];
3465 sprintf(zBuf, "Page %d is never used", i);
3466 checkAppendMsg(&sCheck, zBuf, 0);
3467 }
3468 }
3469
3470 /* Make sure this analysis did not leave any unref() pages
3471 */
drh5e00f6c2001-09-13 13:46:56 +00003472 unlockBtreeIfUnused(pBt);
drh5eddca62001-06-30 21:53:53 +00003473 if( nRef != *sqlitepager_stats(pBt->pPager) ){
3474 char zBuf[100];
3475 sprintf(zBuf,
3476 "Outstanding page count goes from %d to %d during this analysis",
3477 nRef, *sqlitepager_stats(pBt->pPager)
3478 );
3479 checkAppendMsg(&sCheck, zBuf, 0);
3480 }
3481
3482 /* Clean up and report errors.
3483 */
3484 sqliteFree(sCheck.anRef);
3485 return sCheck.zErrMsg;
3486}
paulb95a8862003-04-01 21:16:41 +00003487
drh73509ee2003-04-06 20:44:45 +00003488/*
3489** Return the full pathname of the underlying database file.
3490*/
drh144f9ea2003-04-16 01:28:16 +00003491static const char *fileBtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00003492 assert( pBt->pPager!=0 );
3493 return sqlitepager_filename(pBt->pPager);
3494}
3495
3496/*
drhf7c57532003-04-25 13:22:51 +00003497** Copy the complete content of pBtFrom into pBtTo. A transaction
3498** must be active for both files.
3499**
3500** The size of file pBtFrom may be reduced by this operation.
3501** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00003502*/
drhf7c57532003-04-25 13:22:51 +00003503static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
3504 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00003505 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00003506
3507 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
3508 if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR;
3509 if( pBtTo->pCursor ) return SQLITE_BUSY;
3510 memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_PAGE_SIZE);
drh2e6d11b2003-04-25 15:37:57 +00003511 rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1);
3512 nToPage = sqlitepager_pagecount(pBtTo->pPager);
drhf7c57532003-04-25 13:22:51 +00003513 nPage = sqlitepager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00003514 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00003515 void *pPage;
3516 rc = sqlitepager_get(pBtFrom->pPager, i, &pPage);
3517 if( rc ) break;
drh2e6d11b2003-04-25 15:37:57 +00003518 rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage);
3519 if( rc ) break;
drhf7c57532003-04-25 13:22:51 +00003520 sqlitepager_unref(pPage);
3521 }
drh2e6d11b2003-04-25 15:37:57 +00003522 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3523 void *pPage;
3524 rc = sqlitepager_get(pBtTo->pPager, i, &pPage);
3525 if( rc ) break;
3526 rc = sqlitepager_write(pPage);
3527 sqlitepager_unref(pPage);
3528 sqlitepager_dont_write(pBtTo->pPager, i);
3529 }
3530 if( !rc && nPage<nToPage ){
3531 rc = sqlitepager_truncate(pBtTo->pPager, nPage);
3532 }
drhf7c57532003-04-25 13:22:51 +00003533 if( rc ){
3534 fileBtreeRollback(pBtTo);
3535 }
3536 return rc;
drh73509ee2003-04-06 20:44:45 +00003537}
3538
3539/*
3540** The following tables contain pointers to all of the interface
3541** routines for this implementation of the B*Tree backend. To
3542** substitute a different implemention of the backend, one has merely
3543** to provide pointers to alternative functions in similar tables.
3544*/
paulb95a8862003-04-01 21:16:41 +00003545static BtOps sqliteBtreeOps = {
drh144f9ea2003-04-16 01:28:16 +00003546 fileBtreeClose,
3547 fileBtreeSetCacheSize,
3548 fileBtreeSetSafetyLevel,
3549 fileBtreeBeginTrans,
3550 fileBtreeCommit,
3551 fileBtreeRollback,
3552 fileBtreeBeginCkpt,
3553 fileBtreeCommitCkpt,
3554 fileBtreeRollbackCkpt,
3555 fileBtreeCreateTable,
3556 fileBtreeCreateTable, /* Really sqliteBtreeCreateIndex() */
3557 fileBtreeDropTable,
3558 fileBtreeClearTable,
3559 fileBtreeCursor,
3560 fileBtreeGetMeta,
3561 fileBtreeUpdateMeta,
3562 fileBtreeIntegrityCheck,
3563 fileBtreeGetFilename,
drhf7c57532003-04-25 13:22:51 +00003564 fileBtreeCopyFile,
paulb95a8862003-04-01 21:16:41 +00003565#ifdef SQLITE_TEST
drh144f9ea2003-04-16 01:28:16 +00003566 fileBtreePageDump,
3567 fileBtreePager
paulb95a8862003-04-01 21:16:41 +00003568#endif
3569};
paulb95a8862003-04-01 21:16:41 +00003570static BtCursorOps sqliteBtreeCursorOps = {
drh144f9ea2003-04-16 01:28:16 +00003571 fileBtreeMoveto,
3572 fileBtreeDelete,
3573 fileBtreeInsert,
3574 fileBtreeFirst,
3575 fileBtreeLast,
3576 fileBtreeNext,
3577 fileBtreePrevious,
3578 fileBtreeKeySize,
3579 fileBtreeKey,
3580 fileBtreeKeyCompare,
3581 fileBtreeDataSize,
3582 fileBtreeData,
3583 fileBtreeCloseCursor,
paulb95a8862003-04-01 21:16:41 +00003584#ifdef SQLITE_TEST
drh144f9ea2003-04-16 01:28:16 +00003585 fileBtreeCursorDump,
paulb95a8862003-04-01 21:16:41 +00003586#endif
3587};