blob: e3febe981855fabe00b88ed9ea9224728ef273ad [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
danielk1977aef0bf62005-12-30 16:28:01 +000012** $Id: btree.c,v 1.277 2005/12/30 16:28:02 danielk1977 Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
** This file implements an external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
drh271efa52004-05-30 19:19:05 +000056** page contain a special header (the "file header") that describes the file.
57** The format of the file header is as follows:
drh9e572e62004-04-23 23:43:10 +000058**
59** OFFSET SIZE DESCRIPTION
drhde647132004-05-07 17:57:49 +000060** 0 16 Header string: "SQLite format 3\000"
drh9e572e62004-04-23 23:43:10 +000061** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
drh6f11bef2004-05-13 01:12:56 +000064** 20 1 Bytes of unused space at the end of each page
65** 21 1 Max embedded payload fraction
66** 22 1 Min embedded payload fraction
67** 23 1 Min leaf payload fraction
68** 24 4 File change counter
69** 28 4 Reserved for future use
drh9e572e62004-04-23 23:43:10 +000070** 32 4 First freelist page
71** 36 4 Number of freelist pages in the file
72** 40 60 15 4-byte meta values passed to higher layers
73**
74** All of the integer values are big-endian (most significant byte first).
drh6f11bef2004-05-13 01:12:56 +000075**
drhab01f612004-05-22 02:55:23 +000076** The file change counter is incremented when the database is changed more
drh6f11bef2004-05-13 01:12:56 +000077** than once within the same second. This counter, together with the
78** modification time of the file, allows other processes to know
79** when the file has changed and thus when they need to flush their
80** cache.
81**
82** The max embedded payload fraction is the amount of the total usable
83** space in a page that can be consumed by a single cell for standard
84** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
85** is to limit the maximum cell size so that at least 4 cells will fit
drhab01f612004-05-22 02:55:23 +000086** on one page. Thus the default max embedded payload fraction is 64.
drh6f11bef2004-05-13 01:12:56 +000087**
88** If the payload for a cell is larger than the max payload, then extra
89** payload is spilled to overflow pages. Once an overflow page is allocated,
90** as many bytes as possible are moved into the overflow pages without letting
91** the cell size drop below the min embedded payload fraction.
92**
93** The min leaf payload fraction is like the min embedded payload fraction
94** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it is
** not specified in the header.
drh9e572e62004-04-23 23:43:10 +000097**
** Each btree page is divided into three sections: The header, the
** cell pointer array, and the cell content area. Page 1 also has a 100-byte
** file header that occurs before the page header.
101**
102** |----------------|
103** | file header | 100 bytes. Page 1 only.
104** |----------------|
105** | page header | 8 bytes for leaves. 12 bytes for interior nodes
106** |----------------|
107** | cell pointer | | 2 bytes per cell. Sorted order.
108** | array | | Grows downward
109** | | v
110** |----------------|
111** | unallocated |
112** | space |
113** |----------------| ^ Grows upwards
114** | cell content | | Arbitrary order interspersed with freeblocks.
115** | area | | and free space fragments.
116** |----------------|
drh43605152004-05-29 21:46:49 +0000117**
** The page header looks like this:
drh9e572e62004-04-23 23:43:10 +0000119**
120** OFFSET SIZE DESCRIPTION
drh6f11bef2004-05-13 01:12:56 +0000121** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
drh9e572e62004-04-23 23:43:10 +0000122** 1 2 byte offset to the first freeblock
drh43605152004-05-29 21:46:49 +0000123** 3 2 number of cells on this page
drh271efa52004-05-30 19:19:05 +0000124** 5 2 first byte of the cell content area
drh43605152004-05-29 21:46:49 +0000125** 7 1 number of fragmented free bytes
drh271efa52004-05-30 19:19:05 +0000126** 8 4 Right child (the Ptr(N+1) value). Omitted on leaves.
drh9e572e62004-04-23 23:43:10 +0000127**
128** The flags define the format of this btree page. The leaf flag means that
129** this page has no children. The zerodata flag means that this page carries
** only keys and no data. The intkey flag means that the key is an integer
131** which is stored in the key size entry of the cell header rather than in
132** the payload area.
drh9e572e62004-04-23 23:43:10 +0000133**
drh43605152004-05-29 21:46:49 +0000134** The cell pointer array begins on the first byte after the page header.
135** The cell pointer array contains zero or more 2-byte numbers which are
136** offsets from the beginning of the page to the cell content in the cell
137** content area. The cell pointers occur in sorted order. The system strives
138** to keep free space after the last cell pointer so that new cells can
drh44f87bd2004-09-27 13:19:51 +0000139** be easily added without having to defragment the page.
drh43605152004-05-29 21:46:49 +0000140**
141** Cell content is stored at the very end of the page and grows toward the
142** beginning of the page.
143**
144** Unused space within the cell content area is collected into a linked list of
145** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
146** to the first freeblock is given in the header. Freeblocks occur in
147** increasing order. Because a freeblock must be at least 4 bytes in size,
148** any group of 3 or fewer unused bytes in the cell content area cannot
149** exist on the freeblock chain. A group of 3 or fewer free bytes is called
** a fragment. The total number of bytes in all fragments is recorded
** in the page header at offset 7.
152**
153** SIZE DESCRIPTION
154** 2 Byte offset of the next freeblock
155** 2 Bytes in this freeblock
156**
157** Cells are of variable length. Cells are stored in the cell content area at
158** the end of the page. Pointers to the cells are in the cell pointer array
** that immediately follows the page header. Cells are not necessarily
160** contiguous or in order, but cell pointers are contiguous and in order.
161**
162** Cell content makes use of variable length integers. A variable
163** length integer is 1 to 9 bytes where the lower 7 bits of each
drh9e572e62004-04-23 23:43:10 +0000164** byte are used. The integer consists of all bytes that have bit 8 set and
drh6f11bef2004-05-13 01:12:56 +0000165** the first byte with bit 8 clear. The most significant byte of the integer
drhab01f612004-05-22 02:55:23 +0000166** appears first. A variable-length integer may not be more than 9 bytes long.
** As a special case, all 8 bits of the 9th byte are used as data. This
168** allows a 64-bit integer to be encoded in 9 bytes.
drh9e572e62004-04-23 23:43:10 +0000169**
170** 0x00 becomes 0x00000000
drh6f11bef2004-05-13 01:12:56 +0000171** 0x7f becomes 0x0000007f
172** 0x81 0x00 becomes 0x00000080
173** 0x82 0x00 becomes 0x00000100
174** 0x80 0x7f becomes 0x0000007f
** 0x81 0x91 0xd1 0xac 0x78 becomes 0x12345678
drh9e572e62004-04-23 23:43:10 +0000176** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
177**
178** Variable length integers are used for rowids and to hold the number of
179** bytes of key and data in a btree cell.
180**
drh43605152004-05-29 21:46:49 +0000181** The content of a cell looks like this:
drh9e572e62004-04-23 23:43:10 +0000182**
183** SIZE DESCRIPTION
drh3aac2dd2004-04-26 14:10:20 +0000184** 4 Page number of the left child. Omitted if leaf flag is set.
185** var Number of bytes of data. Omitted if the zerodata flag is set.
186** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000187** * Payload
188** 4 First page of the overflow chain. Omitted if no overflow
189**
190** Overflow pages form a linked list. Each page except the last is completely
191** filled with data (pagesize - 4 bytes). The last page can have as little
192** as 1 byte of data.
193**
194** SIZE DESCRIPTION
195** 4 Page number of next overflow page
196** * Data
197**
198** Freelist pages come in two subtypes: trunk pages and leaf pages. The
** file header points to the first in a linked list of trunk pages. Each trunk
200** page points to multiple leaf pages. The content of a leaf page is
201** unspecified. A trunk page looks like this:
202**
203** SIZE DESCRIPTION
204** 4 Page number of next trunk page
205** 4 Number of leaf pointers on this page
** * zero or more page numbers of leaves
drha059ad02001-04-17 20:09:11 +0000207*/
208#include "sqliteInt.h"
209#include "pager.h"
210#include "btree.h"
drh1f595712004-06-15 01:40:29 +0000211#include "os.h"
drha059ad02001-04-17 20:09:11 +0000212#include <assert.h>
213
/* Round x up to the nearest multiple of 8 that is not less than x.
** A value that is already a multiple of 8 is returned unchanged.
** This is used to force 8-byte alignment on 64-bit architectures.
**
** The argument is parenthesized inside the expansion so that an
** expression argument such as ROUND8(a<<1) or ROUND8(c ? x : y) is
** rounded as a whole. The argument is evaluated exactly once.
*/
#define ROUND8(x)   (((x)+7)&~7)
218
219
/* The following value is the maximum size of a cell on a page of the
** database pBt: the total page size less 8 bytes.
** NOTE(review): the 8 presumably corresponds to the smallest (leaf)
** page header - confirm.
**
** pBt may be any expression that yields a pointer to a structure with
** a pageSize member; it is parenthesized inside the expansion so that
** arguments such as &bt work as expected.
*/
#define MX_CELL_SIZE(pBt)  ((pBt)->pageSize-8)
drh4b70f112004-05-02 21:12:19 +0000224
/* The maximum number of cells on a single page of the database. This
** assumes a minimum cell size of 3 bytes. Such small cells will be
** exceedingly rare, but they are possible.
**
** The result is (pageSize-8)/3 using integer division. pBt is
** parenthesized inside the expansion so that any pointer-valued
** expression (for example &bt) may be passed.
*/
#define MX_CELL(pBt)  (((pBt)->pageSize-8)/3)
drh4b70f112004-05-02 21:12:19 +0000230
/* Forward declarations of structure types whose full definitions
** appear further down in this file.
*/
typedef struct BtLock BtLock;
typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000234
/*
** Every SQLite database file starts with the magic string defined
** here. Its presence identifies the file as a real database.
**
** A different string may be selected at compile-time by passing
** -DSQLITE_FILE_HEADER="..." on the compiler command-line. Including
** its zero-terminator the header must occupy exactly 16 bytes, which
** means the text itself must be 15 characters long. Be aware that a
** library built with a custom header cannot read databases produced by
** the standard tools, and the standard tools cannot read databases
** produced by such a library.
*/
#if !defined(SQLITE_FILE_HEADER)   /* 123456789 123456 */
#  define SQLITE_FILE_HEADER          "SQLite format 3"
#endif
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
drh08ed44e2001-04-29 23:32:55 +0000251
/*
** Page type flags. The first byte of every BTree page holds an ORed
** combination of the following bits.
*/
#define PTF_INTKEY    0x01  /* Key is an integer held in the key-size field */
#define PTF_ZERODATA  0x02  /* Page carries only keys and no data */
#define PTF_LEAFDATA  0x04  /* Data is stored on leaves only */
#define PTF_LEAF      0x08  /* Page has no children */
drh8c42ca92001-06-22 19:15:00 +0000260
261/*
drh9e572e62004-04-23 23:43:10 +0000262** As each page of the file is loaded into memory, an instance of the following
263** structure is appended and initialized to zero. This structure stores
264** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000265**
drh72f82862001-05-24 21:06:34 +0000266** The pParent field points back to the parent page. This allows us to
267** walk up the BTree from any leaf to the root. Care must be taken to
268** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000269** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000270*/
271struct MemPage {
drha6abd042004-06-09 17:37:22 +0000272 u8 isInit; /* True if previously initialized. MUST BE FIRST! */
drh43605152004-05-29 21:46:49 +0000273 u8 idxShift; /* True if Cell indices have changed */
274 u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
275 u8 intKey; /* True if intkey flag is set */
276 u8 leaf; /* True if leaf flag is set */
277 u8 zeroData; /* True if table stores keys only */
278 u8 leafData; /* True if tables stores data on leaves only */
279 u8 hasData; /* True if this page stores data */
280 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
drh271efa52004-05-30 19:19:05 +0000281 u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */
drha2fce642004-06-05 00:01:44 +0000282 u16 maxLocal; /* Copy of Btree.maxLocal or Btree.maxLeaf */
283 u16 minLocal; /* Copy of Btree.minLocal or Btree.minLeaf */
drh43605152004-05-29 21:46:49 +0000284 u16 cellOffset; /* Index in aData of first cell pointer */
285 u16 idxParent; /* Index in parent of this node */
286 u16 nFree; /* Number of free bytes on the page */
287 u16 nCell; /* Number of cells on this page, local and ovfl */
288 struct _OvflCell { /* Cells that will not fit on aData[] */
danielk1977aef0bf62005-12-30 16:28:01 +0000289 u8 *pCell; /* Pointers to the body of the overflow cell */
290 u16 idx; /* Insert this cell before idx-th non-overflow cell */
drha2fce642004-06-05 00:01:44 +0000291 } aOvfl[5];
danielk1977aef0bf62005-12-30 16:28:01 +0000292 BtShared *pBt; /* Pointer back to BTree structure */
293 u8 *aData; /* Pointer back to the start of the page */
294 Pgno pgno; /* Page number for this page */
295 MemPage *pParent; /* The parent of this page. NULL for root */
drh8c42ca92001-06-22 19:15:00 +0000296};
drh7e3b0a02001-04-28 16:52:40 +0000297
/*
** Auxiliary information is appended to the end of the in-memory image
** of each disk page. EXTRA_SIZE is the amount of space, in bytes,
** required for that extra information: the size of one MemPage.
*/
#define EXTRA_SIZE   sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000304
danielk1977aef0bf62005-12-30 16:28:01 +0000305/* Btree handle */
306struct Btree {
307 sqlite3 *pSqlite;
308 BtShared *pBt;
309 u8 inTrans; /* TRANS_NONE, TRANS_READ or TRANS_WRITE */
310};
311
/*
** The values that Btree.inTrans may hold.
**
** When the shared-data extension is enabled the btree may have more
** than one user. Any number of them may have active read transactions
** but at most one may open a write transaction.
** NOTE(review): this comment previously said that "Btree.pDb" points to
** the handle owning the current write-transaction; struct Btree as
** declared in this file has no such member - confirm where the writer
** is recorded.
*/
#define TRANS_NONE   0   /* No transaction is open */
#define TRANS_READ   1   /* A read transaction is open */
#define TRANS_WRITE  2   /* A write transaction is open */
323
drh3b7511c2001-05-26 13:15:44 +0000324/*
drha059ad02001-04-17 20:09:11 +0000325** Everything we need to know about an open database
326*/
danielk1977aef0bf62005-12-30 16:28:01 +0000327struct BtShared {
drha059ad02001-04-17 20:09:11 +0000328 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000329 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000330 MemPage *pPage1; /* First page of the database */
drhab01f612004-05-22 02:55:23 +0000331 u8 inStmt; /* True if we are in a statement subtransaction */
drh5df72a52002-06-06 23:16:05 +0000332 u8 readOnly; /* True if the underlying file is readonly */
drhab01f612004-05-22 02:55:23 +0000333 u8 maxEmbedFrac; /* Maximum payload as % of total page size */
334 u8 minEmbedFrac; /* Minimum payload as % of total page size */
335 u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
drh90f5ecb2004-07-22 01:19:35 +0000336 u8 pageSizeFixed; /* True if the page size can no longer be changed */
drh057cd3a2005-02-15 16:23:02 +0000337#ifndef SQLITE_OMIT_AUTOVACUUM
338 u8 autoVacuum; /* True if database supports auto-vacuum */
339#endif
drha2fce642004-06-05 00:01:44 +0000340 u16 pageSize; /* Total number of bytes on a page */
341 u16 usableSize; /* Number of usable bytes on each page */
drh6f11bef2004-05-13 01:12:56 +0000342 int maxLocal; /* Maximum local payload in non-LEAFDATA tables */
343 int minLocal; /* Minimum local payload in non-LEAFDATA tables */
344 int maxLeaf; /* Maximum local payload in a LEAFDATA table */
345 int minLeaf; /* Minimum local payload in a LEAFDATA table */
drhb8ef32c2005-03-14 02:01:49 +0000346 BusyHandler *pBusyHandler; /* Callback for when there is lock contention */
danielk1977aef0bf62005-12-30 16:28:01 +0000347 u8 inTransaction; /* Transaction state */
348 BtShared *pNext; /* Next in SqliteTsd.pBtree linked list */
349 int nRef; /* Number of references to this structure */
350 int nTransaction; /* Number of open transactions (read + write) */
351 BtLock *pLock; /* List of locks held on this shared-btree struct */
drha059ad02001-04-17 20:09:11 +0000352};
danielk1977ee5741e2004-05-31 10:01:34 +0000353
354/*
drhfa1a98a2004-05-14 19:08:17 +0000355** An instance of the following structure is used to hold information
drh271efa52004-05-30 19:19:05 +0000356** about a cell. The parseCellPtr() function fills in this structure
drhab01f612004-05-22 02:55:23 +0000357** based on information extract from the raw disk page.
drhfa1a98a2004-05-14 19:08:17 +0000358*/
359typedef struct CellInfo CellInfo;
360struct CellInfo {
drh43605152004-05-29 21:46:49 +0000361 u8 *pCell; /* Pointer to the start of cell content */
drhfa1a98a2004-05-14 19:08:17 +0000362 i64 nKey; /* The key for INTKEY tables, or number of bytes in key */
363 u32 nData; /* Number of bytes of data */
drh271efa52004-05-30 19:19:05 +0000364 u16 nHeader; /* Size of the cell content header in bytes */
drhfa1a98a2004-05-14 19:08:17 +0000365 u16 nLocal; /* Amount of payload held locally */
drhab01f612004-05-22 02:55:23 +0000366 u16 iOverflow; /* Offset to overflow page number. Zero if no overflow */
drh271efa52004-05-30 19:19:05 +0000367 u16 nSize; /* Size of the cell content on the main b-tree page */
drhfa1a98a2004-05-14 19:08:17 +0000368};
369
370/*
drh365d68f2001-05-11 11:02:46 +0000371** A cursor is a pointer to a particular entry in the BTree.
372** The entry is identified by its MemPage and the index in
drha34b6762004-05-07 13:30:42 +0000373** MemPage.aCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000374*/
drh72f82862001-05-24 21:06:34 +0000375struct BtCursor {
danielk1977aef0bf62005-12-30 16:28:01 +0000376 Btree *pBtree; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000377 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drh3aac2dd2004-04-26 14:10:20 +0000378 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
379 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000380 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000381 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000382 int idx; /* Index of the entry in pPage->aCell[] */
drhfa1a98a2004-05-14 19:08:17 +0000383 CellInfo info; /* A parse of the cell we are pointing at */
drhecdc7532001-09-23 02:35:53 +0000384 u8 wrFlag; /* True if writable */
drhc39e0002004-05-07 23:50:57 +0000385 u8 isValid; /* TRUE if points to a valid entry */
drh365d68f2001-05-11 11:02:46 +0000386};
drh7e3b0a02001-04-28 16:52:40 +0000387
/*
** TRACE() prints high-level status information about btree operations
** whenever the global variable sqlite3_btree_trace is non-zero. Its
** single argument is a complete, parenthesized argument list for
** sqlite3DebugPrintf(), so invocations use doubled parentheses:
** TRACE(("fmt", args)). Output is followed by a flush of stdout.
**
** Unless SQLITE_TEST is set the macro expands to nothing.
**
** NOTE: the enabled form expands to a bare "if" statement, so do not
** use TRACE() as the unbraced body of an if that has an else.
*/
#if !SQLITE_TEST
# define TRACE(X)
#else
# define TRACE(X)  if( sqlite3_btree_trace )\
                   { sqlite3DebugPrintf X; fflush(stdout); }
#endif
int sqlite3_btree_trace=0;  /* Set non-zero to turn tracing on */
400
401/*
drh66cbd152004-09-01 16:12:25 +0000402** Forward declaration
403*/
danielk1977aef0bf62005-12-30 16:28:01 +0000404static int checkReadLocks(BtShared*,Pgno,BtCursor*);
drh66cbd152004-09-01 16:12:25 +0000405
drh66cbd152004-09-01 16:12:25 +0000406/*
drhab01f612004-05-22 02:55:23 +0000407** Read or write a two- and four-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000408*/
drh9e572e62004-04-23 23:43:10 +0000409static u32 get2byte(unsigned char *p){
410 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000411}
drh9e572e62004-04-23 23:43:10 +0000412static u32 get4byte(unsigned char *p){
413 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
414}
drh9e572e62004-04-23 23:43:10 +0000415static void put2byte(unsigned char *p, u32 v){
416 p[0] = v>>8;
417 p[1] = v;
418}
419static void put4byte(unsigned char *p, u32 v){
420 p[0] = v>>24;
421 p[1] = v>>16;
422 p[2] = v>>8;
423 p[3] = v;
424}
drh6f11bef2004-05-13 01:12:56 +0000425
/*
** Short names for the routines that read and write variable-length
** integers. These routines were once defined locally in this file;
** the implementations in util.c are used instead now.
*/
#define putVarint    sqlite3PutVarint
#define getVarint32  sqlite3GetVarint32
#define getVarint    sqlite3GetVarint
drh0d316a42002-08-11 20:10:47 +0000434
/* The number of the database page that contains the PENDING_BYTE.
** That page is never used.
**
** TODO: This macro is very similar to PAGER_MJ_PGNO() in pager.c. The
** two should possibly be consolidated (presumably in pager.h).
*/
#define PENDING_BYTE_PAGE(pBt)  (1+(PENDING_BYTE/(pBt)->pageSize))
danielk1977afcdd022004-10-31 16:25:42 +0000440
danielk1977aef0bf62005-12-30 16:28:01 +0000441/*
442** A linked list of the following structures is stored at BtShared.pLock.
443** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor
444** is opened on the table with root page BtShared.iTable. Locks are removed
445** from this list when a transaction is committed or rolled back, or when
446** a btree handle is closed.
447*/
448struct BtLock {
449 Btree *pBtree; /* Btree handle holding this lock */
450 Pgno iTable; /* Root page of table */
451 u8 eLock; /* READ_LOCK or WRITE_LOCK */
452 BtLock *pNext; /* Next in BtShared.pLock list */
453};
454
/* The values that BtLock.eLock can take. lockTable() asserts that
** WRITE_LOCK is numerically greater than READ_LOCK.
*/
#define READ_LOCK   1   /* Lock held for reading */
#define WRITE_LOCK  2   /* Lock held for writing */
458
459#ifdef SQLITE_OMIT_SHARED_CACHE
460 /*
461 ** The functions queryTableLock(), lockTable() and unlockAllTables()
462 ** manipulate entries in the BtShared.pLock linked list used to store
463 ** shared-cache table level locks. If the library is compiled with the
464 ** shared-cache feature disabled, then there is only ever one user
465 ** of each BtShared structure and so this locking is not required.
466 ** So define the three interface functions as no-ops.
467 */
468 #define queryTableLock(a,b,c) SQLITE_OK
469 #define lockTable(a,b,c) SQLITE_OK
470 #define unlockAllTables(a,b,c)
471#else
472
473/*
474** Query to see if btree handle p may obtain a lock of type eLock
475** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
476** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
477** SQLITE_BUSY if not.
478*/
479static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
480 BtShared *pBt = p->pBt;
481 BtLock *pIter;
482
483 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
484 if( pIter->pBtree!=p && pIter->iTable==iTab &&
485 (pIter->eLock!=READ_LOCK || eLock!=READ_LOCK) ){
486 return SQLITE_BUSY;
487 }
488 }
489 return SQLITE_OK;
490}
491
492/*
493** Add a lock on the table with root-page iTable to the shared-btree used
494** by Btree handle p. Parameter eLock must be either READ_LOCK or
495** WRITE_LOCK.
496**
497** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
498** SQLITE_NOMEM may also be returned.
499*/
500static int lockTable(Btree *p, Pgno iTable, u8 eLock){
501 BtShared *pBt = p->pBt;
502 BtLock *pLock = 0;
503 BtLock *pIter;
504
505 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
506
507 /* First search the list for an existing lock on this table. */
508 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
509 if( pIter->iTable==iTable && pIter->pBtree==p ){
510 pLock = pIter;
511 break;
512 }
513 }
514
515 /* If the above search did not find a BtLock struct associating Btree p
516 ** with table iTable, allocate one and link it into the list.
517 */
518 if( !pLock ){
519 pLock = (BtLock *)sqliteMalloc(sizeof(BtLock));
520 if( !pLock ){
521 return SQLITE_NOMEM;
522 }
523 pLock->iTable = iTable;
524 pLock->pBtree = p;
525 pLock->pNext = pBt->pLock;
526 pBt->pLock = pLock;
527 }
528
529 /* Set the BtLock.eLock variable to the maximum of the current lock
530 ** and the requested lock. This means if a write-lock was already held
531 ** and a read-lock requested, we don't incorrectly downgrade the lock.
532 */
533 assert( WRITE_LOCK>READ_LOCK );
534 pLock->eLock = MAX(pLock->eLock, eLock);
535
536 return SQLITE_OK;
537}
538
539/*
540** Release all the table locks (locks obtained via calls to the lockTable()
541** procedure) held by Btree handle p.
542*/
543static void unlockAllTables(Btree *p){
544 BtLock **ppIter = &p->pBt->pLock;
545 while( *ppIter ){
546 BtLock *pLock = *ppIter;
547 if( pLock->pBtree==p ){
548 *ppIter = pLock->pNext;
549 sqliteFree(pLock);
550 }else{
551 ppIter = &pLock->pNext;
552 }
553 }
554}
555#endif /* SQLITE_OMIT_SHARED_CACHE */
556
danielk1977599fcba2004-11-08 07:13:13 +0000557#ifndef SQLITE_OMIT_AUTOVACUUM
/*
** These macros define the location of the pointer-map entry for a
** database page. The first argument to each is the number of usable
** bytes on each page of the database (often 1024). The second is the
** page number to look up in the pointer map.
**
** PTRMAP_PAGENO returns the database page number of the pointer-map
** page that stores the required pointer. PTRMAP_PTROFFSET returns
** the offset of the requested map entry.
**
** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
** this test.
**
** Both arguments are parenthesized inside the expansions so that
** expression arguments are handled correctly. The arguments appear
** more than once in each expansion, so they must not have side effects.
** For a page that is itself a pointer-map page, PTRMAP_PTROFFSET does
** not yield a valid offset (with signed arguments the result is -5).
*/
#define PTRMAP_PAGENO(pgsz, pgno) \
    ((((pgno)-2)/((pgsz)/5+1))*((pgsz)/5+1)+2)
#define PTRMAP_PTROFFSET(pgsz, pgno) \
    ((((pgno)-2)%((pgsz)/5+1)-1)*5)
#define PTRMAP_ISPAGE(pgsz, pgno) (PTRMAP_PAGENO(pgsz,pgno)==(pgno))
576
/*
** The pointer map is a lookup table that gives the parent page of
** each child page in the database file. The parent page is the page
** that contains a pointer to the child. Every database page has 0 or 1
** parent pages. (Here 'database page' means any page that is not part
** of the pointer map itself.) A pointer map entry is made up of a
** single byte 'type' followed by a 4 byte parent page number. The
** PTRMAP_XXX identifiers below are the valid types.
**
** The pointer map exists to facilitate moving pages from one position
** in the file to another as part of autovacuum. Moving a page requires
** updating the pointer in its parent to refer to the new location, and
** the pointer map is how that parent page is found quickly.
**
** PTRMAP_ROOTPAGE:  The database page is a root-page. The page-number
**                   is not used in this case.
**
** PTRMAP_FREEPAGE:  The database page is an unused (free) page. The
**                   page-number is not used in this case.
**
** PTRMAP_OVERFLOW1: The database page is the first page in a list of
**                   overflow pages. The page number identifies the
**                   page that contains the cell with a pointer to this
**                   overflow page.
**
** PTRMAP_OVERFLOW2: The database page is the second or later page in a
**                   list of overflow pages. The page-number identifies
**                   the previous page in the overflow page list.
**
** PTRMAP_BTREE:     The database page is a non-root btree page. The
**                   page number identifies the parent page in the
**                   btree.
*/
#define PTRMAP_ROOTPAGE   1   /* Root page of a btree */
#define PTRMAP_FREEPAGE   2   /* Unused (free) page */
#define PTRMAP_OVERFLOW1  3   /* First page of an overflow list */
#define PTRMAP_OVERFLOW2  4   /* Later page of an overflow list */
#define PTRMAP_BTREE      5   /* Non-root btree page */
danielk1977afcdd022004-10-31 16:25:42 +0000613
614/*
615** Write an entry into the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000616**
617** This routine updates the pointer map entry for page number 'key'
618** so that it maps to type 'eType' and parent page number 'pgno'.
619** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000620*/
danielk1977aef0bf62005-12-30 16:28:01 +0000621static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
danielk1977afcdd022004-10-31 16:25:42 +0000622 u8 *pPtrmap; /* The pointer map page */
623 Pgno iPtrmap; /* The pointer map page number */
624 int offset; /* Offset in pointer map page */
625 int rc;
626
danielk1977ac11ee62005-01-15 12:45:51 +0000627 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +0000628 if( key==0 ){
drh49285702005-09-17 15:20:26 +0000629 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +0000630 }
drh42cac6d2004-11-20 20:31:11 +0000631 iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
danielk1977afcdd022004-10-31 16:25:42 +0000632 rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
danielk1977687566d2004-11-02 12:56:41 +0000633 if( rc!=SQLITE_OK ){
danielk1977afcdd022004-10-31 16:25:42 +0000634 return rc;
635 }
drh42cac6d2004-11-20 20:31:11 +0000636 offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
danielk1977afcdd022004-10-31 16:25:42 +0000637
drh615ae552005-01-16 23:21:00 +0000638 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
639 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
danielk1977afcdd022004-10-31 16:25:42 +0000640 rc = sqlite3pager_write(pPtrmap);
danielk19775558a8a2005-01-17 07:53:44 +0000641 if( rc==SQLITE_OK ){
642 pPtrmap[offset] = eType;
643 put4byte(&pPtrmap[offset+1], parent);
danielk1977afcdd022004-10-31 16:25:42 +0000644 }
danielk1977afcdd022004-10-31 16:25:42 +0000645 }
646
647 sqlite3pager_unref(pPtrmap);
danielk19775558a8a2005-01-17 07:53:44 +0000648 return rc;
danielk1977afcdd022004-10-31 16:25:42 +0000649}
650
651/*
652** Read an entry from the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000653**
654** This routine retrieves the pointer map entry for page 'key', writing
655** the type and parent page number to *pEType and *pPgno respectively.
656** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000657*/
danielk1977aef0bf62005-12-30 16:28:01 +0000658static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
danielk1977afcdd022004-10-31 16:25:42 +0000659 int iPtrmap; /* Pointer map page index */
660 u8 *pPtrmap; /* Pointer map page data */
661 int offset; /* Offset of entry in pointer map */
662 int rc;
663
drh42cac6d2004-11-20 20:31:11 +0000664 iPtrmap = PTRMAP_PAGENO(pBt->usableSize, key);
danielk1977afcdd022004-10-31 16:25:42 +0000665 rc = sqlite3pager_get(pBt->pPager, iPtrmap, (void **)&pPtrmap);
666 if( rc!=0 ){
667 return rc;
668 }
669
drh42cac6d2004-11-20 20:31:11 +0000670 offset = PTRMAP_PTROFFSET(pBt->usableSize, key);
danielk1977687566d2004-11-02 12:56:41 +0000671 if( pEType ) *pEType = pPtrmap[offset];
672 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
danielk1977afcdd022004-10-31 16:25:42 +0000673
674 sqlite3pager_unref(pPtrmap);
drh49285702005-09-17 15:20:26 +0000675 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
danielk1977afcdd022004-10-31 16:25:42 +0000676 return SQLITE_OK;
677}
678
679#endif /* SQLITE_OMIT_AUTOVACUUM */
680
/*
** Locate the Btree handle that belongs to connection pSqlite within the
** shared btree pBt.  When pBt->aRef is in use it is searched for a
** matching connection; otherwise (or when shared data is compiled out)
** the embedded pBt->ref handle is returned.
**
** This routine is currently compiled out.
*/
#if 0
static Btree *btree_findref(BtShared *pBt, sqlite3 *pSqlite){
#ifndef SQLITE_OMIT_SHARED_DATA
  if( pBt->aRef ){
    int iRef = 0;
    while( iRef<pBt->nRef ){
      if( pBt->aRef[iRef].pSqlite==pSqlite ) return &pBt->aRef[iRef];
      iRef++;
    }
    /* Every connection using pBt is expected to appear in aRef[] */
    assert(0);
  }
#endif
  return &pBt->ref;
}
#endif
701
702/*
drh271efa52004-05-30 19:19:05 +0000703** Given a btree page and a cell index (0 means the first cell on
704** the page, 1 means the second cell, and so forth) return a pointer
705** to the cell content.
706**
707** This routine works only for pages that do not contain overflow cells.
drh3aac2dd2004-04-26 14:10:20 +0000708*/
drh43605152004-05-29 21:46:49 +0000709static u8 *findCell(MemPage *pPage, int iCell){
710 u8 *data = pPage->aData;
711 assert( iCell>=0 );
712 assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
713 return data + get2byte(&data[pPage->cellOffset+2*iCell]);
714}
715
716/*
717** This a more complex version of findCell() that works for
718** pages that do contain overflow cells. See insert
719*/
720static u8 *findOverflowCell(MemPage *pPage, int iCell){
721 int i;
722 for(i=pPage->nOverflow-1; i>=0; i--){
drh6d08b4d2004-07-20 12:45:22 +0000723 int k;
724 struct _OvflCell *pOvfl;
725 pOvfl = &pPage->aOvfl[i];
726 k = pOvfl->idx;
727 if( k<=iCell ){
728 if( k==iCell ){
729 return pOvfl->pCell;
drh43605152004-05-29 21:46:49 +0000730 }
731 iCell--;
732 }
733 }
734 return findCell(pPage, iCell);
735}
736
737/*
738** Parse a cell content block and fill in the CellInfo structure. There
739** are two versions of this function. parseCell() takes a cell index
740** as the second argument and parseCellPtr() takes a pointer to the
741** body of the cell as its second argument.
742*/
743static void parseCellPtr(
drh3aac2dd2004-04-26 14:10:20 +0000744 MemPage *pPage, /* Page containing the cell */
drh43605152004-05-29 21:46:49 +0000745 u8 *pCell, /* Pointer to the cell text. */
drh6f11bef2004-05-13 01:12:56 +0000746 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000747){
drh271efa52004-05-30 19:19:05 +0000748 int n; /* Number bytes in cell content header */
749 u32 nPayload; /* Number of bytes of cell payload */
drh43605152004-05-29 21:46:49 +0000750
751 pInfo->pCell = pCell;
drhab01f612004-05-22 02:55:23 +0000752 assert( pPage->leaf==0 || pPage->leaf==1 );
drh271efa52004-05-30 19:19:05 +0000753 n = pPage->childPtrSize;
754 assert( n==4-4*pPage->leaf );
drh8b18dd42004-05-12 19:18:15 +0000755 if( pPage->hasData ){
drh271efa52004-05-30 19:19:05 +0000756 n += getVarint32(&pCell[n], &nPayload);
drh8b18dd42004-05-12 19:18:15 +0000757 }else{
drh271efa52004-05-30 19:19:05 +0000758 nPayload = 0;
drh3aac2dd2004-04-26 14:10:20 +0000759 }
danielk1977e0d4b062004-06-28 01:11:46 +0000760 n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
drh6f11bef2004-05-13 01:12:56 +0000761 pInfo->nHeader = n;
drh271efa52004-05-30 19:19:05 +0000762 pInfo->nData = nPayload;
drh6f11bef2004-05-13 01:12:56 +0000763 if( !pPage->intKey ){
764 nPayload += pInfo->nKey;
765 }
drh271efa52004-05-30 19:19:05 +0000766 if( nPayload<=pPage->maxLocal ){
767 /* This is the (easy) common case where the entire payload fits
768 ** on the local page. No overflow is required.
769 */
770 int nSize; /* Total size of cell content in bytes */
drh6f11bef2004-05-13 01:12:56 +0000771 pInfo->nLocal = nPayload;
772 pInfo->iOverflow = 0;
drh271efa52004-05-30 19:19:05 +0000773 nSize = nPayload + n;
774 if( nSize<4 ){
775 nSize = 4; /* Minimum cell size is 4 */
drh43605152004-05-29 21:46:49 +0000776 }
drh271efa52004-05-30 19:19:05 +0000777 pInfo->nSize = nSize;
drh6f11bef2004-05-13 01:12:56 +0000778 }else{
drh271efa52004-05-30 19:19:05 +0000779 /* If the payload will not fit completely on the local page, we have
780 ** to decide how much to store locally and how much to spill onto
781 ** overflow pages. The strategy is to minimize the amount of unused
782 ** space on overflow pages while keeping the amount of local storage
783 ** in between minLocal and maxLocal.
784 **
785 ** Warning: changing the way overflow payload is distributed in any
786 ** way will result in an incompatible file format.
787 */
788 int minLocal; /* Minimum amount of payload held locally */
789 int maxLocal; /* Maximum amount of payload held locally */
790 int surplus; /* Overflow payload available for local storage */
791
792 minLocal = pPage->minLocal;
793 maxLocal = pPage->maxLocal;
794 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +0000795 if( surplus <= maxLocal ){
796 pInfo->nLocal = surplus;
797 }else{
798 pInfo->nLocal = minLocal;
799 }
800 pInfo->iOverflow = pInfo->nLocal + n;
801 pInfo->nSize = pInfo->iOverflow + 4;
802 }
drh3aac2dd2004-04-26 14:10:20 +0000803}
drh43605152004-05-29 21:46:49 +0000804static void parseCell(
805 MemPage *pPage, /* Page containing the cell */
806 int iCell, /* The cell index. First cell is 0 */
807 CellInfo *pInfo /* Fill in this structure */
808){
809 parseCellPtr(pPage, findCell(pPage, iCell), pInfo);
810}
drh3aac2dd2004-04-26 14:10:20 +0000811
812/*
drh43605152004-05-29 21:46:49 +0000813** Compute the total number of bytes that a Cell needs in the cell
814** data area of the btree-page. The return number includes the cell
815** data header and the local payload, but not any overflow page or
816** the space used by the cell pointer.
drh3b7511c2001-05-26 13:15:44 +0000817*/
danielk1977bc6ada42004-06-30 08:20:16 +0000818#ifndef NDEBUG
drh43605152004-05-29 21:46:49 +0000819static int cellSize(MemPage *pPage, int iCell){
drh6f11bef2004-05-13 01:12:56 +0000820 CellInfo info;
drh43605152004-05-29 21:46:49 +0000821 parseCell(pPage, iCell, &info);
822 return info.nSize;
823}
danielk1977bc6ada42004-06-30 08:20:16 +0000824#endif
drh43605152004-05-29 21:46:49 +0000825static int cellSizePtr(MemPage *pPage, u8 *pCell){
826 CellInfo info;
827 parseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +0000828 return info.nSize;
drh3b7511c2001-05-26 13:15:44 +0000829}
830
danielk197779a40da2005-01-16 08:00:01 +0000831#ifndef SQLITE_OMIT_AUTOVACUUM
drh3b7511c2001-05-26 13:15:44 +0000832/*
danielk197726836652005-01-17 01:33:13 +0000833** If the cell pCell, part of page pPage contains a pointer
danielk197779a40da2005-01-16 08:00:01 +0000834** to an overflow page, insert an entry into the pointer-map
835** for the overflow page.
danielk1977ac11ee62005-01-15 12:45:51 +0000836*/
danielk197726836652005-01-17 01:33:13 +0000837static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
danielk197779a40da2005-01-16 08:00:01 +0000838 if( pCell ){
839 CellInfo info;
840 parseCellPtr(pPage, pCell, &info);
841 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
842 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
843 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
844 }
danielk1977ac11ee62005-01-15 12:45:51 +0000845 }
danielk197779a40da2005-01-16 08:00:01 +0000846 return SQLITE_OK;
danielk1977ac11ee62005-01-15 12:45:51 +0000847}
danielk197726836652005-01-17 01:33:13 +0000848/*
849** If the cell with index iCell on page pPage contains a pointer
850** to an overflow page, insert an entry into the pointer-map
851** for the overflow page.
852*/
853static int ptrmapPutOvfl(MemPage *pPage, int iCell){
854 u8 *pCell;
855 pCell = findOverflowCell(pPage, iCell);
856 return ptrmapPutOvflPtr(pPage, pCell);
857}
danielk197779a40da2005-01-16 08:00:01 +0000858#endif
859
danielk1977ac11ee62005-01-15 12:45:51 +0000860
/*
** Do sanity checking on a page.  An assert() fails if anything is
** not right.
**
** Every byte of the usable page area is classified as header, cell
** pointer array, unallocated gap, freeblock or cell content.  No byte
** may be claimed twice, and the number of unclaimed bytes must equal
** the fragment count stored at offset 7 of the page header.
**
** This routine is used for internal error checking only.  It is omitted
** from most builds (and is currently disabled by the "&& 0" below).
*/
#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
static void _pageIntegrity(MemPage *pPage){
  int usableSize;          /* Usable bytes per page */
  u8 *data;                /* Raw page content */
  int i, j, c, pc, hdr, nFree;
  int cellOffset;          /* Start of the cell pointer array */
  int nCell, cellLimit;    /* Cell count; start of cell content area */
  u8 *used;                /* used[i]==1 once byte i has been claimed */

  used = sqliteMallocRaw( pPage->pBt->pageSize );
  if( used==0 ) return;
  usableSize = pPage->pBt->usableSize;
  assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] );
  hdr = pPage->hdrOffset;
  assert( hdr==(pPage->pgno==1 ? 100 : 0) );
  assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
  c = pPage->aData[hdr];
  if( pPage->isInit ){
    assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
    assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
    assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) );
    assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) );
    assert( pPage->hasData ==
             !(pPage->zeroData || (!pPage->leaf && pPage->leafData)) );
    assert( pPage->cellOffset==pPage->hdrOffset+12-4*pPage->leaf );
    /* Compare (not assign) the cached cell count with the header */
    assert( pPage->nCell==get2byte(&pPage->aData[hdr+3]) );
  }
  data = pPage->aData;
  /* These three values are needed by every check below, so read them
  ** before first use. */
  cellOffset = pPage->cellOffset;
  nCell = get2byte(&data[hdr+3]);
  cellLimit = get2byte(&data[hdr+5]);

  /* Claim everything up to the end of the cell pointer array: any
  ** bytes ahead of the page header, the header itself, and the array. */
  memset(used, 0, usableSize);
  for(i=0; i<cellOffset+2*nCell; i++) used[i] = 1;

  /* Claim every block on the freeblock list */
  nFree = 0;
  pc = get2byte(&data[hdr+1]);
  while( pc ){
    int size;
    assert( pc>0 && pc<usableSize-4 );
    size = get2byte(&data[pc+2]);
    assert( pc+size<=usableSize );
    nFree += size;
    for(i=pc; i<pc+size; i++){
      assert( used[i]==0 );
      used[i] = 1;
    }
    pc = get2byte(&data[pc]);
  }
  assert( pPage->isInit==0
         || pPage->nFree==nFree+data[hdr+7]+cellLimit-(cellOffset+2*nCell) );

  /* Claim the content of every cell */
  for(i=0; i<nCell; i++){
    int size;
    pc = get2byte(&data[cellOffset+2*i]);
    assert( pc>0 && pc<usableSize-4 );
    size = cellSizePtr(pPage, &data[pc]);
    assert( pc+size<=usableSize );
    for(j=pc; j<pc+size; j++){
      assert( used[j]==0 );
      used[j] = 1;
    }
  }

  /* Claim the gap between the cell pointer array and the cell content */
  for(i=cellOffset+2*nCell; i<cellLimit; i++){
    assert( used[i]==0 );
    used[i] = 1;
  }

  /* Whatever is left unclaimed must be fragmented free bytes */
  nFree = 0;
  for(i=0; i<usableSize; i++){
    assert( used[i]<=1 );
    if( used[i]==0 ) nFree++;
  }
  assert( nFree==data[hdr+7] );
  sqliteFree(used);
}
#define pageIntegrity(X) _pageIntegrity(X)
#else
# define pageIntegrity(X)
#endif
945
/* A bunch of assert() statements to check the transaction state variables
** of handle p (type Btree*) are internally consistent.
**
** The macro expands to a sequence of statements, so it must be used as
** a statement of its own, not as the unbraced body of an if or a loop.
** The argument is parenthesized at each use so that any pointer-valued
** expression may be passed; it is evaluated several times and must
** therefore be free of side effects.
*/
#define btreeIntegrity(p) \
  assert( (p)->inTrans!=TRANS_NONE || (p)->pBt->nTransaction<(p)->pBt->nRef ); \
  assert( (p)->pBt->nTransaction<=(p)->pBt->nRef ); \
  assert( (p)->pBt->inTransaction!=TRANS_NONE || (p)->pBt->nTransaction==0 ); \
  assert( (p)->pBt->inTransaction>=(p)->inTrans );
954
drhda200cc2004-05-09 11:51:38 +0000955/*
drh72f82862001-05-24 21:06:34 +0000956** Defragment the page given. All Cells are moved to the
drh3a4a2d42005-11-24 14:24:28 +0000957** end of the page and all free space is collected into one
958** big FreeBlk that occurs in between the header and cell
drh31beae92005-11-24 14:34:36 +0000959** pointer array and the cell content area.
drh365d68f2001-05-11 11:02:46 +0000960*/
drh2e38c322004-09-03 18:38:44 +0000961static int defragmentPage(MemPage *pPage){
drh43605152004-05-29 21:46:49 +0000962 int i; /* Loop counter */
963 int pc; /* Address of a i-th cell */
964 int addr; /* Offset of first byte after cell pointer array */
965 int hdr; /* Offset to the page header */
966 int size; /* Size of a cell */
967 int usableSize; /* Number of usable bytes on a page */
968 int cellOffset; /* Offset to the cell pointer array */
969 int brk; /* Offset to the cell content area */
970 int nCell; /* Number of cells on the page */
drh2e38c322004-09-03 18:38:44 +0000971 unsigned char *data; /* The page data */
972 unsigned char *temp; /* Temp area for cell content */
drh2af926b2001-05-15 00:39:25 +0000973
drha34b6762004-05-07 13:30:42 +0000974 assert( sqlite3pager_iswriteable(pPage->aData) );
drh9e572e62004-04-23 23:43:10 +0000975 assert( pPage->pBt!=0 );
drh90f5ecb2004-07-22 01:19:35 +0000976 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
drh43605152004-05-29 21:46:49 +0000977 assert( pPage->nOverflow==0 );
drh2e38c322004-09-03 18:38:44 +0000978 temp = sqliteMalloc( pPage->pBt->pageSize );
979 if( temp==0 ) return SQLITE_NOMEM;
drh43605152004-05-29 21:46:49 +0000980 data = pPage->aData;
drh9e572e62004-04-23 23:43:10 +0000981 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000982 cellOffset = pPage->cellOffset;
983 nCell = pPage->nCell;
984 assert( nCell==get2byte(&data[hdr+3]) );
985 usableSize = pPage->pBt->usableSize;
986 brk = get2byte(&data[hdr+5]);
987 memcpy(&temp[brk], &data[brk], usableSize - brk);
988 brk = usableSize;
989 for(i=0; i<nCell; i++){
990 u8 *pAddr; /* The i-th cell pointer */
991 pAddr = &data[cellOffset + i*2];
992 pc = get2byte(pAddr);
993 assert( pc<pPage->pBt->usableSize );
994 size = cellSizePtr(pPage, &temp[pc]);
995 brk -= size;
996 memcpy(&data[brk], &temp[pc], size);
997 put2byte(pAddr, brk);
drh2af926b2001-05-15 00:39:25 +0000998 }
drh43605152004-05-29 21:46:49 +0000999 assert( brk>=cellOffset+2*nCell );
1000 put2byte(&data[hdr+5], brk);
1001 data[hdr+1] = 0;
1002 data[hdr+2] = 0;
1003 data[hdr+7] = 0;
1004 addr = cellOffset+2*nCell;
1005 memset(&data[addr], 0, brk-addr);
drh2e38c322004-09-03 18:38:44 +00001006 sqliteFree(temp);
1007 return SQLITE_OK;
drh365d68f2001-05-11 11:02:46 +00001008}
1009
drha059ad02001-04-17 20:09:11 +00001010/*
drh43605152004-05-29 21:46:49 +00001011** Allocate nByte bytes of space on a page.
drhbd03cae2001-06-02 02:40:57 +00001012**
drh9e572e62004-04-23 23:43:10 +00001013** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +00001014** the new allocation. Or return 0 if there is not enough free
1015** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +00001016**
drh72f82862001-05-24 21:06:34 +00001017** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +00001018** nBytes of contiguous free space, then this routine automatically
1019** calls defragementPage() to consolidate all free space before
1020** allocating the new chunk.
drh7e3b0a02001-04-28 16:52:40 +00001021*/
drh9e572e62004-04-23 23:43:10 +00001022static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +00001023 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +00001024 int size;
drh24cd67e2004-05-10 16:18:47 +00001025 int nFrag;
drh43605152004-05-29 21:46:49 +00001026 int top;
1027 int nCell;
1028 int cellOffset;
drh9e572e62004-04-23 23:43:10 +00001029 unsigned char *data;
drh43605152004-05-29 21:46:49 +00001030
drh9e572e62004-04-23 23:43:10 +00001031 data = pPage->aData;
drha34b6762004-05-07 13:30:42 +00001032 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +00001033 assert( pPage->pBt );
1034 if( nByte<4 ) nByte = 4;
drh43605152004-05-29 21:46:49 +00001035 if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
1036 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +00001037 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00001038
1039 nFrag = data[hdr+7];
1040 if( nFrag<60 ){
1041 /* Search the freelist looking for a slot big enough to satisfy the
1042 ** space request. */
1043 addr = hdr+1;
1044 while( (pc = get2byte(&data[addr]))>0 ){
1045 size = get2byte(&data[pc+2]);
1046 if( size>=nByte ){
1047 if( size<nByte+4 ){
1048 memcpy(&data[addr], &data[pc], 2);
1049 data[hdr+7] = nFrag + size - nByte;
1050 return pc;
1051 }else{
1052 put2byte(&data[pc+2], size-nByte);
1053 return pc + size - nByte;
1054 }
1055 }
1056 addr = pc;
drh9e572e62004-04-23 23:43:10 +00001057 }
1058 }
drh43605152004-05-29 21:46:49 +00001059
1060 /* Allocate memory from the gap in between the cell pointer array
1061 ** and the cell content area.
1062 */
1063 top = get2byte(&data[hdr+5]);
1064 nCell = get2byte(&data[hdr+3]);
1065 cellOffset = pPage->cellOffset;
1066 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
drh2e38c322004-09-03 18:38:44 +00001067 if( defragmentPage(pPage) ) return 0;
drh43605152004-05-29 21:46:49 +00001068 top = get2byte(&data[hdr+5]);
drh2af926b2001-05-15 00:39:25 +00001069 }
drh43605152004-05-29 21:46:49 +00001070 top -= nByte;
1071 assert( cellOffset + 2*nCell <= top );
1072 put2byte(&data[hdr+5], top);
1073 return top;
drh7e3b0a02001-04-28 16:52:40 +00001074}
1075
1076/*
drh9e572e62004-04-23 23:43:10 +00001077** Return a section of the pPage->aData to the freelist.
1078** The first byte of the new free block is pPage->aDisk[start]
1079** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +00001080**
1081** Most of the effort here is involved in coalesing adjacent
1082** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +00001083*/
drh9e572e62004-04-23 23:43:10 +00001084static void freeSpace(MemPage *pPage, int start, int size){
drh43605152004-05-29 21:46:49 +00001085 int addr, pbegin, hdr;
drh9e572e62004-04-23 23:43:10 +00001086 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +00001087
drh9e572e62004-04-23 23:43:10 +00001088 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +00001089 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +00001090 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
danielk1977bc6ada42004-06-30 08:20:16 +00001091 assert( (start + size)<=pPage->pBt->usableSize );
drh9e572e62004-04-23 23:43:10 +00001092 if( size<4 ) size = 4;
1093
1094 /* Add the space back into the linked list of freeblocks */
drh43605152004-05-29 21:46:49 +00001095 hdr = pPage->hdrOffset;
1096 addr = hdr + 1;
drh3aac2dd2004-04-26 14:10:20 +00001097 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drhb6f41482004-05-14 01:58:11 +00001098 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +00001099 assert( pbegin>addr );
1100 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +00001101 }
drhb6f41482004-05-14 01:58:11 +00001102 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +00001103 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +00001104 put2byte(&data[addr], start);
1105 put2byte(&data[start], pbegin);
1106 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +00001107 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +00001108
1109 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +00001110 addr = pPage->hdrOffset + 1;
1111 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +00001112 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +00001113 assert( pbegin>addr );
drh43605152004-05-29 21:46:49 +00001114 assert( pbegin<=pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +00001115 pnext = get2byte(&data[pbegin]);
1116 psize = get2byte(&data[pbegin+2]);
1117 if( pbegin + psize + 3 >= pnext && pnext>0 ){
1118 int frag = pnext - (pbegin+psize);
drh43605152004-05-29 21:46:49 +00001119 assert( frag<=data[pPage->hdrOffset+7] );
1120 data[pPage->hdrOffset+7] -= frag;
drh9e572e62004-04-23 23:43:10 +00001121 put2byte(&data[pbegin], get2byte(&data[pnext]));
1122 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
1123 }else{
drh3aac2dd2004-04-26 14:10:20 +00001124 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +00001125 }
1126 }
drh7e3b0a02001-04-28 16:52:40 +00001127
drh43605152004-05-29 21:46:49 +00001128 /* If the cell content area begins with a freeblock, remove it. */
1129 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1130 int top;
1131 pbegin = get2byte(&data[hdr+1]);
1132 memcpy(&data[hdr+1], &data[pbegin], 2);
1133 top = get2byte(&data[hdr+5]);
1134 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
drh4b70f112004-05-02 21:12:19 +00001135 }
drh4b70f112004-05-02 21:12:19 +00001136}
1137
1138/*
drh271efa52004-05-30 19:19:05 +00001139** Decode the flags byte (the first byte of the header) for a page
1140** and initialize fields of the MemPage structure accordingly.
1141*/
1142static void decodeFlags(MemPage *pPage, int flagByte){
danielk1977aef0bf62005-12-30 16:28:01 +00001143 BtShared *pBt; /* A copy of pPage->pBt */
drh271efa52004-05-30 19:19:05 +00001144
1145 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1146 pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
1147 pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
1148 pPage->leaf = (flagByte & PTF_LEAF)!=0;
1149 pPage->childPtrSize = 4*(pPage->leaf==0);
1150 pBt = pPage->pBt;
1151 if( flagByte & PTF_LEAFDATA ){
1152 pPage->leafData = 1;
1153 pPage->maxLocal = pBt->maxLeaf;
1154 pPage->minLocal = pBt->minLeaf;
1155 }else{
1156 pPage->leafData = 0;
1157 pPage->maxLocal = pBt->maxLocal;
1158 pPage->minLocal = pBt->minLocal;
1159 }
1160 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
1161}
1162
1163/*
drh7e3b0a02001-04-28 16:52:40 +00001164** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +00001165**
drhbd03cae2001-06-02 02:40:57 +00001166** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +00001167** is the parent of the page being initialized. The root of a
1168** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +00001169**
drh72f82862001-05-24 21:06:34 +00001170** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +00001171** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +00001172** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1173** guarantee that the page is well-formed. It only shows that
1174** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +00001175*/
drh9e572e62004-04-23 23:43:10 +00001176static int initPage(
drh3aac2dd2004-04-26 14:10:20 +00001177 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +00001178 MemPage *pParent /* The parent. Might be NULL */
1179){
drh271efa52004-05-30 19:19:05 +00001180 int pc; /* Address of a freeblock within pPage->aData[] */
drh271efa52004-05-30 19:19:05 +00001181 int hdr; /* Offset to beginning of page header */
1182 u8 *data; /* Equal to pPage->aData */
danielk1977aef0bf62005-12-30 16:28:01 +00001183 BtShared *pBt; /* The main btree structure */
drh271efa52004-05-30 19:19:05 +00001184 int usableSize; /* Amount of usable space on each page */
1185 int cellOffset; /* Offset from start of page to first cell pointer */
1186 int nFree; /* Number of unused bytes on the page */
1187 int top; /* First byte of the cell content area */
drh2af926b2001-05-15 00:39:25 +00001188
drh2e38c322004-09-03 18:38:44 +00001189 pBt = pPage->pBt;
1190 assert( pBt!=0 );
1191 assert( pParent==0 || pParent->pBt==pBt );
drha34b6762004-05-07 13:30:42 +00001192 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
drh07d183d2005-05-01 22:52:42 +00001193 assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] );
drhee696e22004-08-30 16:52:17 +00001194 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
1195 /* The parent page should never change unless the file is corrupt */
drh49285702005-09-17 15:20:26 +00001196 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001197 }
drh10617cd2004-05-14 15:27:27 +00001198 if( pPage->isInit ) return SQLITE_OK;
drhda200cc2004-05-09 11:51:38 +00001199 if( pPage->pParent==0 && pParent!=0 ){
1200 pPage->pParent = pParent;
drha34b6762004-05-07 13:30:42 +00001201 sqlite3pager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +00001202 }
drhde647132004-05-07 17:57:49 +00001203 hdr = pPage->hdrOffset;
drha34b6762004-05-07 13:30:42 +00001204 data = pPage->aData;
drh271efa52004-05-30 19:19:05 +00001205 decodeFlags(pPage, data[hdr]);
drh43605152004-05-29 21:46:49 +00001206 pPage->nOverflow = 0;
drhc8629a12004-05-08 20:07:40 +00001207 pPage->idxShift = 0;
drh2e38c322004-09-03 18:38:44 +00001208 usableSize = pBt->usableSize;
drh43605152004-05-29 21:46:49 +00001209 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1210 top = get2byte(&data[hdr+5]);
1211 pPage->nCell = get2byte(&data[hdr+3]);
drh2e38c322004-09-03 18:38:44 +00001212 if( pPage->nCell>MX_CELL(pBt) ){
drhee696e22004-08-30 16:52:17 +00001213 /* To many cells for a single page. The page must be corrupt */
drh49285702005-09-17 15:20:26 +00001214 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001215 }
1216 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
1217 /* All pages must have at least one cell, except for root pages */
drh49285702005-09-17 15:20:26 +00001218 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001219 }
drh9e572e62004-04-23 23:43:10 +00001220
1221 /* Compute the total free space on the page */
drh9e572e62004-04-23 23:43:10 +00001222 pc = get2byte(&data[hdr+1]);
drh43605152004-05-29 21:46:49 +00001223 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
drh9e572e62004-04-23 23:43:10 +00001224 while( pc>0 ){
1225 int next, size;
drhee696e22004-08-30 16:52:17 +00001226 if( pc>usableSize-4 ){
1227 /* Free block is off the page */
drh49285702005-09-17 15:20:26 +00001228 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001229 }
drh9e572e62004-04-23 23:43:10 +00001230 next = get2byte(&data[pc]);
1231 size = get2byte(&data[pc+2]);
drhee696e22004-08-30 16:52:17 +00001232 if( next>0 && next<=pc+size+3 ){
1233 /* Free blocks must be in accending order */
drh49285702005-09-17 15:20:26 +00001234 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001235 }
drh3add3672004-05-15 00:29:24 +00001236 nFree += size;
drh9e572e62004-04-23 23:43:10 +00001237 pc = next;
1238 }
drh3add3672004-05-15 00:29:24 +00001239 pPage->nFree = nFree;
drhee696e22004-08-30 16:52:17 +00001240 if( nFree>=usableSize ){
1241 /* Free space cannot exceed total page size */
drh49285702005-09-17 15:20:26 +00001242 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001243 }
drh9e572e62004-04-23 23:43:10 +00001244
drhde647132004-05-07 17:57:49 +00001245 pPage->isInit = 1;
drhda200cc2004-05-09 11:51:38 +00001246 pageIntegrity(pPage);
drh9e572e62004-04-23 23:43:10 +00001247 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001248}
1249
1250/*
drh8b2f49b2001-06-08 00:21:52 +00001251** Set up a raw page so that it looks like a database page holding
1252** no entries.
drhbd03cae2001-06-02 02:40:57 +00001253*/
drh9e572e62004-04-23 23:43:10 +00001254static void zeroPage(MemPage *pPage, int flags){
1255 unsigned char *data = pPage->aData;
danielk1977aef0bf62005-12-30 16:28:01 +00001256 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00001257 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +00001258 int first;
1259
drhda200cc2004-05-09 11:51:38 +00001260 assert( sqlite3pager_pagenumber(data)==pPage->pgno );
drh07d183d2005-05-01 22:52:42 +00001261 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +00001262 assert( sqlite3pager_iswriteable(data) );
drhb6f41482004-05-14 01:58:11 +00001263 memset(&data[hdr], 0, pBt->usableSize - hdr);
drh9e572e62004-04-23 23:43:10 +00001264 data[hdr] = flags;
drh43605152004-05-29 21:46:49 +00001265 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1266 memset(&data[hdr+1], 0, 4);
1267 data[hdr+7] = 0;
1268 put2byte(&data[hdr+5], pBt->usableSize);
drhb6f41482004-05-14 01:58:11 +00001269 pPage->nFree = pBt->usableSize - first;
drh271efa52004-05-30 19:19:05 +00001270 decodeFlags(pPage, flags);
drh9e572e62004-04-23 23:43:10 +00001271 pPage->hdrOffset = hdr;
drh43605152004-05-29 21:46:49 +00001272 pPage->cellOffset = first;
1273 pPage->nOverflow = 0;
drhda200cc2004-05-09 11:51:38 +00001274 pPage->idxShift = 0;
drh43605152004-05-29 21:46:49 +00001275 pPage->nCell = 0;
drhda200cc2004-05-09 11:51:38 +00001276 pPage->isInit = 1;
1277 pageIntegrity(pPage);
drhbd03cae2001-06-02 02:40:57 +00001278}
1279
1280/*
drh3aac2dd2004-04-26 14:10:20 +00001281** Get a page from the pager. Initialize the MemPage.pBt and
1282** MemPage.aData elements if needed.
1283*/
danielk1977aef0bf62005-12-30 16:28:01 +00001284static int getPage(BtShared *pBt, Pgno pgno, MemPage **ppPage){
drh3aac2dd2004-04-26 14:10:20 +00001285 int rc;
1286 unsigned char *aData;
1287 MemPage *pPage;
drha34b6762004-05-07 13:30:42 +00001288 rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
drh3aac2dd2004-04-26 14:10:20 +00001289 if( rc ) return rc;
drh07d183d2005-05-01 22:52:42 +00001290 pPage = (MemPage*)&aData[pBt->pageSize];
drh3aac2dd2004-04-26 14:10:20 +00001291 pPage->aData = aData;
1292 pPage->pBt = pBt;
1293 pPage->pgno = pgno;
drhde647132004-05-07 17:57:49 +00001294 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
drh3aac2dd2004-04-26 14:10:20 +00001295 *ppPage = pPage;
1296 return SQLITE_OK;
1297}
1298
1299/*
drhde647132004-05-07 17:57:49 +00001300** Get a page from the pager and initialize it. This routine
1301** is just a convenience wrapper around separate calls to
1302** getPage() and initPage().
1303*/
1304static int getAndInitPage(
danielk1977aef0bf62005-12-30 16:28:01 +00001305 BtShared *pBt, /* The database file */
drhde647132004-05-07 17:57:49 +00001306 Pgno pgno, /* Number of the page to get */
1307 MemPage **ppPage, /* Write the page pointer here */
1308 MemPage *pParent /* Parent of the page */
1309){
1310 int rc;
drhee696e22004-08-30 16:52:17 +00001311 if( pgno==0 ){
drh49285702005-09-17 15:20:26 +00001312 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001313 }
drhde647132004-05-07 17:57:49 +00001314 rc = getPage(pBt, pgno, ppPage);
drh10617cd2004-05-14 15:27:27 +00001315 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
drhde647132004-05-07 17:57:49 +00001316 rc = initPage(*ppPage, pParent);
1317 }
1318 return rc;
1319}
1320
1321/*
drh3aac2dd2004-04-26 14:10:20 +00001322** Release a MemPage. This should be called once for each prior
1323** call to getPage.
1324*/
drh4b70f112004-05-02 21:12:19 +00001325static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +00001326 if( pPage ){
1327 assert( pPage->aData );
1328 assert( pPage->pBt );
drh07d183d2005-05-01 22:52:42 +00001329 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +00001330 sqlite3pager_unref(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00001331 }
1332}
1333
1334/*
drh72f82862001-05-24 21:06:34 +00001335** This routine is called when the reference count for a page
1336** reaches zero. We need to unref the pParent pointer when that
1337** happens.
1338*/
drhb6f41482004-05-14 01:58:11 +00001339static void pageDestructor(void *pData, int pageSize){
drh07d183d2005-05-01 22:52:42 +00001340 MemPage *pPage;
1341 assert( (pageSize & 7)==0 );
1342 pPage = (MemPage*)&((char*)pData)[pageSize];
drh72f82862001-05-24 21:06:34 +00001343 if( pPage->pParent ){
1344 MemPage *pParent = pPage->pParent;
1345 pPage->pParent = 0;
drha34b6762004-05-07 13:30:42 +00001346 releasePage(pParent);
drh72f82862001-05-24 21:06:34 +00001347 }
drh3aac2dd2004-04-26 14:10:20 +00001348 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +00001349}
1350
1351/*
drha6abd042004-06-09 17:37:22 +00001352** During a rollback, when the pager reloads information into the cache
1353** so that the cache is restored to its original state at the start of
1354** the transaction, for each page restored this routine is called.
1355**
1356** This routine needs to reset the extra data section at the end of the
1357** page to agree with the restored data.
1358*/
1359static void pageReinit(void *pData, int pageSize){
drh07d183d2005-05-01 22:52:42 +00001360 MemPage *pPage;
1361 assert( (pageSize & 7)==0 );
1362 pPage = (MemPage*)&((char*)pData)[pageSize];
drha6abd042004-06-09 17:37:22 +00001363 if( pPage->isInit ){
1364 pPage->isInit = 0;
1365 initPage(pPage, pPage->pParent);
1366 }
1367}
1368
1369/*
drhad3e0102004-09-03 23:32:18 +00001370** Open a database file.
1371**
drh382c0242001-10-06 16:33:02 +00001372** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +00001373** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +00001374** database file will be deleted when sqlite3BtreeClose() is called.
drha059ad02001-04-17 20:09:11 +00001375*/
drh23e11ca2004-05-04 17:27:28 +00001376int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +00001377 const char *zFilename, /* Name of the file containing the BTree database */
danielk1977aef0bf62005-12-30 16:28:01 +00001378 sqlite3 *pSqlite, /* Associated database handle */
drh3aac2dd2004-04-26 14:10:20 +00001379 Btree **ppBtree, /* Pointer to new Btree object written here */
drh90f5ecb2004-07-22 01:19:35 +00001380 int flags /* Options */
drh6019e162001-07-02 17:51:45 +00001381){
danielk1977aef0bf62005-12-30 16:28:01 +00001382 BtShared *pBt; /* Shared part of btree structure */
1383 Btree *p; /* Handle to return */
drha34b6762004-05-07 13:30:42 +00001384 int rc;
drh90f5ecb2004-07-22 01:19:35 +00001385 int nReserve;
1386 unsigned char zDbHeader[100];
danielk1977aef0bf62005-12-30 16:28:01 +00001387 SqliteTsd *pTsd = sqlite3Tsd();
1388
1389 /* Set the variable isMemdb to true for an in-memory database, or
1390 ** false for a file-based database. This symbol is only required if
1391 ** either of the shared-data or autovacuum features are compiled
1392 ** into the library.
1393 */
1394#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1395 #ifdef SQLITE_OMIT_MEMORYDB
1396 const int isMemdb = !zFilename;
1397 #else
1398 const int isMemdb = !zFilename || (strcmp(zFilename, ":memory:")?0:1);
1399 #endif
1400#endif
1401
1402 p = sqliteMalloc(sizeof(Btree));
1403 if( !p ){
1404 return SQLITE_NOMEM;
1405 }
1406 p->inTrans = TRANS_NONE;
1407 p->pSqlite = pSqlite;
1408
1409 /* Try to find an existing Btree structure opened on zFilename. */
1410#ifndef SQLITE_OMIT_SHARED_CACHE
1411 if( pTsd->useSharedData && zFilename && !isMemdb ){
1412 char *zFullPathname = sqlite3Os.xFullPathname(zFilename);
1413 if( !zFullPathname ){
1414 sqliteFree(p);
1415 return SQLITE_NOMEM;
1416 }
1417 for(pBt=pTsd->pBtree; pBt; pBt=pBt->pNext){
1418 if( 0==strcmp(zFullPathname, sqlite3pager_filename(pBt->pPager)) ){
1419 p->pBt = pBt;
1420 *ppBtree = p;
1421 pBt->nRef++;
1422 sqliteFree(zFullPathname);
1423 return SQLITE_OK;
1424 }
1425 }
1426 sqliteFree(zFullPathname);
1427 }
1428#endif
drha059ad02001-04-17 20:09:11 +00001429
drhd62d3d02003-01-24 12:14:20 +00001430 /*
1431 ** The following asserts make sure that structures used by the btree are
1432 ** the right size. This is to guard against size changes that result
1433 ** when compiling on a different architecture.
1434 */
drh4a1c3802004-05-12 15:15:47 +00001435 assert( sizeof(i64)==8 );
drh9e572e62004-04-23 23:43:10 +00001436 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +00001437 assert( sizeof(u32)==4 );
1438 assert( sizeof(u16)==2 );
1439 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +00001440
drha059ad02001-04-17 20:09:11 +00001441 pBt = sqliteMalloc( sizeof(*pBt) );
1442 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +00001443 *ppBtree = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00001444 sqliteFree(p);
drha059ad02001-04-17 20:09:11 +00001445 return SQLITE_NOMEM;
1446 }
drh7bec5052005-02-06 02:45:41 +00001447 rc = sqlite3pager_open(&pBt->pPager, zFilename, EXTRA_SIZE, flags);
drha059ad02001-04-17 20:09:11 +00001448 if( rc!=SQLITE_OK ){
drha34b6762004-05-07 13:30:42 +00001449 if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +00001450 sqliteFree(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00001451 sqliteFree(p);
drha059ad02001-04-17 20:09:11 +00001452 *ppBtree = 0;
1453 return rc;
1454 }
danielk1977aef0bf62005-12-30 16:28:01 +00001455 p->pBt = pBt;
1456
drha34b6762004-05-07 13:30:42 +00001457 sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
drha6abd042004-06-09 17:37:22 +00001458 sqlite3pager_set_reiniter(pBt->pPager, pageReinit);
drha059ad02001-04-17 20:09:11 +00001459 pBt->pCursor = 0;
drha34b6762004-05-07 13:30:42 +00001460 pBt->pPage1 = 0;
1461 pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
drh90f5ecb2004-07-22 01:19:35 +00001462 sqlite3pager_read_fileheader(pBt->pPager, sizeof(zDbHeader), zDbHeader);
1463 pBt->pageSize = get2byte(&zDbHeader[16]);
drh07d183d2005-05-01 22:52:42 +00001464 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1465 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
drh90f5ecb2004-07-22 01:19:35 +00001466 pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1467 pBt->maxEmbedFrac = 64; /* 25% */
1468 pBt->minEmbedFrac = 32; /* 12.5% */
1469 pBt->minLeafFrac = 32; /* 12.5% */
drheee46cf2004-11-06 00:02:48 +00001470#ifndef SQLITE_OMIT_AUTOVACUUM
danielk197703aded42004-11-22 05:26:27 +00001471 /* If the magic name ":memory:" will create an in-memory database, then
1472 ** do not set the auto-vacuum flag, even if SQLITE_DEFAULT_AUTOVACUUM
1473 ** is true. On the other hand, if SQLITE_OMIT_MEMORYDB has been defined,
1474 ** then ":memory:" is just a regular file-name. Respect the auto-vacuum
1475 ** default in this case.
1476 */
danielk1977aef0bf62005-12-30 16:28:01 +00001477 if( zFilename && !isMemdb ){
danielk1977951af802004-11-05 15:45:09 +00001478 pBt->autoVacuum = SQLITE_DEFAULT_AUTOVACUUM;
1479 }
drheee46cf2004-11-06 00:02:48 +00001480#endif
drh90f5ecb2004-07-22 01:19:35 +00001481 nReserve = 0;
1482 }else{
1483 nReserve = zDbHeader[20];
1484 pBt->maxEmbedFrac = zDbHeader[21];
1485 pBt->minEmbedFrac = zDbHeader[22];
1486 pBt->minLeafFrac = zDbHeader[23];
1487 pBt->pageSizeFixed = 1;
danielk1977951af802004-11-05 15:45:09 +00001488#ifndef SQLITE_OMIT_AUTOVACUUM
1489 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1490#endif
drh90f5ecb2004-07-22 01:19:35 +00001491 }
1492 pBt->usableSize = pBt->pageSize - nReserve;
drh07d183d2005-05-01 22:52:42 +00001493 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
drh90f5ecb2004-07-22 01:19:35 +00001494 sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize);
danielk1977aef0bf62005-12-30 16:28:01 +00001495
1496#ifndef SQLITE_OMIT_SHARED_CACHE
1497 /* Add the new btree to the linked list starting at SqliteTsd.pBtree */
1498 if( pTsd->useSharedData && zFilename && !isMemdb ){
1499 pBt->pNext = pTsd->pBtree;
1500 pTsd->pBtree = pBt;
1501 }
1502 pBt->nRef = 1;
1503#endif
1504 *ppBtree = p;
drha059ad02001-04-17 20:09:11 +00001505 return SQLITE_OK;
1506}
1507
1508/*
1509** Close an open database and invalidate all cursors.
1510*/
danielk1977aef0bf62005-12-30 16:28:01 +00001511int sqlite3BtreeClose(Btree *p){
1512 SqliteTsd *pTsd = sqlite3Tsd();
1513 BtShared *pBt = p->pBt;
1514 BtCursor *pCur;
1515
1516 /* Drop any table-locks */
1517 unlockAllTables(p);
1518
1519 /* Close all cursors opened via this handle. */
1520 pCur = pBt->pCursor;
1521 while( pCur ){
1522 BtCursor *pTmp = pCur;
1523 pCur = pCur->pNext;
1524 if( pTmp->pBtree==p ){
1525 sqlite3BtreeCloseCursor(pTmp);
1526 }
drha059ad02001-04-17 20:09:11 +00001527 }
danielk1977aef0bf62005-12-30 16:28:01 +00001528
1529 sqliteFree(p);
1530
1531#ifndef SQLITE_OMIT_SHARED_CACHE
1532 /* If there are still other outstanding references to the shared-btree
1533 ** structure, return now. The remainder of this procedure cleans
1534 ** up the shared-btree.
1535 */
1536 assert( pBt->nRef>0 );
1537 pBt->nRef--;
1538 if( pBt->nRef ){
1539 return SQLITE_OK;
1540 }
1541
1542 /* Remove the shared-btree from the thread wide list */
1543 if( pTsd->pBtree==pBt ){
1544 pTsd->pBtree = pBt->pNext;
1545 }else{
1546 BtShared *pPrev;
1547 for(pPrev=pTsd->pBtree; pPrev && pPrev->pNext!=pBt; pPrev=pPrev->pNext);
1548 if( pPrev ){
1549 pPrev->pNext = pBt->pNext;
1550 }
1551 }
1552#endif
1553
1554 /* Close the pager and free the shared-btree structure */
1555 assert( !pBt->pCursor );
drha34b6762004-05-07 13:30:42 +00001556 sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +00001557 sqliteFree(pBt);
1558 return SQLITE_OK;
1559}
1560
1561/*
drh90f5ecb2004-07-22 01:19:35 +00001562** Change the busy handler callback function.
1563*/
danielk1977aef0bf62005-12-30 16:28:01 +00001564int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){
1565 BtShared *pBt = p->pBt;
drhb8ef32c2005-03-14 02:01:49 +00001566 pBt->pBusyHandler = pHandler;
drh90f5ecb2004-07-22 01:19:35 +00001567 sqlite3pager_set_busyhandler(pBt->pPager, pHandler);
1568 return SQLITE_OK;
1569}
1570
1571/*
drhda47d772002-12-02 04:25:19 +00001572** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +00001573**
1574** The maximum number of cache pages is set to the absolute
1575** value of mxPage. If mxPage is negative, the pager will
1576** operate asynchronously - it will not stop to do fsync()s
1577** to insure data is written to the disk surface before
1578** continuing. Transactions still work if synchronous is off,
1579** and the database cannot be corrupted if this program
1580** crashes. But if the operating system crashes or there is
1581** an abrupt power failure when synchronous is off, the database
1582** could be left in an inconsistent and unrecoverable state.
1583** Synchronous is on by default so database corruption is not
1584** normally a worry.
drhf57b14a2001-09-14 18:54:08 +00001585*/
danielk1977aef0bf62005-12-30 16:28:01 +00001586int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1587 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00001588 sqlite3pager_set_cachesize(pBt->pPager, mxPage);
drhf57b14a2001-09-14 18:54:08 +00001589 return SQLITE_OK;
1590}
1591
1592/*
drh973b6e32003-02-12 14:09:42 +00001593** Change the way data is synced to disk in order to increase or decrease
1594** how well the database resists damage due to OS crashes and power
1595** failures. Level 1 is the same as asynchronous (no syncs() occur and
1596** there is a high probability of damage) Level 2 is the default. There
1597** is a very low but non-zero probability of damage. Level 3 reduces the
1598** probability of damage to near zero but with a write performance reduction.
1599*/
danielk197793758c82005-01-21 08:13:14 +00001600#ifndef SQLITE_OMIT_PAGER_PRAGMAS
danielk1977aef0bf62005-12-30 16:28:01 +00001601int sqlite3BtreeSetSafetyLevel(Btree *p, int level){
1602 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00001603 sqlite3pager_set_safety_level(pBt->pPager, level);
drh973b6e32003-02-12 14:09:42 +00001604 return SQLITE_OK;
1605}
danielk197793758c82005-01-21 08:13:14 +00001606#endif
drh973b6e32003-02-12 14:09:42 +00001607
drh2c8997b2005-08-27 16:36:48 +00001608/*
1609** Return TRUE if the given btree is set to safety level 1. In other
1610** words, return TRUE if no sync() occurs on the disk files.
1611*/
danielk1977aef0bf62005-12-30 16:28:01 +00001612int sqlite3BtreeSyncDisabled(Btree *p){
1613 BtShared *pBt = p->pBt;
drh2c8997b2005-08-27 16:36:48 +00001614 assert( pBt && pBt->pPager );
1615 return sqlite3pager_nosync(pBt->pPager);
1616}
1617
danielk1977576ec6b2005-01-21 11:55:25 +00001618#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
drh973b6e32003-02-12 14:09:42 +00001619/*
drh90f5ecb2004-07-22 01:19:35 +00001620** Change the default pages size and the number of reserved bytes per page.
drh06f50212004-11-02 14:24:33 +00001621**
1622** The page size must be a power of 2 between 512 and 65536. If the page
1623** size supplied does not meet this constraint then the page size is not
1624** changed.
1625**
1626** Page sizes are constrained to be a power of two so that the region
1627** of the database file used for locking (beginning at PENDING_BYTE,
1628** the first byte past the 1GB boundary, 0x40000000) needs to occur
1629** at the beginning of a page.
danielk197728129562005-01-11 10:25:06 +00001630**
1631** If parameter nReserve is less than zero, then the number of reserved
1632** bytes per page is left unchanged.
drh90f5ecb2004-07-22 01:19:35 +00001633*/
danielk1977aef0bf62005-12-30 16:28:01 +00001634int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1635 BtShared *pBt = p->pBt;
drh90f5ecb2004-07-22 01:19:35 +00001636 if( pBt->pageSizeFixed ){
1637 return SQLITE_READONLY;
1638 }
1639 if( nReserve<0 ){
1640 nReserve = pBt->pageSize - pBt->usableSize;
1641 }
drh06f50212004-11-02 14:24:33 +00001642 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1643 ((pageSize-1)&pageSize)==0 ){
drh07d183d2005-05-01 22:52:42 +00001644 assert( (pageSize & 7)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00001645 assert( !pBt->pPage1 && !pBt->pCursor );
drh1c7880e2005-05-20 20:01:55 +00001646 pBt->pageSize = sqlite3pager_set_pagesize(pBt->pPager, pageSize);
drh90f5ecb2004-07-22 01:19:35 +00001647 }
1648 pBt->usableSize = pBt->pageSize - nReserve;
1649 return SQLITE_OK;
1650}
1651
1652/*
1653** Return the currently defined page size
1654*/
danielk1977aef0bf62005-12-30 16:28:01 +00001655int sqlite3BtreeGetPageSize(Btree *p){
1656 return p->pBt->pageSize;
drh90f5ecb2004-07-22 01:19:35 +00001657}
danielk1977aef0bf62005-12-30 16:28:01 +00001658int sqlite3BtreeGetReserve(Btree *p){
1659 return p->pBt->pageSize - p->pBt->usableSize;
drh2011d5f2004-07-22 02:40:37 +00001660}
danielk1977576ec6b2005-01-21 11:55:25 +00001661#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
drh90f5ecb2004-07-22 01:19:35 +00001662
1663/*
danielk1977951af802004-11-05 15:45:09 +00001664** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1665** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1666** is disabled. The default value for the auto-vacuum property is
1667** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1668*/
danielk1977aef0bf62005-12-30 16:28:01 +00001669int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1670 BtShared *pBt = p->pBt;;
danielk1977951af802004-11-05 15:45:09 +00001671#ifdef SQLITE_OMIT_AUTOVACUUM
drheee46cf2004-11-06 00:02:48 +00001672 return SQLITE_READONLY;
danielk1977951af802004-11-05 15:45:09 +00001673#else
1674 if( pBt->pageSizeFixed ){
1675 return SQLITE_READONLY;
1676 }
1677 pBt->autoVacuum = (autoVacuum?1:0);
1678 return SQLITE_OK;
1679#endif
1680}
1681
1682/*
1683** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1684** enabled 1 is returned. Otherwise 0.
1685*/
danielk1977aef0bf62005-12-30 16:28:01 +00001686int sqlite3BtreeGetAutoVacuum(Btree *p){
danielk1977951af802004-11-05 15:45:09 +00001687#ifdef SQLITE_OMIT_AUTOVACUUM
1688 return 0;
1689#else
danielk1977aef0bf62005-12-30 16:28:01 +00001690 return p->pBt->autoVacuum;
danielk1977951af802004-11-05 15:45:09 +00001691#endif
1692}
1693
1694
1695/*
drha34b6762004-05-07 13:30:42 +00001696** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +00001697** also acquire a readlock on that file.
1698**
1699** SQLITE_OK is returned on success. If the file is not a
1700** well-formed database file, then SQLITE_CORRUPT is returned.
1701** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1702** is returned if we run out of memory. SQLITE_PROTOCOL is returned
1703** if there is a locking protocol violation.
1704*/
danielk1977aef0bf62005-12-30 16:28:01 +00001705static int lockBtree(BtShared *pBt){
drh07d183d2005-05-01 22:52:42 +00001706 int rc, pageSize;
drh3aac2dd2004-04-26 14:10:20 +00001707 MemPage *pPage1;
drha34b6762004-05-07 13:30:42 +00001708 if( pBt->pPage1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001709 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +00001710 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001711
drh306dc212001-05-21 13:45:10 +00001712
1713 /* Do some checking to help insure the file we opened really is
1714 ** a valid database file.
1715 */
drhb6f41482004-05-14 01:58:11 +00001716 rc = SQLITE_NOTADB;
drha34b6762004-05-07 13:30:42 +00001717 if( sqlite3pager_pagecount(pBt->pPager)>0 ){
drhb6f41482004-05-14 01:58:11 +00001718 u8 *page1 = pPage1->aData;
1719 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00001720 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00001721 }
drhb6f41482004-05-14 01:58:11 +00001722 if( page1[18]>1 || page1[19]>1 ){
1723 goto page1_init_failed;
1724 }
drh07d183d2005-05-01 22:52:42 +00001725 pageSize = get2byte(&page1[16]);
1726 if( ((pageSize-1)&pageSize)!=0 ){
1727 goto page1_init_failed;
1728 }
1729 assert( (pageSize & 7)==0 );
1730 pBt->pageSize = pageSize;
1731 pBt->usableSize = pageSize - page1[20];
drhb6f41482004-05-14 01:58:11 +00001732 if( pBt->usableSize<500 ){
1733 goto page1_init_failed;
1734 }
1735 pBt->maxEmbedFrac = page1[21];
1736 pBt->minEmbedFrac = page1[22];
1737 pBt->minLeafFrac = page1[23];
drh057cd3a2005-02-15 16:23:02 +00001738#ifndef SQLITE_OMIT_AUTOVACUUM
1739 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1740#endif
drh306dc212001-05-21 13:45:10 +00001741 }
drhb6f41482004-05-14 01:58:11 +00001742
1743 /* maxLocal is the maximum amount of payload to store locally for
1744 ** a cell. Make sure it is small enough so that at least minFanout
1745 ** cells can will fit on one page. We assume a 10-byte page header.
1746 ** Besides the payload, the cell must store:
drh43605152004-05-29 21:46:49 +00001747 ** 2-byte pointer to the cell
drhb6f41482004-05-14 01:58:11 +00001748 ** 4-byte child pointer
1749 ** 9-byte nKey value
1750 ** 4-byte nData value
1751 ** 4-byte overflow page pointer
drh43605152004-05-29 21:46:49 +00001752 ** So a cell consists of a 2-byte poiner, a header which is as much as
1753 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1754 ** page pointer.
drhb6f41482004-05-14 01:58:11 +00001755 */
drh43605152004-05-29 21:46:49 +00001756 pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1757 pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1758 pBt->maxLeaf = pBt->usableSize - 35;
1759 pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
drhb6f41482004-05-14 01:58:11 +00001760 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1761 goto page1_init_failed;
1762 }
drh2e38c322004-09-03 18:38:44 +00001763 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00001764 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00001765 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00001766
drh72f82862001-05-24 21:06:34 +00001767page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00001768 releasePage(pPage1);
1769 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00001770 return rc;
drh306dc212001-05-21 13:45:10 +00001771}
1772
1773/*
drhb8ef32c2005-03-14 02:01:49 +00001774** This routine works like lockBtree() except that it also invokes the
1775** busy callback if there is lock contention.
1776*/
danielk1977aef0bf62005-12-30 16:28:01 +00001777static int lockBtreeWithRetry(Btree *pRef){
drhb8ef32c2005-03-14 02:01:49 +00001778 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00001779 if( pRef->inTrans==TRANS_NONE ){
1780 u8 inTransaction = pRef->pBt->inTransaction;
1781 btreeIntegrity(pRef);
1782 rc = sqlite3BtreeBeginTrans(pRef, 0);
1783 pRef->pBt->inTransaction = inTransaction;
1784 pRef->inTrans = TRANS_NONE;
1785 if( rc==SQLITE_OK ){
1786 pRef->pBt->nTransaction--;
1787 }
1788 btreeIntegrity(pRef);
drhb8ef32c2005-03-14 02:01:49 +00001789 }
1790 return rc;
1791}
1792
1793
1794/*
drhb8ca3072001-12-05 00:21:20 +00001795** If there are no outstanding cursors and we are not in the middle
1796** of a transaction but there is a read lock on the database, then
1797** this routine unrefs the first page of the database file which
1798** has the effect of releasing the read lock.
1799**
1800** If there are any outstanding cursors, this routine is a no-op.
1801**
1802** If there is a transaction in progress, this routine is a no-op.
1803*/
danielk1977aef0bf62005-12-30 16:28:01 +00001804static void unlockBtreeIfUnused(BtShared *pBt){
1805 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
drh51c6d962004-06-06 00:42:25 +00001806 if( pBt->pPage1->aData==0 ){
1807 MemPage *pPage = pBt->pPage1;
drh2646da72005-12-09 20:02:05 +00001808 pPage->aData = &((u8*)pPage)[-pBt->pageSize];
drh51c6d962004-06-06 00:42:25 +00001809 pPage->pBt = pBt;
1810 pPage->pgno = 1;
1811 }
drh3aac2dd2004-04-26 14:10:20 +00001812 releasePage(pBt->pPage1);
1813 pBt->pPage1 = 0;
drh3aac2dd2004-04-26 14:10:20 +00001814 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001815 }
1816}
1817
1818/*
drh9e572e62004-04-23 23:43:10 +00001819** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001820** file.
drh8b2f49b2001-06-08 00:21:52 +00001821*/
danielk1977aef0bf62005-12-30 16:28:01 +00001822static int newDatabase(BtShared *pBt){
drh9e572e62004-04-23 23:43:10 +00001823 MemPage *pP1;
1824 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001825 int rc;
drhde647132004-05-07 17:57:49 +00001826 if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001827 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001828 assert( pP1!=0 );
1829 data = pP1->aData;
drha34b6762004-05-07 13:30:42 +00001830 rc = sqlite3pager_write(data);
drh8b2f49b2001-06-08 00:21:52 +00001831 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001832 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1833 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00001834 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00001835 data[18] = 1;
1836 data[19] = 1;
drhb6f41482004-05-14 01:58:11 +00001837 data[20] = pBt->pageSize - pBt->usableSize;
1838 data[21] = pBt->maxEmbedFrac;
1839 data[22] = pBt->minEmbedFrac;
1840 data[23] = pBt->minLeafFrac;
1841 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00001842 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drhf2a611c2004-09-05 00:33:43 +00001843 pBt->pageSizeFixed = 1;
danielk1977003ba062004-11-04 02:57:33 +00001844#ifndef SQLITE_OMIT_AUTOVACUUM
1845 if( pBt->autoVacuum ){
1846 put4byte(&data[36 + 4*4], 1);
1847 }
1848#endif
drh8b2f49b2001-06-08 00:21:52 +00001849 return SQLITE_OK;
1850}
1851
1852/*
danielk1977ee5741e2004-05-31 10:01:34 +00001853** Attempt to start a new transaction. A write-transaction
drh684917c2004-10-05 02:41:42 +00001854** is started if the second argument is nonzero, otherwise a read-
1855** transaction. If the second argument is 2 or more and exclusive
1856** transaction is started, meaning that no other process is allowed
1857** to access the database. A preexisting transaction may not be
drhb8ef32c2005-03-14 02:01:49 +00001858** upgraded to exclusive by calling this routine a second time - the
drh684917c2004-10-05 02:41:42 +00001859** exclusivity flag only works for a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001860**
danielk1977ee5741e2004-05-31 10:01:34 +00001861** A write-transaction must be started before attempting any
1862** changes to the database. None of the following routines
1863** will work unless a transaction is started first:
drh8b2f49b2001-06-08 00:21:52 +00001864**
drh23e11ca2004-05-04 17:27:28 +00001865** sqlite3BtreeCreateTable()
1866** sqlite3BtreeCreateIndex()
1867** sqlite3BtreeClearTable()
1868** sqlite3BtreeDropTable()
1869** sqlite3BtreeInsert()
1870** sqlite3BtreeDelete()
1871** sqlite3BtreeUpdateMeta()
danielk197713adf8a2004-06-03 16:08:41 +00001872**
drhb8ef32c2005-03-14 02:01:49 +00001873** If an initial attempt to acquire the lock fails because of lock contention
1874** and the database was previously unlocked, then invoke the busy handler
1875** if there is one. But if there was previously a read-lock, do not
1876** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1877** returned when there is already a read-lock in order to avoid a deadlock.
1878**
1879** Suppose there are two processes A and B. A has a read lock and B has
1880** a reserved lock. B tries to promote to exclusive but is blocked because
1881** of A's read lock. A tries to promote to reserved but is blocked by B.
1882** One or the other of the two processes must give way or there can be
1883** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1884** when A already has a read lock, we encourage A to give up and let B
1885** proceed.
drha059ad02001-04-17 20:09:11 +00001886*/
danielk1977aef0bf62005-12-30 16:28:01 +00001887int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1888 BtShared *pBt = p->pBt;
danielk1977ee5741e2004-05-31 10:01:34 +00001889 int rc = SQLITE_OK;
1890
danielk1977aef0bf62005-12-30 16:28:01 +00001891 btreeIntegrity(p);
1892
danielk1977ee5741e2004-05-31 10:01:34 +00001893 /* If the btree is already in a write-transaction, or it
1894 ** is already in a read-transaction and a read-transaction
1895 ** is requested, this is a no-op.
1896 */
danielk1977aef0bf62005-12-30 16:28:01 +00001897 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
danielk1977ee5741e2004-05-31 10:01:34 +00001898 return SQLITE_OK;
1899 }
drhb8ef32c2005-03-14 02:01:49 +00001900
1901 /* Write transactions are not possible on a read-only database */
danielk1977ee5741e2004-05-31 10:01:34 +00001902 if( pBt->readOnly && wrflag ){
1903 return SQLITE_READONLY;
1904 }
1905
danielk1977aef0bf62005-12-30 16:28:01 +00001906 /* If another database handle has already opened a write transaction
1907 ** on this shared-btree structure and a second write transaction is
1908 ** requested, return SQLITE_BUSY.
1909 */
1910 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1911 return SQLITE_BUSY;
1912 }
1913
drhb8ef32c2005-03-14 02:01:49 +00001914 do {
1915 if( pBt->pPage1==0 ){
1916 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +00001917 }
drhb8ef32c2005-03-14 02:01:49 +00001918
1919 if( rc==SQLITE_OK && wrflag ){
1920 rc = sqlite3pager_begin(pBt->pPage1->aData, wrflag>1);
1921 if( rc==SQLITE_OK ){
1922 rc = newDatabase(pBt);
1923 }
1924 }
1925
1926 if( rc==SQLITE_OK ){
drhb8ef32c2005-03-14 02:01:49 +00001927 if( wrflag ) pBt->inStmt = 0;
1928 }else{
1929 unlockBtreeIfUnused(pBt);
1930 }
danielk1977aef0bf62005-12-30 16:28:01 +00001931 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
drha4afb652005-07-09 02:16:02 +00001932 sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
danielk1977aef0bf62005-12-30 16:28:01 +00001933
1934 if( rc==SQLITE_OK ){
1935 if( p->inTrans==TRANS_NONE ){
1936 pBt->nTransaction++;
1937 }
1938 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1939 if( p->inTrans>pBt->inTransaction ){
1940 pBt->inTransaction = p->inTrans;
1941 }
1942 }
1943
1944 btreeIntegrity(p);
drhb8ca3072001-12-05 00:21:20 +00001945 return rc;
drha059ad02001-04-17 20:09:11 +00001946}
1947
danielk1977687566d2004-11-02 12:56:41 +00001948#ifndef SQLITE_OMIT_AUTOVACUUM
1949
1950/*
1951** Set the pointer-map entries for all children of page pPage. Also, if
1952** pPage contains cells that point to overflow pages, set the pointer
1953** map entries for the overflow pages as well.
1954*/
1955static int setChildPtrmaps(MemPage *pPage){
1956 int i; /* Counter variable */
1957 int nCell; /* Number of cells in page pPage */
1958 int rc = SQLITE_OK; /* Return code */
danielk1977aef0bf62005-12-30 16:28:01 +00001959 BtShared *pBt = pPage->pBt;
danielk1977687566d2004-11-02 12:56:41 +00001960 int isInitOrig = pPage->isInit;
1961 Pgno pgno = pPage->pgno;
1962
1963 initPage(pPage, 0);
1964 nCell = pPage->nCell;
1965
1966 for(i=0; i<nCell; i++){
danielk1977687566d2004-11-02 12:56:41 +00001967 u8 *pCell = findCell(pPage, i);
1968
danielk197726836652005-01-17 01:33:13 +00001969 rc = ptrmapPutOvflPtr(pPage, pCell);
1970 if( rc!=SQLITE_OK ){
1971 goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00001972 }
danielk197726836652005-01-17 01:33:13 +00001973
danielk1977687566d2004-11-02 12:56:41 +00001974 if( !pPage->leaf ){
1975 Pgno childPgno = get4byte(pCell);
1976 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1977 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1978 }
1979 }
1980
1981 if( !pPage->leaf ){
1982 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1983 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1984 }
1985
1986set_child_ptrmaps_out:
1987 pPage->isInit = isInitOrig;
1988 return rc;
1989}
1990
1991/*
1992** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1993** page, is a pointer to page iFrom. Modify this pointer so that it points to
1994** iTo. Parameter eType describes the type of pointer to be modified, as
1995** follows:
1996**
1997** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1998** page of pPage.
1999**
2000** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2001** page pointed to by one of the cells on pPage.
2002**
2003** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2004** overflow page in the list.
2005*/
danielk1977fdb7cdb2005-01-17 02:12:18 +00002006static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
danielk1977687566d2004-11-02 12:56:41 +00002007 if( eType==PTRMAP_OVERFLOW2 ){
danielk1977f78fc082004-11-02 14:40:32 +00002008 /* The pointer is always the first 4 bytes of the page in this case. */
danielk1977fdb7cdb2005-01-17 02:12:18 +00002009 if( get4byte(pPage->aData)!=iFrom ){
drh49285702005-09-17 15:20:26 +00002010 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002011 }
danielk1977f78fc082004-11-02 14:40:32 +00002012 put4byte(pPage->aData, iTo);
danielk1977687566d2004-11-02 12:56:41 +00002013 }else{
2014 int isInitOrig = pPage->isInit;
2015 int i;
2016 int nCell;
2017
2018 initPage(pPage, 0);
2019 nCell = pPage->nCell;
2020
danielk1977687566d2004-11-02 12:56:41 +00002021 for(i=0; i<nCell; i++){
2022 u8 *pCell = findCell(pPage, i);
2023 if( eType==PTRMAP_OVERFLOW1 ){
2024 CellInfo info;
2025 parseCellPtr(pPage, pCell, &info);
2026 if( info.iOverflow ){
2027 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2028 put4byte(&pCell[info.iOverflow], iTo);
2029 break;
2030 }
2031 }
2032 }else{
2033 if( get4byte(pCell)==iFrom ){
2034 put4byte(pCell, iTo);
2035 break;
2036 }
2037 }
2038 }
2039
2040 if( i==nCell ){
danielk1977fdb7cdb2005-01-17 02:12:18 +00002041 if( eType!=PTRMAP_BTREE ||
2042 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
drh49285702005-09-17 15:20:26 +00002043 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002044 }
danielk1977687566d2004-11-02 12:56:41 +00002045 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2046 }
2047
2048 pPage->isInit = isInitOrig;
2049 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002050 return SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002051}
2052
danielk1977003ba062004-11-04 02:57:33 +00002053
danielk19777701e812005-01-10 12:59:51 +00002054/*
2055** Move the open database page pDbPage to location iFreePage in the
2056** database. The pDbPage reference remains valid.
2057*/
danielk1977003ba062004-11-04 02:57:33 +00002058static int relocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00002059 BtShared *pBt, /* Btree */
danielk19777701e812005-01-10 12:59:51 +00002060 MemPage *pDbPage, /* Open page to move */
2061 u8 eType, /* Pointer map 'type' entry for pDbPage */
2062 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2063 Pgno iFreePage /* The location to move pDbPage to */
danielk1977003ba062004-11-04 02:57:33 +00002064){
2065 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2066 Pgno iDbPage = pDbPage->pgno;
2067 Pager *pPager = pBt->pPager;
2068 int rc;
2069
danielk1977a0bf2652004-11-04 14:30:04 +00002070 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2071 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
danielk1977003ba062004-11-04 02:57:33 +00002072
2073 /* Move page iDbPage from it's current location to page number iFreePage */
2074 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2075 iDbPage, iFreePage, iPtrPage, eType));
2076 rc = sqlite3pager_movepage(pPager, pDbPage->aData, iFreePage);
2077 if( rc!=SQLITE_OK ){
2078 return rc;
2079 }
2080 pDbPage->pgno = iFreePage;
2081
2082 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2083 ** that point to overflow pages. The pointer map entries for all these
2084 ** pages need to be changed.
2085 **
2086 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2087 ** pointer to a subsequent overflow page. If this is the case, then
2088 ** the pointer map needs to be updated for the subsequent overflow page.
2089 */
danielk1977a0bf2652004-11-04 14:30:04 +00002090 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00002091 rc = setChildPtrmaps(pDbPage);
2092 if( rc!=SQLITE_OK ){
2093 return rc;
2094 }
2095 }else{
2096 Pgno nextOvfl = get4byte(pDbPage->aData);
2097 if( nextOvfl!=0 ){
danielk1977003ba062004-11-04 02:57:33 +00002098 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2099 if( rc!=SQLITE_OK ){
2100 return rc;
2101 }
2102 }
2103 }
2104
2105 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2106 ** that it points at iFreePage. Also fix the pointer map entry for
2107 ** iPtrPage.
2108 */
danielk1977a0bf2652004-11-04 14:30:04 +00002109 if( eType!=PTRMAP_ROOTPAGE ){
2110 rc = getPage(pBt, iPtrPage, &pPtrPage);
2111 if( rc!=SQLITE_OK ){
2112 return rc;
2113 }
2114 rc = sqlite3pager_write(pPtrPage->aData);
2115 if( rc!=SQLITE_OK ){
2116 releasePage(pPtrPage);
2117 return rc;
2118 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002119 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
danielk1977003ba062004-11-04 02:57:33 +00002120 releasePage(pPtrPage);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002121 if( rc==SQLITE_OK ){
2122 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2123 }
danielk1977003ba062004-11-04 02:57:33 +00002124 }
danielk1977003ba062004-11-04 02:57:33 +00002125 return rc;
2126}
2127
danielk1977687566d2004-11-02 12:56:41 +00002128/* Forward declaration required by autoVacuumCommit(). */
danielk1977aef0bf62005-12-30 16:28:01 +00002129static int allocatePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
danielk1977687566d2004-11-02 12:56:41 +00002130
2131/*
2132** This routine is called prior to sqlite3pager_commit when a transaction
2133** is commited for an auto-vacuum database.
2134*/
danielk1977aef0bf62005-12-30 16:28:01 +00002135static int autoVacuumCommit(BtShared *pBt, Pgno *nTrunc){
danielk1977687566d2004-11-02 12:56:41 +00002136 Pager *pPager = pBt->pPager;
2137 Pgno nFreeList; /* Number of pages remaining on the free-list. */
danielk1977a19df672004-11-03 11:37:07 +00002138 int nPtrMap; /* Number of pointer-map pages deallocated */
2139 Pgno origSize; /* Pages in the database file */
2140 Pgno finSize; /* Pages in the database file after truncation */
danielk1977687566d2004-11-02 12:56:41 +00002141 int rc; /* Return code */
2142 u8 eType;
danielk1977a19df672004-11-03 11:37:07 +00002143 int pgsz = pBt->pageSize; /* Page size for this database */
danielk1977687566d2004-11-02 12:56:41 +00002144 Pgno iDbPage; /* The database page to move */
danielk1977687566d2004-11-02 12:56:41 +00002145 MemPage *pDbMemPage = 0; /* "" */
2146 Pgno iPtrPage; /* The page that contains a pointer to iDbPage */
danielk1977687566d2004-11-02 12:56:41 +00002147 Pgno iFreePage; /* The free-list page to move iDbPage to */
2148 MemPage *pFreeMemPage = 0; /* "" */
2149
2150#ifndef NDEBUG
2151 int nRef = *sqlite3pager_stats(pPager);
2152#endif
2153
2154 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +00002155 if( PTRMAP_ISPAGE(pgsz, sqlite3pager_pagecount(pPager)) ){
drh49285702005-09-17 15:20:26 +00002156 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002157 }
danielk1977687566d2004-11-02 12:56:41 +00002158
2159 /* Figure out how many free-pages are in the database. If there are no
2160 ** free pages, then auto-vacuum is a no-op.
2161 */
2162 nFreeList = get4byte(&pBt->pPage1->aData[36]);
danielk1977a19df672004-11-03 11:37:07 +00002163 if( nFreeList==0 ){
danielk1977d761c0c2004-11-05 16:37:02 +00002164 *nTrunc = 0;
danielk1977a19df672004-11-03 11:37:07 +00002165 return SQLITE_OK;
2166 }
danielk1977687566d2004-11-02 12:56:41 +00002167
danielk1977a19df672004-11-03 11:37:07 +00002168 origSize = sqlite3pager_pagecount(pPager);
2169 nPtrMap = (nFreeList-origSize+PTRMAP_PAGENO(pgsz, origSize)+pgsz/5)/(pgsz/5);
2170 finSize = origSize - nFreeList - nPtrMap;
danielk1977fd5f5b62005-09-16 09:52:29 +00002171 if( origSize>=PENDING_BYTE_PAGE(pBt) && finSize<=PENDING_BYTE_PAGE(pBt) ){
danielk1977599fcba2004-11-08 07:13:13 +00002172 finSize--;
drh42cac6d2004-11-20 20:31:11 +00002173 if( PTRMAP_ISPAGE(pBt->usableSize, finSize) ){
danielk1977599fcba2004-11-08 07:13:13 +00002174 finSize--;
2175 }
2176 }
danielk1977a19df672004-11-03 11:37:07 +00002177 TRACE(("AUTOVACUUM: Begin (db size %d->%d)\n", origSize, finSize));
danielk1977687566d2004-11-02 12:56:41 +00002178
danielk1977a19df672004-11-03 11:37:07 +00002179 /* Variable 'finSize' will be the size of the file in pages after
danielk1977687566d2004-11-02 12:56:41 +00002180 ** the auto-vacuum has completed (the current file size minus the number
2181 ** of pages on the free list). Loop through the pages that lie beyond
2182 ** this mark, and if they are not already on the free list, move them
danielk1977a19df672004-11-03 11:37:07 +00002183 ** to a free page earlier in the file (somewhere before finSize).
danielk1977687566d2004-11-02 12:56:41 +00002184 */
danielk1977a19df672004-11-03 11:37:07 +00002185 for( iDbPage=finSize+1; iDbPage<=origSize; iDbPage++ ){
danielk1977599fcba2004-11-08 07:13:13 +00002186 /* If iDbPage is a pointer map page, or the pending-byte page, skip it. */
2187 if( PTRMAP_ISPAGE(pgsz, iDbPage) || iDbPage==PENDING_BYTE_PAGE(pBt) ){
2188 continue;
2189 }
2190
danielk1977687566d2004-11-02 12:56:41 +00002191 rc = ptrmapGet(pBt, iDbPage, &eType, &iPtrPage);
2192 if( rc!=SQLITE_OK ) goto autovacuum_out;
drhccae6022005-02-26 17:31:26 +00002193 if( eType==PTRMAP_ROOTPAGE ){
drh49285702005-09-17 15:20:26 +00002194 rc = SQLITE_CORRUPT_BKPT;
drhccae6022005-02-26 17:31:26 +00002195 goto autovacuum_out;
2196 }
danielk1977687566d2004-11-02 12:56:41 +00002197
danielk1977599fcba2004-11-08 07:13:13 +00002198 /* If iDbPage is free, do not swap it. */
2199 if( eType==PTRMAP_FREEPAGE ){
danielk1977687566d2004-11-02 12:56:41 +00002200 continue;
2201 }
2202 rc = getPage(pBt, iDbPage, &pDbMemPage);
2203 if( rc!=SQLITE_OK ) goto autovacuum_out;
danielk1977687566d2004-11-02 12:56:41 +00002204
2205 /* Find the next page in the free-list that is not already at the end
2206 ** of the file. A page can be pulled off the free list using the
2207 ** allocatePage() routine.
2208 */
2209 do{
2210 if( pFreeMemPage ){
2211 releasePage(pFreeMemPage);
2212 pFreeMemPage = 0;
2213 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00002214 rc = allocatePage(pBt, &pFreeMemPage, &iFreePage, 0, 0);
danielk1977ac11ee62005-01-15 12:45:51 +00002215 if( rc!=SQLITE_OK ){
2216 releasePage(pDbMemPage);
2217 goto autovacuum_out;
2218 }
danielk1977a19df672004-11-03 11:37:07 +00002219 assert( iFreePage<=origSize );
2220 }while( iFreePage>finSize );
danielk1977687566d2004-11-02 12:56:41 +00002221 releasePage(pFreeMemPage);
2222 pFreeMemPage = 0;
danielk1977687566d2004-11-02 12:56:41 +00002223
danielk1977003ba062004-11-04 02:57:33 +00002224 rc = relocatePage(pBt, pDbMemPage, eType, iPtrPage, iFreePage);
danielk1977687566d2004-11-02 12:56:41 +00002225 releasePage(pDbMemPage);
danielk1977687566d2004-11-02 12:56:41 +00002226 if( rc!=SQLITE_OK ) goto autovacuum_out;
danielk1977687566d2004-11-02 12:56:41 +00002227 }
2228
2229 /* The entire free-list has been swapped to the end of the file. So
danielk1977a19df672004-11-03 11:37:07 +00002230 ** truncate the database file to finSize pages and consider the
danielk1977687566d2004-11-02 12:56:41 +00002231 ** free-list empty.
2232 */
2233 rc = sqlite3pager_write(pBt->pPage1->aData);
2234 if( rc!=SQLITE_OK ) goto autovacuum_out;
2235 put4byte(&pBt->pPage1->aData[32], 0);
2236 put4byte(&pBt->pPage1->aData[36], 0);
danielk1977687566d2004-11-02 12:56:41 +00002237 if( rc!=SQLITE_OK ) goto autovacuum_out;
danielk1977d761c0c2004-11-05 16:37:02 +00002238 *nTrunc = finSize;
danielk1977687566d2004-11-02 12:56:41 +00002239
2240autovacuum_out:
danielk1977687566d2004-11-02 12:56:41 +00002241 assert( nRef==*sqlite3pager_stats(pPager) );
2242 if( rc!=SQLITE_OK ){
2243 sqlite3pager_rollback(pPager);
2244 }
2245 return rc;
2246}
2247#endif
2248
2249/*
drh2aa679f2001-06-25 02:11:07 +00002250** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00002251**
2252** This will release the write lock on the database file. If there
2253** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002254*/
danielk1977aef0bf62005-12-30 16:28:01 +00002255int sqlite3BtreeCommit(Btree *p){
danielk1977ee5741e2004-05-31 10:01:34 +00002256 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002257 BtShared *pBt = p->pBt;
2258
2259 btreeIntegrity(p);
2260 unlockAllTables(p);
2261
2262 /* If the handle has a write-transaction open, commit the shared-btrees
2263 ** transaction and set the shared state to TRANS_READ.
2264 */
2265 if( p->inTrans==TRANS_WRITE ){
2266 assert( pBt->inTransaction==TRANS_WRITE );
2267 assert( pBt->nTransaction>0 );
danielk1977ee5741e2004-05-31 10:01:34 +00002268 rc = sqlite3pager_commit(pBt->pPager);
danielk1977aef0bf62005-12-30 16:28:01 +00002269 pBt->inTransaction = TRANS_READ;
2270 pBt->inStmt = 0;
danielk1977ee5741e2004-05-31 10:01:34 +00002271 }
danielk1977aef0bf62005-12-30 16:28:01 +00002272
2273 /* If the handle has any kind of transaction open, decrement the transaction
2274 ** count of the shared btree. If the transaction count reaches 0, set
2275 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2276 ** will unlock the pager.
2277 */
2278 if( p->inTrans!=TRANS_NONE ){
2279 pBt->nTransaction--;
2280 if( 0==pBt->nTransaction ){
2281 pBt->inTransaction = TRANS_NONE;
2282 }
2283 }
2284
2285 /* Set the handles current transaction state to TRANS_NONE and unlock
2286 ** the pager if this call closed the only read or write transaction.
2287 */
2288 p->inTrans = TRANS_NONE;
drh5e00f6c2001-09-13 13:46:56 +00002289 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002290
2291 btreeIntegrity(p);
drha059ad02001-04-17 20:09:11 +00002292 return rc;
2293}
2294
danielk1977fbcd5852004-06-15 02:44:18 +00002295#ifndef NDEBUG
2296/*
2297** Return the number of write-cursors open on this handle. This is for use
2298** in assert() expressions, so it is only compiled if NDEBUG is not
2299** defined.
2300*/
danielk1977aef0bf62005-12-30 16:28:01 +00002301static int countWriteCursors(BtShared *pBt){
danielk1977fbcd5852004-06-15 02:44:18 +00002302 BtCursor *pCur;
2303 int r = 0;
2304 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
danielk1977aef0bf62005-12-30 16:28:01 +00002305 if( pCur->wrFlag ) r++;
danielk1977fbcd5852004-06-15 02:44:18 +00002306 }
2307 return r;
2308}
2309#endif
2310
drhda200cc2004-05-09 11:51:38 +00002311#ifdef SQLITE_TEST
2312/*
2313** Print debugging information about all cursors to standard output.
2314*/
danielk1977aef0bf62005-12-30 16:28:01 +00002315void sqlite3BtreeCursorList(Btree *p){
drhda200cc2004-05-09 11:51:38 +00002316 BtCursor *pCur;
danielk1977aef0bf62005-12-30 16:28:01 +00002317 BtShared *pBt = p->pBt;
drhda200cc2004-05-09 11:51:38 +00002318 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2319 MemPage *pPage = pCur->pPage;
2320 char *zMode = pCur->wrFlag ? "rw" : "ro";
drhfe63d1c2004-09-08 20:13:04 +00002321 sqlite3DebugPrintf("CURSOR %p rooted at %4d(%s) currently at %d.%d%s\n",
2322 pCur, pCur->pgnoRoot, zMode,
drhda200cc2004-05-09 11:51:38 +00002323 pPage ? pPage->pgno : 0, pCur->idx,
2324 pCur->isValid ? "" : " eof"
2325 );
2326 }
2327}
2328#endif
2329
drhc39e0002004-05-07 23:50:57 +00002330/*
drhecdc7532001-09-23 02:35:53 +00002331** Rollback the transaction in progress. All cursors will be
2332** invalided by this operation. Any attempt to use a cursor
2333** that was open at the beginning of this operation will result
2334** in an error.
drh5e00f6c2001-09-13 13:46:56 +00002335**
2336** This will release the write lock on the database file. If there
2337** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002338*/
danielk1977aef0bf62005-12-30 16:28:01 +00002339int sqlite3BtreeRollback(Btree *p){
danielk1977cfe9a692004-06-16 12:00:29 +00002340 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002341 BtShared *pBt = p->pBt;
drh24cd67e2004-05-10 16:18:47 +00002342 MemPage *pPage1;
danielk1977aef0bf62005-12-30 16:28:01 +00002343
2344 btreeIntegrity(p);
2345 unlockAllTables(p);
2346
2347 if( p->inTrans==TRANS_WRITE ){
2348 assert( TRANS_WRITE==pBt->inTransaction );
2349
drh24cd67e2004-05-10 16:18:47 +00002350 rc = sqlite3pager_rollback(pBt->pPager);
2351 /* The rollback may have destroyed the pPage1->aData value. So
2352 ** call getPage() on page 1 again to make sure pPage1->aData is
2353 ** set correctly. */
2354 if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
2355 releasePage(pPage1);
2356 }
danielk1977fbcd5852004-06-15 02:44:18 +00002357 assert( countWriteCursors(pBt)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00002358 pBt->inTransaction = TRANS_READ;
drh24cd67e2004-05-10 16:18:47 +00002359 }
danielk1977aef0bf62005-12-30 16:28:01 +00002360
2361 if( p->inTrans!=TRANS_NONE ){
2362 assert( pBt->nTransaction>0 );
2363 pBt->nTransaction--;
2364 if( 0==pBt->nTransaction ){
2365 pBt->inTransaction = TRANS_NONE;
2366 }
2367 }
2368
2369 p->inTrans = TRANS_NONE;
danielk1977ee5741e2004-05-31 10:01:34 +00002370 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00002371 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002372
2373 btreeIntegrity(p);
drha059ad02001-04-17 20:09:11 +00002374 return rc;
2375}
2376
2377/*
drhab01f612004-05-22 02:55:23 +00002378** Start a statement subtransaction. The subtransaction can
2379** can be rolled back independently of the main transaction.
2380** You must start a transaction before starting a subtransaction.
2381** The subtransaction is ended automatically if the main transaction
drh663fc632002-02-02 18:49:19 +00002382** commits or rolls back.
2383**
drhab01f612004-05-22 02:55:23 +00002384** Only one subtransaction may be active at a time. It is an error to try
2385** to start a new subtransaction if another subtransaction is already active.
2386**
2387** Statement subtransactions are used around individual SQL statements
2388** that are contained within a BEGIN...COMMIT block. If a constraint
2389** error occurs within the statement, the effect of that one statement
2390** can be rolled back without having to rollback the entire transaction.
drh663fc632002-02-02 18:49:19 +00002391*/
danielk1977aef0bf62005-12-30 16:28:01 +00002392int sqlite3BtreeBeginStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002393 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002394 BtShared *pBt = p->pBt;
2395 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00002396 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00002397 }
danielk1977aef0bf62005-12-30 16:28:01 +00002398 assert( pBt->inTransaction==TRANS_WRITE );
drha34b6762004-05-07 13:30:42 +00002399 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00002400 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00002401 return rc;
2402}
2403
2404
2405/*
drhab01f612004-05-22 02:55:23 +00002406** Commit the statment subtransaction currently in progress. If no
2407** subtransaction is active, this is a no-op.
drh663fc632002-02-02 18:49:19 +00002408*/
danielk1977aef0bf62005-12-30 16:28:01 +00002409int sqlite3BtreeCommitStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002410 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002411 BtShared *pBt = p->pBt;
drh3aac2dd2004-04-26 14:10:20 +00002412 if( pBt->inStmt && !pBt->readOnly ){
drha34b6762004-05-07 13:30:42 +00002413 rc = sqlite3pager_stmt_commit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00002414 }else{
2415 rc = SQLITE_OK;
2416 }
drh3aac2dd2004-04-26 14:10:20 +00002417 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00002418 return rc;
2419}
2420
2421/*
drhab01f612004-05-22 02:55:23 +00002422** Rollback the active statement subtransaction. If no subtransaction
2423** is active this routine is a no-op.
drh663fc632002-02-02 18:49:19 +00002424**
drhab01f612004-05-22 02:55:23 +00002425** All cursors will be invalidated by this operation. Any attempt
drh663fc632002-02-02 18:49:19 +00002426** to use a cursor that was open at the beginning of this operation
2427** will result in an error.
2428*/
danielk1977aef0bf62005-12-30 16:28:01 +00002429int sqlite3BtreeRollbackStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002430 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002431 BtShared *pBt = p->pBt;
drh3aac2dd2004-04-26 14:10:20 +00002432 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drha34b6762004-05-07 13:30:42 +00002433 rc = sqlite3pager_stmt_rollback(pBt->pPager);
danielk1977fbcd5852004-06-15 02:44:18 +00002434 assert( countWriteCursors(pBt)==0 );
drh3aac2dd2004-04-26 14:10:20 +00002435 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00002436 return rc;
2437}
2438
/*
** Default key comparison function to be used if no comparison function
** is specified on the sqlite3BtreeCursor() call.
**
** The two keys are compared with memcmp() over the length of the shorter
** one. If that prefix is identical the result is n1-n2, so the shorter
** key sorts first and equal-length identical keys compare as 0.
*/
static int dfltCompare(
  void *NotUsed,             /* User data is not used */
  int n1, const void *p1,    /* First key to compare */
  int n2, const void *p2     /* Second key to compare */
){
  int nPrefix = n2;          /* Number of bytes both keys have */
  int c;
  if( n1<n2 ){
    nPrefix = n1;
  }
  c = memcmp(p1, p2, nPrefix);
  return c!=0 ? c : n1 - n2;
}
2455
2456/*
drh8b2f49b2001-06-08 00:21:52 +00002457** Create a new cursor for the BTree whose root is on the page
2458** iTable. The act of acquiring a cursor gets a read lock on
2459** the database file.
drh1bee3d72001-10-15 00:44:35 +00002460**
2461** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00002462** If wrFlag==1, then the cursor can be used for reading or for
2463** writing if other conditions for writing are also met. These
2464** are the conditions that must be met in order for writing to
2465** be allowed:
drh6446c4d2001-12-15 14:22:18 +00002466**
drhf74b8d92002-09-01 23:20:45 +00002467** 1: The cursor must have been opened with wrFlag==1
2468**
2469** 2: No other cursors may be open with wrFlag==0 on the same table
2470**
2471** 3: The database must be writable (not on read-only media)
2472**
2473** 4: There must be an active transaction.
2474**
2475** Condition 2 warrants further discussion. If any cursor is opened
2476** on a table with wrFlag==0, that prevents all other cursors from
2477** writing to that table. This is a kind of "read-lock". When a cursor
2478** is opened with wrFlag==0 it is guaranteed that the table will not
2479** change as long as the cursor is open. This allows the cursor to
2480** do a sequential scan of the table without having to worry about
2481** entries being inserted or deleted during the scan. Cursors should
2482** be opened with wrFlag==0 only if this read-lock property is needed.
2483** That is to say, cursors should be opened with wrFlag==0 only if they
drh23e11ca2004-05-04 17:27:28 +00002484** intend to use the sqlite3BtreeNext() system call. All other cursors
drhf74b8d92002-09-01 23:20:45 +00002485** should be opened with wrFlag==1 even if they never really intend
2486** to write.
2487**
drh6446c4d2001-12-15 14:22:18 +00002488** No checking is done to make sure that page iTable really is the
2489** root page of a b-tree. If it is not, then the cursor acquired
2490** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00002491**
2492** The comparison function must be logically the same for every cursor
2493** on a particular table. Changing the comparison function will result
2494** in incorrect operations. If the comparison function is NULL, a
2495** default comparison function is used. The comparison function is
2496** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00002497*/
drh3aac2dd2004-04-26 14:10:20 +00002498int sqlite3BtreeCursor(
danielk1977aef0bf62005-12-30 16:28:01 +00002499 Btree *p, /* The btree */
drh3aac2dd2004-04-26 14:10:20 +00002500 int iTable, /* Root page of table to open */
2501 int wrFlag, /* 1 to write. 0 read-only */
2502 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2503 void *pArg, /* First arg to xCompare() */
2504 BtCursor **ppCur /* Write new cursor here */
2505){
drha059ad02001-04-17 20:09:11 +00002506 int rc;
drh8dcd7ca2004-08-08 19:43:29 +00002507 BtCursor *pCur;
danielk1977aef0bf62005-12-30 16:28:01 +00002508 BtShared *pBt = p->pBt;
drhecdc7532001-09-23 02:35:53 +00002509
drh8dcd7ca2004-08-08 19:43:29 +00002510 *ppCur = 0;
2511 if( wrFlag ){
drh8dcd7ca2004-08-08 19:43:29 +00002512 if( pBt->readOnly ){
2513 return SQLITE_READONLY;
2514 }
2515 if( checkReadLocks(pBt, iTable, 0) ){
2516 return SQLITE_LOCKED;
2517 }
drha0c9a112004-03-10 13:42:37 +00002518 }
danielk1977aef0bf62005-12-30 16:28:01 +00002519
2520#ifndef SQLITE_OMIT_SHARED_CACHE
2521 rc = queryTableLock(p, iTable, wrFlag?WRITE_LOCK:READ_LOCK);
2522 if( rc!=SQLITE_OK ){
2523 return rc;
2524 }
2525#endif
2526
drh4b70f112004-05-02 21:12:19 +00002527 if( pBt->pPage1==0 ){
danielk1977aef0bf62005-12-30 16:28:01 +00002528 rc = lockBtreeWithRetry(p);
drha059ad02001-04-17 20:09:11 +00002529 if( rc!=SQLITE_OK ){
drha059ad02001-04-17 20:09:11 +00002530 return rc;
2531 }
2532 }
drheafe05b2004-06-13 00:54:01 +00002533 pCur = sqliteMallocRaw( sizeof(*pCur) );
drha059ad02001-04-17 20:09:11 +00002534 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00002535 rc = SQLITE_NOMEM;
2536 goto create_cursor_exception;
2537 }
drh8b2f49b2001-06-08 00:21:52 +00002538 pCur->pgnoRoot = (Pgno)iTable;
danielk19776b456a22005-03-21 04:04:02 +00002539 pCur->pPage = 0; /* For exit-handler, in case getAndInitPage() fails. */
drh24cd67e2004-05-10 16:18:47 +00002540 if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
2541 rc = SQLITE_EMPTY;
2542 goto create_cursor_exception;
2543 }
drhde647132004-05-07 17:57:49 +00002544 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00002545 if( rc!=SQLITE_OK ){
2546 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00002547 }
danielk1977aef0bf62005-12-30 16:28:01 +00002548
2549 /* Obtain the table-lock on the shared-btree. */
2550 rc = lockTable(p, iTable, wrFlag?WRITE_LOCK:READ_LOCK);
2551 if( rc!=SQLITE_OK ){
2552 assert( rc==SQLITE_NOMEM );
2553 goto create_cursor_exception;
2554 }
2555
2556 /* Now that no other errors can occur, finish filling in the BtCursor
2557 ** variables, link the cursor into the BtShared list and set *ppCur (the
2558 ** output argument to this function).
2559 */
drh3aac2dd2004-04-26 14:10:20 +00002560 pCur->xCompare = xCmp ? xCmp : dfltCompare;
2561 pCur->pArg = pArg;
danielk1977aef0bf62005-12-30 16:28:01 +00002562 pCur->pBtree = p;
drhecdc7532001-09-23 02:35:53 +00002563 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00002564 pCur->idx = 0;
drh59eb6762004-06-13 23:07:04 +00002565 memset(&pCur->info, 0, sizeof(pCur->info));
drha059ad02001-04-17 20:09:11 +00002566 pCur->pNext = pBt->pCursor;
2567 if( pCur->pNext ){
2568 pCur->pNext->pPrev = pCur;
2569 }
drh14acc042001-06-10 19:56:58 +00002570 pCur->pPrev = 0;
drha059ad02001-04-17 20:09:11 +00002571 pBt->pCursor = pCur;
drhc39e0002004-05-07 23:50:57 +00002572 pCur->isValid = 0;
drh2af926b2001-05-15 00:39:25 +00002573 *ppCur = pCur;
drhbd03cae2001-06-02 02:40:57 +00002574
danielk1977aef0bf62005-12-30 16:28:01 +00002575 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00002576create_cursor_exception:
drhbd03cae2001-06-02 02:40:57 +00002577 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00002578 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00002579 sqliteFree(pCur);
2580 }
drh5e00f6c2001-09-13 13:46:56 +00002581 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00002582 return rc;
drha059ad02001-04-17 20:09:11 +00002583}
2584
drh7a224de2004-06-02 01:22:02 +00002585#if 0 /* Not Used */
drhd3d39e92004-05-20 22:16:29 +00002586/*
2587** Change the value of the comparison function used by a cursor.
2588*/
danielk1977bf3b7212004-05-18 10:06:24 +00002589void sqlite3BtreeSetCompare(
drhd3d39e92004-05-20 22:16:29 +00002590 BtCursor *pCur, /* The cursor to whose comparison function is changed */
2591 int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */
2592 void *pArg /* First argument to xCmp() */
danielk1977bf3b7212004-05-18 10:06:24 +00002593){
2594 pCur->xCompare = xCmp ? xCmp : dfltCompare;
2595 pCur->pArg = pArg;
2596}
drh7a224de2004-06-02 01:22:02 +00002597#endif
danielk1977bf3b7212004-05-18 10:06:24 +00002598
drha059ad02001-04-17 20:09:11 +00002599/*
drh5e00f6c2001-09-13 13:46:56 +00002600** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00002601** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00002602*/
drh3aac2dd2004-04-26 14:10:20 +00002603int sqlite3BtreeCloseCursor(BtCursor *pCur){
danielk1977aef0bf62005-12-30 16:28:01 +00002604 BtShared *pBt = pCur->pBtree->pBt;
drha059ad02001-04-17 20:09:11 +00002605 if( pCur->pPrev ){
2606 pCur->pPrev->pNext = pCur->pNext;
2607 }else{
2608 pBt->pCursor = pCur->pNext;
2609 }
2610 if( pCur->pNext ){
2611 pCur->pNext->pPrev = pCur->pPrev;
2612 }
drh3aac2dd2004-04-26 14:10:20 +00002613 releasePage(pCur->pPage);
drh5e00f6c2001-09-13 13:46:56 +00002614 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00002615 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00002616 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002617}
2618
drh7e3b0a02001-04-28 16:52:40 +00002619/*
drh5e2f8b92001-05-28 00:41:15 +00002620** Make a temporary cursor by filling in the fields of pTempCur.
2621** The temporary cursor is not on the cursor list for the Btree.
2622*/
drh14acc042001-06-10 19:56:58 +00002623static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00002624 memcpy(pTempCur, pCur, sizeof(*pCur));
2625 pTempCur->pNext = 0;
2626 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00002627 if( pTempCur->pPage ){
drha34b6762004-05-07 13:30:42 +00002628 sqlite3pager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00002629 }
drh5e2f8b92001-05-28 00:41:15 +00002630}
2631
2632/*
drhbd03cae2001-06-02 02:40:57 +00002633** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00002634** function above.
2635*/
drh14acc042001-06-10 19:56:58 +00002636static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00002637 if( pCur->pPage ){
drha34b6762004-05-07 13:30:42 +00002638 sqlite3pager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00002639 }
drh5e2f8b92001-05-28 00:41:15 +00002640}
2641
2642/*
drh9188b382004-05-14 21:12:22 +00002643** Make sure the BtCursor.info field of the given cursor is valid.
drhab01f612004-05-22 02:55:23 +00002644** If it is not already valid, call parseCell() to fill it in.
2645**
2646** BtCursor.info is a cache of the information in the current cell.
2647** Using this cache reduces the number of calls to parseCell().
drh9188b382004-05-14 21:12:22 +00002648*/
2649static void getCellInfo(BtCursor *pCur){
drh271efa52004-05-30 19:19:05 +00002650 if( pCur->info.nSize==0 ){
drh3a41a3f2004-05-30 02:14:17 +00002651 parseCell(pCur->pPage, pCur->idx, &pCur->info);
drh9188b382004-05-14 21:12:22 +00002652 }else{
2653#ifndef NDEBUG
2654 CellInfo info;
drh51c6d962004-06-06 00:42:25 +00002655 memset(&info, 0, sizeof(info));
drh3a41a3f2004-05-30 02:14:17 +00002656 parseCell(pCur->pPage, pCur->idx, &info);
drh9188b382004-05-14 21:12:22 +00002657 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2658#endif
2659 }
2660}
2661
2662/*
drh3aac2dd2004-04-26 14:10:20 +00002663** Set *pSize to the size of the buffer needed to hold the value of
2664** the key for the current entry. If the cursor is not pointing
2665** to a valid entry, *pSize is set to 0.
2666**
drh4b70f112004-05-02 21:12:19 +00002667** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00002668** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00002669*/
drh4a1c3802004-05-12 15:15:47 +00002670int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
danielk1977299b1872004-11-22 10:02:10 +00002671 if( !pCur->isValid ){
drh72f82862001-05-24 21:06:34 +00002672 *pSize = 0;
2673 }else{
drh9188b382004-05-14 21:12:22 +00002674 getCellInfo(pCur);
2675 *pSize = pCur->info.nKey;
drh72f82862001-05-24 21:06:34 +00002676 }
2677 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002678}
drh2af926b2001-05-15 00:39:25 +00002679
drh72f82862001-05-24 21:06:34 +00002680/*
drh0e1c19e2004-05-11 00:58:56 +00002681** Set *pSize to the number of bytes of data in the entry the
2682** cursor currently points to. Always return SQLITE_OK.
2683** Failure is not possible. If the cursor is not currently
2684** pointing to an entry (which can happen, for example, if
2685** the database is empty) then *pSize is set to 0.
2686*/
2687int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
danielk1977299b1872004-11-22 10:02:10 +00002688 if( !pCur->isValid ){
danielk197796fc5fe2004-05-13 11:34:16 +00002689 /* Not pointing at a valid entry - set *pSize to 0. */
drh0e1c19e2004-05-11 00:58:56 +00002690 *pSize = 0;
2691 }else{
drh9188b382004-05-14 21:12:22 +00002692 getCellInfo(pCur);
2693 *pSize = pCur->info.nData;
drh0e1c19e2004-05-11 00:58:56 +00002694 }
2695 return SQLITE_OK;
2696}
2697
2698/*
drh72f82862001-05-24 21:06:34 +00002699** Read payload information from the entry that the pCur cursor is
2700** pointing to. Begin reading the payload at "offset" and read
2701** a total of "amt" bytes. Put the result in zBuf.
2702**
2703** This routine does not make a distinction between key and data.
drhab01f612004-05-22 02:55:23 +00002704** It just reads bytes from the payload area. Data might appear
2705** on the main page or be scattered out on multiple overflow pages.
drh72f82862001-05-24 21:06:34 +00002706*/
drh3aac2dd2004-04-26 14:10:20 +00002707static int getPayload(
2708 BtCursor *pCur, /* Cursor pointing to entry to read from */
2709 int offset, /* Begin reading this far into payload */
2710 int amt, /* Read this many bytes */
2711 unsigned char *pBuf, /* Write the bytes into this buffer */
2712 int skipKey /* offset begins at data if this is true */
2713){
2714 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00002715 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00002716 int rc;
drh3aac2dd2004-04-26 14:10:20 +00002717 MemPage *pPage;
danielk1977aef0bf62005-12-30 16:28:01 +00002718 BtShared *pBt;
drh6f11bef2004-05-13 01:12:56 +00002719 int ovflSize;
drhfa1a98a2004-05-14 19:08:17 +00002720 u32 nKey;
drh3aac2dd2004-04-26 14:10:20 +00002721
drh72f82862001-05-24 21:06:34 +00002722 assert( pCur!=0 && pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00002723 assert( pCur->isValid );
danielk1977aef0bf62005-12-30 16:28:01 +00002724 pBt = pCur->pBtree->pBt;
drh3aac2dd2004-04-26 14:10:20 +00002725 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00002726 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00002727 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh9188b382004-05-14 21:12:22 +00002728 getCellInfo(pCur);
drh43605152004-05-29 21:46:49 +00002729 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00002730 aPayload += pCur->info.nHeader;
drh3aac2dd2004-04-26 14:10:20 +00002731 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00002732 nKey = 0;
2733 }else{
2734 nKey = pCur->info.nKey;
drh3aac2dd2004-04-26 14:10:20 +00002735 }
2736 assert( offset>=0 );
2737 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00002738 offset += nKey;
drh3aac2dd2004-04-26 14:10:20 +00002739 }
drhfa1a98a2004-05-14 19:08:17 +00002740 if( offset+amt > nKey+pCur->info.nData ){
drha34b6762004-05-07 13:30:42 +00002741 return SQLITE_ERROR;
drh3aac2dd2004-04-26 14:10:20 +00002742 }
drhfa1a98a2004-05-14 19:08:17 +00002743 if( offset<pCur->info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00002744 int a = amt;
drhfa1a98a2004-05-14 19:08:17 +00002745 if( a+offset>pCur->info.nLocal ){
2746 a = pCur->info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00002747 }
drha34b6762004-05-07 13:30:42 +00002748 memcpy(pBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00002749 if( a==amt ){
2750 return SQLITE_OK;
2751 }
drh2aa679f2001-06-25 02:11:07 +00002752 offset = 0;
drha34b6762004-05-07 13:30:42 +00002753 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00002754 amt -= a;
drhdd793422001-06-28 01:54:48 +00002755 }else{
drhfa1a98a2004-05-14 19:08:17 +00002756 offset -= pCur->info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00002757 }
danielk1977cfe9a692004-06-16 12:00:29 +00002758 ovflSize = pBt->usableSize - 4;
drhbd03cae2001-06-02 02:40:57 +00002759 if( amt>0 ){
drhfa1a98a2004-05-14 19:08:17 +00002760 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
danielk1977cfe9a692004-06-16 12:00:29 +00002761 while( amt>0 && nextPage ){
2762 rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
2763 if( rc!=0 ){
2764 return rc;
drh2af926b2001-05-15 00:39:25 +00002765 }
danielk1977cfe9a692004-06-16 12:00:29 +00002766 nextPage = get4byte(aPayload);
2767 if( offset<ovflSize ){
2768 int a = amt;
2769 if( a + offset > ovflSize ){
2770 a = ovflSize - offset;
2771 }
2772 memcpy(pBuf, &aPayload[offset+4], a);
2773 offset = 0;
2774 amt -= a;
2775 pBuf += a;
2776 }else{
2777 offset -= ovflSize;
2778 }
2779 sqlite3pager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00002780 }
drh2af926b2001-05-15 00:39:25 +00002781 }
danielk1977cfe9a692004-06-16 12:00:29 +00002782
drha7fcb052001-12-14 15:09:55 +00002783 if( amt>0 ){
drh49285702005-09-17 15:20:26 +00002784 return SQLITE_CORRUPT_BKPT;
drha7fcb052001-12-14 15:09:55 +00002785 }
2786 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00002787}
2788
drh72f82862001-05-24 21:06:34 +00002789/*
drh3aac2dd2004-04-26 14:10:20 +00002790** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00002791** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00002792** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00002793**
drh3aac2dd2004-04-26 14:10:20 +00002794** Return SQLITE_OK on success or an error code if anything goes
2795** wrong. An error is returned if "offset+amt" is larger than
2796** the available payload.
drh72f82862001-05-24 21:06:34 +00002797*/
drha34b6762004-05-07 13:30:42 +00002798int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
danielk197728129562005-01-11 10:25:06 +00002799 assert( pCur->isValid );
drhc39e0002004-05-07 23:50:57 +00002800 assert( pCur->pPage!=0 );
drh6575a222005-03-10 17:06:34 +00002801 if( pCur->pPage->intKey ){
drh49285702005-09-17 15:20:26 +00002802 return SQLITE_CORRUPT_BKPT;
drh6575a222005-03-10 17:06:34 +00002803 }
drhc39e0002004-05-07 23:50:57 +00002804 assert( pCur->pPage->intKey==0 );
2805 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00002806 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
2807}
2808
2809/*
drh3aac2dd2004-04-26 14:10:20 +00002810** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00002811** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00002812** begins at "offset".
2813**
2814** Return SQLITE_OK on success or an error code if anything goes
2815** wrong. An error is returned if "offset+amt" is larger than
2816** the available payload.
drh72f82862001-05-24 21:06:34 +00002817*/
drh3aac2dd2004-04-26 14:10:20 +00002818int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
danielk197728129562005-01-11 10:25:06 +00002819 assert( pCur->isValid );
drh8c1238a2003-01-02 14:43:55 +00002820 assert( pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00002821 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00002822 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00002823}
2824
drh72f82862001-05-24 21:06:34 +00002825/*
drh0e1c19e2004-05-11 00:58:56 +00002826** Return a pointer to payload information from the entry that the
2827** pCur cursor is pointing to. The pointer is to the beginning of
2828** the key if skipKey==0 and it points to the beginning of data if
drhe51c44f2004-05-30 20:46:09 +00002829** skipKey==1. The number of bytes of available key/data is written
2830** into *pAmt. If *pAmt==0, then the value returned will not be
2831** a valid pointer.
drh0e1c19e2004-05-11 00:58:56 +00002832**
2833** This routine is an optimization. It is common for the entire key
2834** and data to fit on the local page and for there to be no overflow
2835** pages. When that is so, this routine can be used to access the
2836** key and data without making a copy. If the key and/or data spills
2837** onto overflow pages, then getPayload() must be used to reassembly
2838** the key/data and copy it into a preallocated buffer.
2839**
2840** The pointer returned by this routine looks directly into the cached
2841** page of the database. The data might change or move the next time
2842** any btree routine is called.
2843*/
2844static const unsigned char *fetchPayload(
2845 BtCursor *pCur, /* Cursor pointing to entry to read from */
drhe51c44f2004-05-30 20:46:09 +00002846 int *pAmt, /* Write the number of available bytes here */
drh0e1c19e2004-05-11 00:58:56 +00002847 int skipKey /* read beginning at data if this is true */
2848){
2849 unsigned char *aPayload;
2850 MemPage *pPage;
drhfa1a98a2004-05-14 19:08:17 +00002851 u32 nKey;
2852 int nLocal;
drh0e1c19e2004-05-11 00:58:56 +00002853
2854 assert( pCur!=0 && pCur->pPage!=0 );
2855 assert( pCur->isValid );
drh0e1c19e2004-05-11 00:58:56 +00002856 pPage = pCur->pPage;
2857 pageIntegrity(pPage);
2858 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh9188b382004-05-14 21:12:22 +00002859 getCellInfo(pCur);
drh43605152004-05-29 21:46:49 +00002860 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00002861 aPayload += pCur->info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00002862 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00002863 nKey = 0;
2864 }else{
2865 nKey = pCur->info.nKey;
drh0e1c19e2004-05-11 00:58:56 +00002866 }
drh0e1c19e2004-05-11 00:58:56 +00002867 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00002868 aPayload += nKey;
2869 nLocal = pCur->info.nLocal - nKey;
drh0e1c19e2004-05-11 00:58:56 +00002870 }else{
drhfa1a98a2004-05-14 19:08:17 +00002871 nLocal = pCur->info.nLocal;
drhe51c44f2004-05-30 20:46:09 +00002872 if( nLocal>nKey ){
2873 nLocal = nKey;
2874 }
drh0e1c19e2004-05-11 00:58:56 +00002875 }
drhe51c44f2004-05-30 20:46:09 +00002876 *pAmt = nLocal;
drh0e1c19e2004-05-11 00:58:56 +00002877 return aPayload;
2878}
2879
2880
2881/*
drhe51c44f2004-05-30 20:46:09 +00002882** For the entry that cursor pCur is point to, return as
2883** many bytes of the key or data as are available on the local
2884** b-tree page. Write the number of available bytes into *pAmt.
drh0e1c19e2004-05-11 00:58:56 +00002885**
2886** The pointer returned is ephemeral. The key/data may move
2887** or be destroyed on the next call to any Btree routine.
2888**
2889** These routines is used to get quick access to key and data
2890** in the common case where no overflow pages are used.
drh0e1c19e2004-05-11 00:58:56 +00002891*/
drhe51c44f2004-05-30 20:46:09 +00002892const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
2893 return (const void*)fetchPayload(pCur, pAmt, 0);
drh0e1c19e2004-05-11 00:58:56 +00002894}
drhe51c44f2004-05-30 20:46:09 +00002895const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
2896 return (const void*)fetchPayload(pCur, pAmt, 1);
drh0e1c19e2004-05-11 00:58:56 +00002897}
2898
2899
2900/*
drh8178a752003-01-05 21:41:40 +00002901** Move the cursor down to a new child page. The newPgno argument is the
drhab01f612004-05-22 02:55:23 +00002902** page number of the child page to move to.
drh72f82862001-05-24 21:06:34 +00002903*/
drh3aac2dd2004-04-26 14:10:20 +00002904static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00002905 int rc;
2906 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00002907 MemPage *pOldPage;
danielk1977aef0bf62005-12-30 16:28:01 +00002908 BtShared *pBt = pCur->pBtree->pBt;
drh72f82862001-05-24 21:06:34 +00002909
drhc39e0002004-05-07 23:50:57 +00002910 assert( pCur->isValid );
drhde647132004-05-07 17:57:49 +00002911 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00002912 if( rc ) return rc;
drhda200cc2004-05-09 11:51:38 +00002913 pageIntegrity(pNewPage);
drh428ae8c2003-01-04 16:48:09 +00002914 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00002915 pOldPage = pCur->pPage;
2916 pOldPage->idxShift = 0;
2917 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00002918 pCur->pPage = pNewPage;
2919 pCur->idx = 0;
drh271efa52004-05-30 19:19:05 +00002920 pCur->info.nSize = 0;
drh4be295b2003-12-16 03:44:47 +00002921 if( pNewPage->nCell<1 ){
drh49285702005-09-17 15:20:26 +00002922 return SQLITE_CORRUPT_BKPT;
drh4be295b2003-12-16 03:44:47 +00002923 }
drh72f82862001-05-24 21:06:34 +00002924 return SQLITE_OK;
2925}
2926
2927/*
drh8856d6a2004-04-29 14:42:46 +00002928** Return true if the page is the virtual root of its table.
2929**
2930** The virtual root page is the root page for most tables. But
2931** for the table rooted on page 1, sometime the real root page
2932** is empty except for the right-pointer. In such cases the
2933** virtual root page is the page that the right-pointer of page
2934** 1 is pointing to.
2935*/
2936static int isRootPage(MemPage *pPage){
2937 MemPage *pParent = pPage->pParent;
drhda200cc2004-05-09 11:51:38 +00002938 if( pParent==0 ) return 1;
2939 if( pParent->pgno>1 ) return 0;
2940 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
drh8856d6a2004-04-29 14:42:46 +00002941 return 0;
2942}
2943
2944/*
drh5e2f8b92001-05-28 00:41:15 +00002945** Move the cursor up to the parent page.
2946**
2947** pCur->idx is set to the cell index that contains the pointer
2948** to the page we are coming from. If we are coming from the
2949** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00002950** the largest cell index.
drh72f82862001-05-24 21:06:34 +00002951*/
drh8178a752003-01-05 21:41:40 +00002952static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00002953 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00002954 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00002955 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00002956
drhc39e0002004-05-07 23:50:57 +00002957 assert( pCur->isValid );
drh8178a752003-01-05 21:41:40 +00002958 pPage = pCur->pPage;
2959 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00002960 assert( !isRootPage(pPage) );
drhda200cc2004-05-09 11:51:38 +00002961 pageIntegrity(pPage);
drh8178a752003-01-05 21:41:40 +00002962 pParent = pPage->pParent;
2963 assert( pParent!=0 );
drhda200cc2004-05-09 11:51:38 +00002964 pageIntegrity(pParent);
drh8178a752003-01-05 21:41:40 +00002965 idxParent = pPage->idxParent;
drha34b6762004-05-07 13:30:42 +00002966 sqlite3pager_ref(pParent->aData);
drh3aac2dd2004-04-26 14:10:20 +00002967 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00002968 pCur->pPage = pParent;
drh271efa52004-05-30 19:19:05 +00002969 pCur->info.nSize = 0;
drh428ae8c2003-01-04 16:48:09 +00002970 assert( pParent->idxShift==0 );
drh43605152004-05-29 21:46:49 +00002971 pCur->idx = idxParent;
drh72f82862001-05-24 21:06:34 +00002972}
2973
2974/*
2975** Move the cursor to the root page
2976*/
drh5e2f8b92001-05-28 00:41:15 +00002977static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00002978 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00002979 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002980 BtShared *pBt = pCur->pBtree->pBt;
drhbd03cae2001-06-02 02:40:57 +00002981
drhde647132004-05-07 17:57:49 +00002982 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
drhc39e0002004-05-07 23:50:57 +00002983 if( rc ){
2984 pCur->isValid = 0;
2985 return rc;
2986 }
drh3aac2dd2004-04-26 14:10:20 +00002987 releasePage(pCur->pPage);
drhda200cc2004-05-09 11:51:38 +00002988 pageIntegrity(pRoot);
drh3aac2dd2004-04-26 14:10:20 +00002989 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00002990 pCur->idx = 0;
drh271efa52004-05-30 19:19:05 +00002991 pCur->info.nSize = 0;
drh8856d6a2004-04-29 14:42:46 +00002992 if( pRoot->nCell==0 && !pRoot->leaf ){
2993 Pgno subpage;
2994 assert( pRoot->pgno==1 );
drh43605152004-05-29 21:46:49 +00002995 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
drh8856d6a2004-04-29 14:42:46 +00002996 assert( subpage>0 );
drh3644f082004-05-10 18:45:09 +00002997 pCur->isValid = 1;
drh4b70f112004-05-02 21:12:19 +00002998 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00002999 }
drhc39e0002004-05-07 23:50:57 +00003000 pCur->isValid = pCur->pPage->nCell>0;
drh8856d6a2004-04-29 14:42:46 +00003001 return rc;
drh72f82862001-05-24 21:06:34 +00003002}
drh2af926b2001-05-15 00:39:25 +00003003
drh5e2f8b92001-05-28 00:41:15 +00003004/*
3005** Move the cursor down to the left-most leaf entry beneath the
3006** entry to which it is currently pointing.
3007*/
3008static int moveToLeftmost(BtCursor *pCur){
3009 Pgno pgno;
3010 int rc;
drh3aac2dd2004-04-26 14:10:20 +00003011 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00003012
drhc39e0002004-05-07 23:50:57 +00003013 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00003014 while( !(pPage = pCur->pPage)->leaf ){
drha34b6762004-05-07 13:30:42 +00003015 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00003016 pgno = get4byte(findCell(pPage, pCur->idx));
drh8178a752003-01-05 21:41:40 +00003017 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00003018 if( rc ) return rc;
3019 }
3020 return SQLITE_OK;
3021}
3022
drh2dcc9aa2002-12-04 13:40:25 +00003023/*
3024** Move the cursor down to the right-most leaf entry beneath the
3025** page to which it is currently pointing. Notice the difference
3026** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3027** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3028** finds the right-most entry beneath the *page*.
3029*/
3030static int moveToRightmost(BtCursor *pCur){
3031 Pgno pgno;
3032 int rc;
drh3aac2dd2004-04-26 14:10:20 +00003033 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00003034
drhc39e0002004-05-07 23:50:57 +00003035 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00003036 while( !(pPage = pCur->pPage)->leaf ){
drh43605152004-05-29 21:46:49 +00003037 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh3aac2dd2004-04-26 14:10:20 +00003038 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00003039 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00003040 if( rc ) return rc;
3041 }
drh3aac2dd2004-04-26 14:10:20 +00003042 pCur->idx = pPage->nCell - 1;
drh271efa52004-05-30 19:19:05 +00003043 pCur->info.nSize = 0;
drh2dcc9aa2002-12-04 13:40:25 +00003044 return SQLITE_OK;
3045}
3046
drh5e00f6c2001-09-13 13:46:56 +00003047/* Move the cursor to the first entry in the table. Return SQLITE_OK
3048** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003049** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00003050*/
drh3aac2dd2004-04-26 14:10:20 +00003051int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00003052 int rc;
3053 rc = moveToRoot(pCur);
3054 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00003055 if( pCur->isValid==0 ){
3056 assert( pCur->pPage->nCell==0 );
drh5e00f6c2001-09-13 13:46:56 +00003057 *pRes = 1;
3058 return SQLITE_OK;
3059 }
drhc39e0002004-05-07 23:50:57 +00003060 assert( pCur->pPage->nCell>0 );
drh5e00f6c2001-09-13 13:46:56 +00003061 *pRes = 0;
3062 rc = moveToLeftmost(pCur);
3063 return rc;
3064}
drh5e2f8b92001-05-28 00:41:15 +00003065
drh9562b552002-02-19 15:00:07 +00003066/* Move the cursor to the last entry in the table. Return SQLITE_OK
3067** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003068** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00003069*/
drh3aac2dd2004-04-26 14:10:20 +00003070int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00003071 int rc;
drh9562b552002-02-19 15:00:07 +00003072 rc = moveToRoot(pCur);
3073 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00003074 if( pCur->isValid==0 ){
3075 assert( pCur->pPage->nCell==0 );
drh9562b552002-02-19 15:00:07 +00003076 *pRes = 1;
3077 return SQLITE_OK;
3078 }
drhc39e0002004-05-07 23:50:57 +00003079 assert( pCur->isValid );
drh9562b552002-02-19 15:00:07 +00003080 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00003081 rc = moveToRightmost(pCur);
drh9562b552002-02-19 15:00:07 +00003082 return rc;
3083}
3084
drh3aac2dd2004-04-26 14:10:20 +00003085/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00003086** Return a success code.
3087**
drh3aac2dd2004-04-26 14:10:20 +00003088** For INTKEY tables, only the nKey parameter is used. pKey is
3089** ignored. For other tables, nKey is the number of bytes of data
drh0b2f3162005-12-21 18:36:45 +00003090** in pKey. The comparison function specified when the cursor was
drh3aac2dd2004-04-26 14:10:20 +00003091** created is used to compare keys.
3092**
drh5e2f8b92001-05-28 00:41:15 +00003093** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00003094** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00003095** were present. The cursor might point to an entry that comes
3096** before or after the key.
3097**
drhbd03cae2001-06-02 02:40:57 +00003098** The result of comparing the key with the entry to which the
drhab01f612004-05-22 02:55:23 +00003099** cursor is written to *pRes if pRes!=NULL. The meaning of
drhbd03cae2001-06-02 02:40:57 +00003100** this value is as follows:
3101**
3102** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00003103** is smaller than pKey or if the table is empty
3104** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00003105**
3106** *pRes==0 The cursor is left pointing at an entry that
3107** exactly matches pKey.
3108**
3109** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00003110** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00003111*/
drh4a1c3802004-05-12 15:15:47 +00003112int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00003113 int rc;
drh5e2f8b92001-05-28 00:41:15 +00003114 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00003115 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00003116 assert( pCur->pPage );
3117 assert( pCur->pPage->isInit );
3118 if( pCur->isValid==0 ){
drhf328bc82004-05-10 23:29:49 +00003119 *pRes = -1;
drhc39e0002004-05-07 23:50:57 +00003120 assert( pCur->pPage->nCell==0 );
3121 return SQLITE_OK;
3122 }
drh4eec4c12005-01-21 00:22:37 +00003123 for(;;){
drh72f82862001-05-24 21:06:34 +00003124 int lwr, upr;
3125 Pgno chldPg;
3126 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00003127 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00003128 lwr = 0;
3129 upr = pPage->nCell-1;
drh4eec4c12005-01-21 00:22:37 +00003130 if( !pPage->intKey && pKey==0 ){
drh49285702005-09-17 15:20:26 +00003131 return SQLITE_CORRUPT_BKPT;
drh4eec4c12005-01-21 00:22:37 +00003132 }
drhda200cc2004-05-09 11:51:38 +00003133 pageIntegrity(pPage);
drh72f82862001-05-24 21:06:34 +00003134 while( lwr<=upr ){
danielk197713adf8a2004-06-03 16:08:41 +00003135 void *pCellKey;
drh4a1c3802004-05-12 15:15:47 +00003136 i64 nCellKey;
drh72f82862001-05-24 21:06:34 +00003137 pCur->idx = (lwr+upr)/2;
drh271efa52004-05-30 19:19:05 +00003138 pCur->info.nSize = 0;
drhde647132004-05-07 17:57:49 +00003139 sqlite3BtreeKeySize(pCur, &nCellKey);
drh3aac2dd2004-04-26 14:10:20 +00003140 if( pPage->intKey ){
3141 if( nCellKey<nKey ){
3142 c = -1;
3143 }else if( nCellKey>nKey ){
3144 c = +1;
3145 }else{
3146 c = 0;
3147 }
drh3aac2dd2004-04-26 14:10:20 +00003148 }else{
drhe51c44f2004-05-30 20:46:09 +00003149 int available;
danielk197713adf8a2004-06-03 16:08:41 +00003150 pCellKey = (void *)fetchPayload(pCur, &available, 0);
drhe51c44f2004-05-30 20:46:09 +00003151 if( available>=nCellKey ){
3152 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3153 }else{
3154 pCellKey = sqliteMallocRaw( nCellKey );
3155 if( pCellKey==0 ) return SQLITE_NOMEM;
danielk197713adf8a2004-06-03 16:08:41 +00003156 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
drhe51c44f2004-05-30 20:46:09 +00003157 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3158 sqliteFree(pCellKey);
3159 if( rc ) return rc;
3160 }
drh3aac2dd2004-04-26 14:10:20 +00003161 }
drh72f82862001-05-24 21:06:34 +00003162 if( c==0 ){
drh8b18dd42004-05-12 19:18:15 +00003163 if( pPage->leafData && !pPage->leaf ){
drhfc70e6f2004-05-12 21:11:27 +00003164 lwr = pCur->idx;
3165 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00003166 break;
3167 }else{
drh8b18dd42004-05-12 19:18:15 +00003168 if( pRes ) *pRes = 0;
3169 return SQLITE_OK;
3170 }
drh72f82862001-05-24 21:06:34 +00003171 }
3172 if( c<0 ){
3173 lwr = pCur->idx+1;
3174 }else{
3175 upr = pCur->idx-1;
3176 }
3177 }
3178 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00003179 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00003180 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003181 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00003182 }else if( lwr>=pPage->nCell ){
drh43605152004-05-29 21:46:49 +00003183 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh72f82862001-05-24 21:06:34 +00003184 }else{
drh43605152004-05-29 21:46:49 +00003185 chldPg = get4byte(findCell(pPage, lwr));
drh72f82862001-05-24 21:06:34 +00003186 }
3187 if( chldPg==0 ){
drhc39e0002004-05-07 23:50:57 +00003188 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh72f82862001-05-24 21:06:34 +00003189 if( pRes ) *pRes = c;
3190 return SQLITE_OK;
3191 }
drh428ae8c2003-01-04 16:48:09 +00003192 pCur->idx = lwr;
drh271efa52004-05-30 19:19:05 +00003193 pCur->info.nSize = 0;
drh8178a752003-01-05 21:41:40 +00003194 rc = moveToChild(pCur, chldPg);
drhc39e0002004-05-07 23:50:57 +00003195 if( rc ){
3196 return rc;
3197 }
drh72f82862001-05-24 21:06:34 +00003198 }
drhbd03cae2001-06-02 02:40:57 +00003199 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00003200}
3201
3202/*
drhc39e0002004-05-07 23:50:57 +00003203** Return TRUE if the cursor is not pointing at an entry of the table.
3204**
3205** TRUE will be returned after a call to sqlite3BtreeNext() moves
3206** past the last entry in the table or sqlite3BtreePrev() moves past
3207** the first entry. TRUE is also returned if the table is empty.
3208*/
3209int sqlite3BtreeEof(BtCursor *pCur){
3210 return pCur->isValid==0;
3211}
3212
3213/*
drhbd03cae2001-06-02 02:40:57 +00003214** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00003215** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00003216** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00003217** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00003218*/
drh3aac2dd2004-04-26 14:10:20 +00003219int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00003220 int rc;
drh8178a752003-01-05 21:41:40 +00003221 MemPage *pPage = pCur->pPage;
drh8b18dd42004-05-12 19:18:15 +00003222
drh8c1238a2003-01-02 14:43:55 +00003223 assert( pRes!=0 );
drhc39e0002004-05-07 23:50:57 +00003224 if( pCur->isValid==0 ){
drh8c1238a2003-01-02 14:43:55 +00003225 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00003226 return SQLITE_OK;
drhecdc7532001-09-23 02:35:53 +00003227 }
drh8178a752003-01-05 21:41:40 +00003228 assert( pPage->isInit );
drh8178a752003-01-05 21:41:40 +00003229 assert( pCur->idx<pPage->nCell );
danielk19776a43f9b2004-11-16 04:57:24 +00003230
drh72f82862001-05-24 21:06:34 +00003231 pCur->idx++;
drh271efa52004-05-30 19:19:05 +00003232 pCur->info.nSize = 0;
drh8178a752003-01-05 21:41:40 +00003233 if( pCur->idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00003234 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00003235 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
drh5e2f8b92001-05-28 00:41:15 +00003236 if( rc ) return rc;
3237 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003238 *pRes = 0;
3239 return rc;
drh72f82862001-05-24 21:06:34 +00003240 }
drh5e2f8b92001-05-28 00:41:15 +00003241 do{
drh8856d6a2004-04-29 14:42:46 +00003242 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00003243 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00003244 pCur->isValid = 0;
drh5e2f8b92001-05-28 00:41:15 +00003245 return SQLITE_OK;
3246 }
drh8178a752003-01-05 21:41:40 +00003247 moveToParent(pCur);
3248 pPage = pCur->pPage;
3249 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00003250 *pRes = 0;
drh8b18dd42004-05-12 19:18:15 +00003251 if( pPage->leafData ){
3252 rc = sqlite3BtreeNext(pCur, pRes);
3253 }else{
3254 rc = SQLITE_OK;
3255 }
3256 return rc;
drh8178a752003-01-05 21:41:40 +00003257 }
3258 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00003259 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00003260 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00003261 }
drh5e2f8b92001-05-28 00:41:15 +00003262 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003263 return rc;
drh72f82862001-05-24 21:06:34 +00003264}
3265
drh3b7511c2001-05-26 13:15:44 +00003266/*
drh2dcc9aa2002-12-04 13:40:25 +00003267** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00003268** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00003269** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00003270** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00003271*/
drh3aac2dd2004-04-26 14:10:20 +00003272int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00003273 int rc;
3274 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00003275 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00003276 if( pCur->isValid==0 ){
3277 *pRes = 1;
3278 return SQLITE_OK;
3279 }
danielk19776a43f9b2004-11-16 04:57:24 +00003280
drh8178a752003-01-05 21:41:40 +00003281 pPage = pCur->pPage;
drh8178a752003-01-05 21:41:40 +00003282 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00003283 assert( pCur->idx>=0 );
drha34b6762004-05-07 13:30:42 +00003284 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00003285 pgno = get4byte( findCell(pPage, pCur->idx) );
drh8178a752003-01-05 21:41:40 +00003286 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00003287 if( rc ) return rc;
3288 rc = moveToRightmost(pCur);
3289 }else{
3290 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00003291 if( isRootPage(pPage) ){
drhc39e0002004-05-07 23:50:57 +00003292 pCur->isValid = 0;
3293 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00003294 return SQLITE_OK;
3295 }
drh8178a752003-01-05 21:41:40 +00003296 moveToParent(pCur);
3297 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00003298 }
3299 pCur->idx--;
drh271efa52004-05-30 19:19:05 +00003300 pCur->info.nSize = 0;
drh8237d452004-11-22 19:07:09 +00003301 if( pPage->leafData && !pPage->leaf ){
drh8b18dd42004-05-12 19:18:15 +00003302 rc = sqlite3BtreePrevious(pCur, pRes);
3303 }else{
3304 rc = SQLITE_OK;
3305 }
drh2dcc9aa2002-12-04 13:40:25 +00003306 }
drh8178a752003-01-05 21:41:40 +00003307 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00003308 return rc;
3309}
3310
3311/*
drh3b7511c2001-05-26 13:15:44 +00003312** Allocate a new page from the database file.
3313**
drha34b6762004-05-07 13:30:42 +00003314** The new page is marked as dirty. (In other words, sqlite3pager_write()
drh3b7511c2001-05-26 13:15:44 +00003315** has already been called on the new page.) The new page has also
3316** been referenced and the calling routine is responsible for calling
drha34b6762004-05-07 13:30:42 +00003317** sqlite3pager_unref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00003318**
3319** SQLITE_OK is returned on success. Any other return value indicates
3320** an error. *ppPage and *pPgno are undefined in the event of an error.
drha34b6762004-05-07 13:30:42 +00003321** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00003322**
drh199e3cf2002-07-18 11:01:47 +00003323** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3324** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00003325** attempt to keep related pages close to each other in the database file,
3326** which in turn can make database access faster.
danielk1977cb1a7eb2004-11-05 12:27:02 +00003327**
3328** If the "exact" parameter is not 0, and the page-number nearby exists
3329** anywhere on the free-list, then it is guarenteed to be returned. This
3330** is only used by auto-vacuum databases when allocating a new table.
drh3b7511c2001-05-26 13:15:44 +00003331*/
danielk1977cb1a7eb2004-11-05 12:27:02 +00003332static int allocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00003333 BtShared *pBt,
danielk1977cb1a7eb2004-11-05 12:27:02 +00003334 MemPage **ppPage,
3335 Pgno *pPgno,
3336 Pgno nearby,
3337 u8 exact
3338){
drh3aac2dd2004-04-26 14:10:20 +00003339 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00003340 int rc;
drh3aac2dd2004-04-26 14:10:20 +00003341 int n; /* Number of pages on the freelist */
3342 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00003343
drh3aac2dd2004-04-26 14:10:20 +00003344 pPage1 = pBt->pPage1;
3345 n = get4byte(&pPage1->aData[36]);
3346 if( n>0 ){
drh91025292004-05-03 19:49:32 +00003347 /* There are pages on the freelist. Reuse one of those pages. */
danielk1977cb1a7eb2004-11-05 12:27:02 +00003348 MemPage *pTrunk = 0;
3349 Pgno iTrunk;
3350 MemPage *pPrevTrunk = 0;
3351 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3352
3353 /* If the 'exact' parameter was true and a query of the pointer-map
3354 ** shows that the page 'nearby' is somewhere on the free-list, then
3355 ** the entire-list will be searched for that page.
3356 */
3357#ifndef SQLITE_OMIT_AUTOVACUUM
3358 if( exact ){
3359 u8 eType;
3360 assert( nearby>0 );
3361 assert( pBt->autoVacuum );
3362 rc = ptrmapGet(pBt, nearby, &eType, 0);
3363 if( rc ) return rc;
3364 if( eType==PTRMAP_FREEPAGE ){
3365 searchList = 1;
3366 }
3367 *pPgno = nearby;
3368 }
3369#endif
3370
3371 /* Decrement the free-list count by 1. Set iTrunk to the index of the
3372 ** first free-list trunk page. iPrevTrunk is initially 1.
3373 */
drha34b6762004-05-07 13:30:42 +00003374 rc = sqlite3pager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00003375 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003376 put4byte(&pPage1->aData[36], n-1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00003377
3378 /* The code within this loop is run only once if the 'searchList' variable
3379 ** is not true. Otherwise, it runs once for each trunk-page on the
3380 ** free-list until the page 'nearby' is located.
3381 */
3382 do {
3383 pPrevTrunk = pTrunk;
3384 if( pPrevTrunk ){
3385 iTrunk = get4byte(&pPrevTrunk->aData[0]);
drhbea00b92002-07-08 10:59:50 +00003386 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00003387 iTrunk = get4byte(&pPage1->aData[32]);
drhbea00b92002-07-08 10:59:50 +00003388 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00003389 rc = getPage(pBt, iTrunk, &pTrunk);
3390 if( rc ){
3391 releasePage(pPrevTrunk);
3392 return rc;
3393 }
3394
3395 /* TODO: This should move to after the loop? */
3396 rc = sqlite3pager_write(pTrunk->aData);
3397 if( rc ){
3398 releasePage(pTrunk);
3399 releasePage(pPrevTrunk);
3400 return rc;
3401 }
3402
3403 k = get4byte(&pTrunk->aData[4]);
3404 if( k==0 && !searchList ){
3405 /* The trunk has no leaves and the list is not being searched.
3406 ** So extract the trunk page itself and use it as the newly
3407 ** allocated page */
3408 assert( pPrevTrunk==0 );
3409 *pPgno = iTrunk;
3410 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3411 *ppPage = pTrunk;
3412 pTrunk = 0;
3413 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3414 }else if( k>pBt->usableSize/4 - 8 ){
3415 /* Value of k is out of range. Database corruption */
drh49285702005-09-17 15:20:26 +00003416 return SQLITE_CORRUPT_BKPT;
danielk1977cb1a7eb2004-11-05 12:27:02 +00003417#ifndef SQLITE_OMIT_AUTOVACUUM
3418 }else if( searchList && nearby==iTrunk ){
3419 /* The list is being searched and this trunk page is the page
3420 ** to allocate, regardless of whether it has leaves.
3421 */
3422 assert( *pPgno==iTrunk );
3423 *ppPage = pTrunk;
3424 searchList = 0;
3425 if( k==0 ){
3426 if( !pPrevTrunk ){
3427 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3428 }else{
3429 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3430 }
3431 }else{
3432 /* The trunk page is required by the caller but it contains
3433 ** pointers to free-list leaves. The first leaf becomes a trunk
3434 ** page in this case.
3435 */
3436 MemPage *pNewTrunk;
3437 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3438 rc = getPage(pBt, iNewTrunk, &pNewTrunk);
3439 if( rc!=SQLITE_OK ){
3440 releasePage(pTrunk);
3441 releasePage(pPrevTrunk);
3442 return rc;
3443 }
3444 rc = sqlite3pager_write(pNewTrunk->aData);
3445 if( rc!=SQLITE_OK ){
3446 releasePage(pNewTrunk);
3447 releasePage(pTrunk);
3448 releasePage(pPrevTrunk);
3449 return rc;
3450 }
3451 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3452 put4byte(&pNewTrunk->aData[4], k-1);
3453 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3454 if( !pPrevTrunk ){
3455 put4byte(&pPage1->aData[32], iNewTrunk);
3456 }else{
3457 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3458 }
3459 releasePage(pNewTrunk);
3460 }
3461 pTrunk = 0;
3462 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3463#endif
3464 }else{
3465 /* Extract a leaf from the trunk */
3466 int closest;
3467 Pgno iPage;
3468 unsigned char *aData = pTrunk->aData;
3469 if( nearby>0 ){
3470 int i, dist;
3471 closest = 0;
3472 dist = get4byte(&aData[8]) - nearby;
3473 if( dist<0 ) dist = -dist;
3474 for(i=1; i<k; i++){
3475 int d2 = get4byte(&aData[8+i*4]) - nearby;
3476 if( d2<0 ) d2 = -d2;
3477 if( d2<dist ){
3478 closest = i;
3479 dist = d2;
3480 }
3481 }
3482 }else{
3483 closest = 0;
3484 }
3485
3486 iPage = get4byte(&aData[8+closest*4]);
3487 if( !searchList || iPage==nearby ){
3488 *pPgno = iPage;
3489 if( *pPgno>sqlite3pager_pagecount(pBt->pPager) ){
3490 /* Free page off the end of the file */
drh49285702005-09-17 15:20:26 +00003491 return SQLITE_CORRUPT_BKPT;
danielk1977cb1a7eb2004-11-05 12:27:02 +00003492 }
3493 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
3494 ": %d more free pages\n",
3495 *pPgno, closest+1, k, pTrunk->pgno, n-1));
3496 if( closest<k-1 ){
3497 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
3498 }
3499 put4byte(&aData[4], k-1);
3500 rc = getPage(pBt, *pPgno, ppPage);
3501 if( rc==SQLITE_OK ){
3502 sqlite3pager_dont_rollback((*ppPage)->aData);
3503 rc = sqlite3pager_write((*ppPage)->aData);
danielk1977aac0a382005-01-16 11:07:06 +00003504 if( rc!=SQLITE_OK ){
3505 releasePage(*ppPage);
3506 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00003507 }
3508 searchList = 0;
3509 }
drhee696e22004-08-30 16:52:17 +00003510 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00003511 releasePage(pPrevTrunk);
3512 }while( searchList );
3513 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00003514 }else{
drh3aac2dd2004-04-26 14:10:20 +00003515 /* There are no pages on the freelist, so create a new page at the
3516 ** end of the file */
drha34b6762004-05-07 13:30:42 +00003517 *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
danielk1977afcdd022004-10-31 16:25:42 +00003518
3519#ifndef SQLITE_OMIT_AUTOVACUUM
drh42cac6d2004-11-20 20:31:11 +00003520 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt->usableSize, *pPgno) ){
danielk1977afcdd022004-10-31 16:25:42 +00003521 /* If *pPgno refers to a pointer-map page, allocate two new pages
3522 ** at the end of the file instead of one. The first allocated page
3523 ** becomes a new pointer-map page, the second is used by the caller.
3524 */
3525 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
danielk1977599fcba2004-11-08 07:13:13 +00003526 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk1977afcdd022004-10-31 16:25:42 +00003527 (*pPgno)++;
3528 }
3529#endif
3530
danielk1977599fcba2004-11-08 07:13:13 +00003531 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00003532 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00003533 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003534 rc = sqlite3pager_write((*ppPage)->aData);
danielk1977aac0a382005-01-16 11:07:06 +00003535 if( rc!=SQLITE_OK ){
3536 releasePage(*ppPage);
3537 }
drh3a4c1412004-05-09 20:40:11 +00003538 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00003539 }
danielk1977599fcba2004-11-08 07:13:13 +00003540
3541 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drh3b7511c2001-05-26 13:15:44 +00003542 return rc;
3543}
3544
3545/*
drh3aac2dd2004-04-26 14:10:20 +00003546** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00003547**
drha34b6762004-05-07 13:30:42 +00003548** sqlite3pager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00003549*/
drh3aac2dd2004-04-26 14:10:20 +00003550static int freePage(MemPage *pPage){
danielk1977aef0bf62005-12-30 16:28:01 +00003551 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00003552 MemPage *pPage1 = pBt->pPage1;
3553 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00003554
drh3aac2dd2004-04-26 14:10:20 +00003555 /* Prepare the page for freeing */
3556 assert( pPage->pgno>1 );
3557 pPage->isInit = 0;
3558 releasePage(pPage->pParent);
3559 pPage->pParent = 0;
3560
drha34b6762004-05-07 13:30:42 +00003561 /* Increment the free page count on pPage1 */
3562 rc = sqlite3pager_write(pPage1->aData);
drh3aac2dd2004-04-26 14:10:20 +00003563 if( rc ) return rc;
3564 n = get4byte(&pPage1->aData[36]);
3565 put4byte(&pPage1->aData[36], n+1);
3566
danielk1977687566d2004-11-02 12:56:41 +00003567#ifndef SQLITE_OMIT_AUTOVACUUM
3568 /* If the database supports auto-vacuum, write an entry in the pointer-map
danielk1977cb1a7eb2004-11-05 12:27:02 +00003569 ** to indicate that the page is free.
danielk1977687566d2004-11-02 12:56:41 +00003570 */
3571 if( pBt->autoVacuum ){
3572 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
danielk1977a64a0352004-11-05 01:45:13 +00003573 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00003574 }
3575#endif
3576
drh3aac2dd2004-04-26 14:10:20 +00003577 if( n==0 ){
3578 /* This is the first free page */
drhda200cc2004-05-09 11:51:38 +00003579 rc = sqlite3pager_write(pPage->aData);
3580 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003581 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00003582 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00003583 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00003584 }else{
3585 /* Other free pages already exist. Retrive the first trunk page
3586 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00003587 MemPage *pTrunk;
3588 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00003589 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003590 k = get4byte(&pTrunk->aData[4]);
drhee696e22004-08-30 16:52:17 +00003591 if( k>=pBt->usableSize/4 - 8 ){
drh3aac2dd2004-04-26 14:10:20 +00003592 /* The trunk is full. Turn the page being freed into a new
3593 ** trunk page with no leaves. */
drha34b6762004-05-07 13:30:42 +00003594 rc = sqlite3pager_write(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00003595 if( rc ) return rc;
3596 put4byte(pPage->aData, pTrunk->pgno);
3597 put4byte(&pPage->aData[4], 0);
3598 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00003599 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
3600 pPage->pgno, pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00003601 }else{
3602 /* Add the newly freed page as a leaf on the current trunk */
drha34b6762004-05-07 13:30:42 +00003603 rc = sqlite3pager_write(pTrunk->aData);
drh3aac2dd2004-04-26 14:10:20 +00003604 if( rc ) return rc;
3605 put4byte(&pTrunk->aData[4], k+1);
3606 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drha34b6762004-05-07 13:30:42 +00003607 sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00003608 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00003609 }
3610 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00003611 }
drh3b7511c2001-05-26 13:15:44 +00003612 return rc;
3613}
3614
3615/*
drh3aac2dd2004-04-26 14:10:20 +00003616** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00003617*/
drh3aac2dd2004-04-26 14:10:20 +00003618static int clearCell(MemPage *pPage, unsigned char *pCell){
danielk1977aef0bf62005-12-30 16:28:01 +00003619 BtShared *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00003620 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00003621 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00003622 int rc;
drh3b7511c2001-05-26 13:15:44 +00003623
drh43605152004-05-29 21:46:49 +00003624 parseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00003625 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00003626 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00003627 }
drh6f11bef2004-05-13 01:12:56 +00003628 ovflPgno = get4byte(&pCell[info.iOverflow]);
drh3aac2dd2004-04-26 14:10:20 +00003629 while( ovflPgno!=0 ){
3630 MemPage *pOvfl;
danielk1977a1cb1832005-02-12 08:59:55 +00003631 if( ovflPgno>sqlite3pager_pagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00003632 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00003633 }
drh3aac2dd2004-04-26 14:10:20 +00003634 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00003635 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003636 ovflPgno = get4byte(pOvfl->aData);
drha34b6762004-05-07 13:30:42 +00003637 rc = freePage(pOvfl);
drha34b6762004-05-07 13:30:42 +00003638 sqlite3pager_unref(pOvfl->aData);
danielk19776b456a22005-03-21 04:04:02 +00003639 if( rc ) return rc;
drh3b7511c2001-05-26 13:15:44 +00003640 }
drh5e2f8b92001-05-28 00:41:15 +00003641 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00003642}
3643
3644/*
drh91025292004-05-03 19:49:32 +00003645** Create the byte sequence used to represent a cell on page pPage
3646** and write that byte sequence into pCell[]. Overflow pages are
3647** allocated and filled in as necessary. The calling procedure
3648** is responsible for making sure sufficient space has been allocated
3649** for pCell[].
3650**
3651** Note that pCell does not necessary need to point to the pPage->aData
3652** area. pCell might point to some temporary storage. The cell will
3653** be constructed in this temporary area then copied into pPage->aData
3654** later.
drh3b7511c2001-05-26 13:15:44 +00003655*/
3656static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00003657 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00003658 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00003659 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00003660 const void *pData,int nData, /* The data */
3661 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00003662){
drh3b7511c2001-05-26 13:15:44 +00003663 int nPayload;
drh8c6fa9b2004-05-26 00:01:53 +00003664 const u8 *pSrc;
drha34b6762004-05-07 13:30:42 +00003665 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00003666 int spaceLeft;
3667 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00003668 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00003669 unsigned char *pPrior;
3670 unsigned char *pPayload;
danielk1977aef0bf62005-12-30 16:28:01 +00003671 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00003672 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00003673 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00003674 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00003675
drh91025292004-05-03 19:49:32 +00003676 /* Fill in the header. */
drh43605152004-05-29 21:46:49 +00003677 nHeader = 0;
drh91025292004-05-03 19:49:32 +00003678 if( !pPage->leaf ){
3679 nHeader += 4;
3680 }
drh8b18dd42004-05-12 19:18:15 +00003681 if( pPage->hasData ){
drh91025292004-05-03 19:49:32 +00003682 nHeader += putVarint(&pCell[nHeader], nData);
drh6f11bef2004-05-13 01:12:56 +00003683 }else{
drh91025292004-05-03 19:49:32 +00003684 nData = 0;
3685 }
drh6f11bef2004-05-13 01:12:56 +00003686 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
drh43605152004-05-29 21:46:49 +00003687 parseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00003688 assert( info.nHeader==nHeader );
3689 assert( info.nKey==nKey );
3690 assert( info.nData==nData );
3691
3692 /* Fill in the payload */
drh3aac2dd2004-04-26 14:10:20 +00003693 nPayload = nData;
3694 if( pPage->intKey ){
3695 pSrc = pData;
3696 nSrc = nData;
drh91025292004-05-03 19:49:32 +00003697 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00003698 }else{
3699 nPayload += nKey;
3700 pSrc = pKey;
3701 nSrc = nKey;
3702 }
drh6f11bef2004-05-13 01:12:56 +00003703 *pnSize = info.nSize;
3704 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00003705 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00003706 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00003707
drh3b7511c2001-05-26 13:15:44 +00003708 while( nPayload>0 ){
3709 if( spaceLeft==0 ){
danielk1977afcdd022004-10-31 16:25:42 +00003710#ifndef SQLITE_OMIT_AUTOVACUUM
3711 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
3712#endif
danielk1977cb1a7eb2004-11-05 12:27:02 +00003713 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
danielk1977afcdd022004-10-31 16:25:42 +00003714#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977a19df672004-11-03 11:37:07 +00003715 /* If the database supports auto-vacuum, and the second or subsequent
3716 ** overflow page is being allocated, add an entry to the pointer-map
3717 ** for that page now. The entry for the first overflow page will be
3718 ** added later, by the insertCell() routine.
danielk1977afcdd022004-10-31 16:25:42 +00003719 */
danielk1977a19df672004-11-03 11:37:07 +00003720 if( pBt->autoVacuum && pgnoPtrmap!=0 && rc==SQLITE_OK ){
3721 rc = ptrmapPut(pBt, pgnoOvfl, PTRMAP_OVERFLOW2, pgnoPtrmap);
danielk1977afcdd022004-10-31 16:25:42 +00003722 }
3723#endif
drh3b7511c2001-05-26 13:15:44 +00003724 if( rc ){
drh9b171272004-05-08 02:03:22 +00003725 releasePage(pToRelease);
danielk197728129562005-01-11 10:25:06 +00003726 /* clearCell(pPage, pCell); */
drh3b7511c2001-05-26 13:15:44 +00003727 return rc;
3728 }
drh3aac2dd2004-04-26 14:10:20 +00003729 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00003730 releasePage(pToRelease);
3731 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00003732 pPrior = pOvfl->aData;
3733 put4byte(pPrior, 0);
3734 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00003735 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00003736 }
3737 n = nPayload;
3738 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00003739 if( n>nSrc ) n = nSrc;
3740 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00003741 nPayload -= n;
drhde647132004-05-07 17:57:49 +00003742 pPayload += n;
drh9b171272004-05-08 02:03:22 +00003743 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00003744 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00003745 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00003746 if( nSrc==0 ){
3747 nSrc = nData;
3748 pSrc = pData;
3749 }
drhdd793422001-06-28 01:54:48 +00003750 }
drh9b171272004-05-08 02:03:22 +00003751 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00003752 return SQLITE_OK;
3753}
3754
3755/*
drhbd03cae2001-06-02 02:40:57 +00003756** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00003757** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00003758** pointer in the third argument.
3759*/
danielk1977aef0bf62005-12-30 16:28:01 +00003760static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00003761 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00003762 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00003763
danielk1977afcdd022004-10-31 16:25:42 +00003764 if( pgno==0 ) return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +00003765 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00003766 aData = sqlite3pager_lookup(pBt->pPager, pgno);
drhda200cc2004-05-09 11:51:38 +00003767 if( aData ){
drh07d183d2005-05-01 22:52:42 +00003768 pThis = (MemPage*)&aData[pBt->pageSize];
drh31276532004-09-27 12:20:52 +00003769 assert( pThis->aData==aData );
drhda200cc2004-05-09 11:51:38 +00003770 if( pThis->isInit ){
3771 if( pThis->pParent!=pNewParent ){
3772 if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
3773 pThis->pParent = pNewParent;
3774 if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
3775 }
3776 pThis->idxParent = idx;
drhdd793422001-06-28 01:54:48 +00003777 }
drha34b6762004-05-07 13:30:42 +00003778 sqlite3pager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00003779 }
danielk1977afcdd022004-10-31 16:25:42 +00003780
3781#ifndef SQLITE_OMIT_AUTOVACUUM
3782 if( pBt->autoVacuum ){
3783 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
3784 }
3785#endif
3786 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00003787}
3788
danielk1977ac11ee62005-01-15 12:45:51 +00003789
3790
drhbd03cae2001-06-02 02:40:57 +00003791/*
drh4b70f112004-05-02 21:12:19 +00003792** Change the pParent pointer of all children of pPage to point back
3793** to pPage.
3794**
drhbd03cae2001-06-02 02:40:57 +00003795** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00003796** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00003797**
3798** This routine gets called after you memcpy() one page into
3799** another.
3800*/
danielk1977afcdd022004-10-31 16:25:42 +00003801static int reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00003802 int i;
danielk1977aef0bf62005-12-30 16:28:01 +00003803 BtShared *pBt = pPage->pBt;
danielk1977afcdd022004-10-31 16:25:42 +00003804 int rc = SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +00003805
danielk1977afcdd022004-10-31 16:25:42 +00003806 if( pPage->leaf ) return SQLITE_OK;
danielk1977afcdd022004-10-31 16:25:42 +00003807
drhbd03cae2001-06-02 02:40:57 +00003808 for(i=0; i<pPage->nCell; i++){
danielk1977afcdd022004-10-31 16:25:42 +00003809 u8 *pCell = findCell(pPage, i);
3810 if( !pPage->leaf ){
3811 rc = reparentPage(pBt, get4byte(pCell), pPage, i);
3812 if( rc!=SQLITE_OK ) return rc;
3813 }
drhbd03cae2001-06-02 02:40:57 +00003814 }
danielk1977afcdd022004-10-31 16:25:42 +00003815 if( !pPage->leaf ){
3816 rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
3817 pPage, i);
3818 pPage->idxShift = 0;
3819 }
3820 return rc;
drh14acc042001-06-10 19:56:58 +00003821}
3822
3823/*
3824** Remove the i-th cell from pPage. This routine effects pPage only.
3825** The cell content is not freed or deallocated. It is assumed that
3826** the cell content has been copied someplace else. This routine just
3827** removes the reference to the cell from pPage.
3828**
3829** "sz" must be the number of bytes in the cell.
drh14acc042001-06-10 19:56:58 +00003830*/
drh4b70f112004-05-02 21:12:19 +00003831static void dropCell(MemPage *pPage, int idx, int sz){
drh43605152004-05-29 21:46:49 +00003832 int i; /* Loop counter */
3833 int pc; /* Offset to cell content of cell being deleted */
3834 u8 *data; /* pPage->aData */
3835 u8 *ptr; /* Used to move bytes around within data[] */
3836
drh8c42ca92001-06-22 19:15:00 +00003837 assert( idx>=0 && idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00003838 assert( sz==cellSize(pPage, idx) );
drha34b6762004-05-07 13:30:42 +00003839 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00003840 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00003841 ptr = &data[pPage->cellOffset + 2*idx];
3842 pc = get2byte(ptr);
3843 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
drhde647132004-05-07 17:57:49 +00003844 freeSpace(pPage, pc, sz);
drh43605152004-05-29 21:46:49 +00003845 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
3846 ptr[0] = ptr[2];
3847 ptr[1] = ptr[3];
drh14acc042001-06-10 19:56:58 +00003848 }
3849 pPage->nCell--;
drh43605152004-05-29 21:46:49 +00003850 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
3851 pPage->nFree += 2;
drh428ae8c2003-01-04 16:48:09 +00003852 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00003853}
3854
3855/*
3856** Insert a new cell on pPage at cell index "i". pCell points to the
3857** content of the cell.
3858**
3859** If the cell content will fit on the page, then put it there. If it
drh43605152004-05-29 21:46:49 +00003860** will not fit, then make a copy of the cell content into pTemp if
3861** pTemp is not null. Regardless of pTemp, allocate a new entry
3862** in pPage->aOvfl[] and make it point to the cell content (either
3863** in pTemp or the original pCell) and also record its index.
3864** Allocating a new entry in pPage->aCell[] implies that
3865** pPage->nOverflow is incremented.
danielk1977a3ad5e72005-01-07 08:56:44 +00003866**
3867** If nSkip is non-zero, then do not copy the first nSkip bytes of the
3868** cell. The caller will overwrite them after this function returns. If
drh4b238df2005-01-08 15:43:18 +00003869** nSkip is non-zero, then pCell may not point to an invalid memory location
danielk1977a3ad5e72005-01-07 08:56:44 +00003870** (but pCell+nSkip is always valid).
drh14acc042001-06-10 19:56:58 +00003871*/
danielk1977e80463b2004-11-03 03:01:16 +00003872static int insertCell(
drh24cd67e2004-05-10 16:18:47 +00003873 MemPage *pPage, /* Page into which we are copying */
drh43605152004-05-29 21:46:49 +00003874 int i, /* New cell becomes the i-th cell of the page */
3875 u8 *pCell, /* Content of the new cell */
3876 int sz, /* Bytes of content in pCell */
danielk1977a3ad5e72005-01-07 08:56:44 +00003877 u8 *pTemp, /* Temp storage space for pCell, if needed */
3878 u8 nSkip /* Do not write the first nSkip bytes of the cell */
drh24cd67e2004-05-10 16:18:47 +00003879){
drh43605152004-05-29 21:46:49 +00003880 int idx; /* Where to write new cell content in data[] */
3881 int j; /* Loop counter */
3882 int top; /* First byte of content for any cell in data[] */
3883 int end; /* First byte past the last cell pointer in data[] */
3884 int ins; /* Index in data[] where new cell pointer is inserted */
3885 int hdr; /* Offset into data[] of the page header */
3886 int cellOffset; /* Address of first cell pointer in data[] */
3887 u8 *data; /* The content of the whole page */
3888 u8 *ptr; /* Used for moving information around in data[] */
3889
3890 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
3891 assert( sz==cellSizePtr(pPage, pCell) );
drha34b6762004-05-07 13:30:42 +00003892 assert( sqlite3pager_iswriteable(pPage->aData) );
drh43605152004-05-29 21:46:49 +00003893 if( pPage->nOverflow || sz+2>pPage->nFree ){
drh24cd67e2004-05-10 16:18:47 +00003894 if( pTemp ){
danielk1977a3ad5e72005-01-07 08:56:44 +00003895 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00003896 pCell = pTemp;
drh24cd67e2004-05-10 16:18:47 +00003897 }
drh43605152004-05-29 21:46:49 +00003898 j = pPage->nOverflow++;
3899 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
3900 pPage->aOvfl[j].pCell = pCell;
3901 pPage->aOvfl[j].idx = i;
3902 pPage->nFree = 0;
drh14acc042001-06-10 19:56:58 +00003903 }else{
drh43605152004-05-29 21:46:49 +00003904 data = pPage->aData;
3905 hdr = pPage->hdrOffset;
3906 top = get2byte(&data[hdr+5]);
3907 cellOffset = pPage->cellOffset;
3908 end = cellOffset + 2*pPage->nCell + 2;
3909 ins = cellOffset + 2*i;
3910 if( end > top - sz ){
danielk19776b456a22005-03-21 04:04:02 +00003911 int rc = defragmentPage(pPage);
3912 if( rc!=SQLITE_OK ) return rc;
drh43605152004-05-29 21:46:49 +00003913 top = get2byte(&data[hdr+5]);
3914 assert( end + sz <= top );
3915 }
3916 idx = allocateSpace(pPage, sz);
3917 assert( idx>0 );
3918 assert( end <= get2byte(&data[hdr+5]) );
3919 pPage->nCell++;
3920 pPage->nFree -= 2;
danielk1977a3ad5e72005-01-07 08:56:44 +00003921 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00003922 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
3923 ptr[0] = ptr[-2];
3924 ptr[1] = ptr[-1];
drhda200cc2004-05-09 11:51:38 +00003925 }
drh43605152004-05-29 21:46:49 +00003926 put2byte(&data[ins], idx);
3927 put2byte(&data[hdr+3], pPage->nCell);
3928 pPage->idxShift = 1;
drhda200cc2004-05-09 11:51:38 +00003929 pageIntegrity(pPage);
danielk1977a19df672004-11-03 11:37:07 +00003930#ifndef SQLITE_OMIT_AUTOVACUUM
3931 if( pPage->pBt->autoVacuum ){
3932 /* The cell may contain a pointer to an overflow page. If so, write
3933 ** the entry for the overflow page into the pointer map.
3934 */
3935 CellInfo info;
3936 parseCellPtr(pPage, pCell, &info);
3937 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
3938 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
3939 int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
3940 if( rc!=SQLITE_OK ) return rc;
3941 }
3942 }
3943#endif
drh14acc042001-06-10 19:56:58 +00003944 }
danielk1977e80463b2004-11-03 03:01:16 +00003945
danielk1977e80463b2004-11-03 03:01:16 +00003946 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00003947}
3948
3949/*
drhfa1a98a2004-05-14 19:08:17 +00003950** Add a list of cells to a page. The page should be initially empty.
3951** The cells are guaranteed to fit on the page.
3952*/
3953static void assemblePage(
3954 MemPage *pPage, /* The page to be assemblied */
3955 int nCell, /* The number of cells to add to this page */
drh43605152004-05-29 21:46:49 +00003956 u8 **apCell, /* Pointers to cell bodies */
drhfa1a98a2004-05-14 19:08:17 +00003957 int *aSize /* Sizes of the cells */
3958){
3959 int i; /* Loop counter */
3960 int totalSize; /* Total size of all cells */
3961 int hdr; /* Index of page header */
drh43605152004-05-29 21:46:49 +00003962 int cellptr; /* Address of next cell pointer */
3963 int cellbody; /* Address of next cell body */
drhfa1a98a2004-05-14 19:08:17 +00003964 u8 *data; /* Data for the page */
3965
drh43605152004-05-29 21:46:49 +00003966 assert( pPage->nOverflow==0 );
drhfa1a98a2004-05-14 19:08:17 +00003967 totalSize = 0;
3968 for(i=0; i<nCell; i++){
3969 totalSize += aSize[i];
3970 }
drh43605152004-05-29 21:46:49 +00003971 assert( totalSize+2*nCell<=pPage->nFree );
drhfa1a98a2004-05-14 19:08:17 +00003972 assert( pPage->nCell==0 );
drh43605152004-05-29 21:46:49 +00003973 cellptr = pPage->cellOffset;
drhfa1a98a2004-05-14 19:08:17 +00003974 data = pPage->aData;
3975 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00003976 put2byte(&data[hdr+3], nCell);
drh09d0deb2005-08-02 17:13:09 +00003977 if( nCell ){
3978 cellbody = allocateSpace(pPage, totalSize);
3979 assert( cellbody>0 );
3980 assert( pPage->nFree >= 2*nCell );
3981 pPage->nFree -= 2*nCell;
3982 for(i=0; i<nCell; i++){
3983 put2byte(&data[cellptr], cellbody);
3984 memcpy(&data[cellbody], apCell[i], aSize[i]);
3985 cellptr += 2;
3986 cellbody += aSize[i];
3987 }
3988 assert( cellbody==pPage->pBt->usableSize );
drhfa1a98a2004-05-14 19:08:17 +00003989 }
3990 pPage->nCell = nCell;
drhfa1a98a2004-05-14 19:08:17 +00003991}
3992
/*
** NN and NB determine how many adjacent pages get involved in a single
** balancing operation.  NN is the number of neighbors on each side of
** the page being balanced.  NB is the total number of pages that
** participate: the target page itself plus NN neighbors on either side.
**
** NN must be at least 1.  Raising NN to 2 or 3 gives a modest
** improvement in SELECT and DELETE performance in exchange for a larger
** degradation in INSERT and UPDATE performance.  The setting of NN
** chosen below appears to give the best results overall.
*/
#define NN 1              /* Number of neighbors on either side of pPage */
#define NB (1+2*NN)       /* Total pages involved in the balance */
4007
drh43605152004-05-29 21:46:49 +00004008/* Forward reference */
danielk1977ac245ec2005-01-14 13:50:11 +00004009static int balance(MemPage*, int);
4010
drh615ae552005-01-16 23:21:00 +00004011#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004012/*
4013** This version of balance() handles the common special case where
4014** a new entry is being inserted on the extreme right-end of the
4015** tree, in other words, when the new entry will become the largest
4016** entry in the tree.
4017**
4018** Instead of trying balance the 3 right-most leaf pages, just add
4019** a new page to the right-hand side and put the one new entry in
4020** that page. This leaves the right side of the tree somewhat
4021** unbalanced. But odds are that we will be inserting new entries
4022** at the end soon afterwards so the nearly empty page will quickly
4023** fill up. On average.
4024**
4025** pPage is the leaf page which is the right-most page in the tree.
4026** pParent is its parent. pPage must have a single overflow entry
4027** which is also the right-most entry on the page.
4028*/
danielk1977ac245ec2005-01-14 13:50:11 +00004029static int balance_quick(MemPage *pPage, MemPage *pParent){
4030 int rc;
4031 MemPage *pNew;
4032 Pgno pgnoNew;
4033 u8 *pCell;
4034 int szCell;
4035 CellInfo info;
danielk1977aef0bf62005-12-30 16:28:01 +00004036 BtShared *pBt = pPage->pBt;
danielk197779a40da2005-01-16 08:00:01 +00004037 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4038 int parentSize; /* Size of new divider cell */
4039 u8 parentCell[64]; /* Space for the new divider cell */
danielk1977ac245ec2005-01-14 13:50:11 +00004040
4041 /* Allocate a new page. Insert the overflow cell from pPage
4042 ** into it. Then remove the overflow cell from pPage.
4043 */
danielk1977ac11ee62005-01-15 12:45:51 +00004044 rc = allocatePage(pBt, &pNew, &pgnoNew, 0, 0);
danielk1977ac245ec2005-01-14 13:50:11 +00004045 if( rc!=SQLITE_OK ){
4046 return rc;
4047 }
4048 pCell = pPage->aOvfl[0].pCell;
4049 szCell = cellSizePtr(pPage, pCell);
4050 zeroPage(pNew, pPage->aData[0]);
4051 assemblePage(pNew, 1, &pCell, &szCell);
4052 pPage->nOverflow = 0;
4053
danielk197779a40da2005-01-16 08:00:01 +00004054 /* Set the parent of the newly allocated page to pParent. */
4055 pNew->pParent = pParent;
4056 sqlite3pager_ref(pParent->aData);
4057
danielk1977ac245ec2005-01-14 13:50:11 +00004058 /* pPage is currently the right-child of pParent. Change this
4059 ** so that the right-child is the new page allocated above and
danielk197779a40da2005-01-16 08:00:01 +00004060 ** pPage is the next-to-right child.
danielk1977ac245ec2005-01-14 13:50:11 +00004061 */
danielk1977ac11ee62005-01-15 12:45:51 +00004062 assert( pPage->nCell>0 );
danielk1977ac245ec2005-01-14 13:50:11 +00004063 parseCellPtr(pPage, findCell(pPage, pPage->nCell-1), &info);
4064 rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, &parentSize);
4065 if( rc!=SQLITE_OK ){
danielk197779a40da2005-01-16 08:00:01 +00004066 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00004067 }
4068 assert( parentSize<64 );
4069 rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4070 if( rc!=SQLITE_OK ){
danielk197779a40da2005-01-16 08:00:01 +00004071 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00004072 }
4073 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4074 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4075
danielk197779a40da2005-01-16 08:00:01 +00004076#ifndef SQLITE_OMIT_AUTOVACUUM
4077 /* If this is an auto-vacuum database, update the pointer map
4078 ** with entries for the new page, and any pointer from the
4079 ** cell on the page to an overflow page.
4080 */
danielk1977ac11ee62005-01-15 12:45:51 +00004081 if( pBt->autoVacuum ){
4082 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4083 if( rc!=SQLITE_OK ){
4084 return rc;
4085 }
danielk197779a40da2005-01-16 08:00:01 +00004086 rc = ptrmapPutOvfl(pNew, 0);
4087 if( rc!=SQLITE_OK ){
4088 return rc;
danielk1977ac11ee62005-01-15 12:45:51 +00004089 }
4090 }
danielk197779a40da2005-01-16 08:00:01 +00004091#endif
danielk1977ac11ee62005-01-15 12:45:51 +00004092
danielk197779a40da2005-01-16 08:00:01 +00004093 /* Release the reference to the new page and balance the parent page,
4094 ** in case the divider cell inserted caused it to become overfull.
4095 */
danielk1977ac245ec2005-01-14 13:50:11 +00004096 releasePage(pNew);
4097 return balance(pParent, 0);
4098}
drh615ae552005-01-16 23:21:00 +00004099#endif /* SQLITE_OMIT_QUICKBALANCE */
drh43605152004-05-29 21:46:49 +00004100
/*
** ISAUTOVACUUM tells balance_nonroot() whether or not the database
** supports auto-vacuum.  The test appears inside an expression that is
** itself an argument to another macro (sqliteMallocRaw), where
** conditional compilation cannot be used, so it is wrapped up in this
** macro instead.
**
** When auto-vacuum support is compiled in, the macro expands to a read
** of pBt->autoVacuum and therefore requires a variable named pBt to be
** in scope at the point of use.  When auto-vacuum is omitted from the
** build, it is the constant 0.
*/
#ifdef SQLITE_OMIT_AUTOVACUUM
# define ISAUTOVACUUM 0
#else
# define ISAUTOVACUUM (pBt->autoVacuum)
#endif
4113
4114/*
drhab01f612004-05-22 02:55:23 +00004115** This routine redistributes Cells on pPage and up to NN*2 siblings
drh8b2f49b2001-06-08 00:21:52 +00004116** of pPage so that all pages have about the same amount of free space.
** Usually NN siblings on either side of pPage are used in the balancing,
4118** though more siblings might come from one side if pPage is the first
drhab01f612004-05-22 02:55:23 +00004119** or last child of its parent. If pPage has fewer than 2*NN siblings
drh8b2f49b2001-06-08 00:21:52 +00004120** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00004121** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00004122**
drh0c6cc4e2004-06-15 02:13:26 +00004123** The number of siblings of pPage might be increased or decreased by one or
4124** two in an effort to keep pages nearly full but not over full. The root page
drhab01f612004-05-22 02:55:23 +00004125** is special and is allowed to be nearly empty. If pPage is
drh8c42ca92001-06-22 19:15:00 +00004126** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00004127** or decreased by one, as necessary, to keep the root page from being
drhab01f612004-05-22 02:55:23 +00004128** overfull or completely empty.
drh14acc042001-06-10 19:56:58 +00004129**
drh8b2f49b2001-06-08 00:21:52 +00004130** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00004131** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00004132** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00004133** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00004134**
drh8c42ca92001-06-22 19:15:00 +00004135** In the course of balancing the siblings of pPage, the parent of pPage
4136** might become overfull or underfull. If that happens, then this routine
4137** is called recursively on the parent.
4138**
drh5e00f6c2001-09-13 13:46:56 +00004139** If this routine fails for any reason, it might leave the database
4140** in a corrupted state. So if this routine fails, the database should
4141** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00004142*/
drh43605152004-05-29 21:46:49 +00004143static int balance_nonroot(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00004144 MemPage *pParent; /* The parent of pPage */
danielk1977aef0bf62005-12-30 16:28:01 +00004145 BtShared *pBt; /* The whole database */
danielk1977634f2982005-03-28 08:44:07 +00004146 int nCell = 0; /* Number of cells in apCell[] */
4147 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
drh8b2f49b2001-06-08 00:21:52 +00004148 int nOld; /* Number of pages in apOld[] */
4149 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00004150 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00004151 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00004152 int idx; /* Index of pPage in pParent->aCell[] */
4153 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00004154 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00004155 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00004156 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00004157 int usableSpace; /* Bytes in pPage beyond the header */
4158 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00004159 int subtotal; /* Subtotal of bytes in cells on one page */
drhb6f41482004-05-14 01:58:11 +00004160 int iSpace = 0; /* First unused byte of aSpace[] */
drhc3b70572003-01-04 19:44:07 +00004161 MemPage *apOld[NB]; /* pPage and up to two siblings */
4162 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00004163 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drha2fce642004-06-05 00:01:44 +00004164 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4165 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
drh4b70f112004-05-02 21:12:19 +00004166 u8 *apDiv[NB]; /* Divider cells in pParent */
drha2fce642004-06-05 00:01:44 +00004167 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4168 int szNew[NB+2]; /* Combined size of cells place on i-th page */
danielk197750f059b2005-03-29 02:54:03 +00004169 u8 **apCell = 0; /* All cells begin balanced */
drh2e38c322004-09-03 18:38:44 +00004170 int *szCell; /* Local size of all cells in apCell[] */
4171 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4172 u8 *aSpace; /* Space to hold copies of dividers cells */
danielk19774e17d142005-01-16 09:06:33 +00004173#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977ac11ee62005-01-15 12:45:51 +00004174 u8 *aFrom = 0;
4175#endif
drh8b2f49b2001-06-08 00:21:52 +00004176
drh14acc042001-06-10 19:56:58 +00004177 /*
drh43605152004-05-29 21:46:49 +00004178 ** Find the parent page.
drh8b2f49b2001-06-08 00:21:52 +00004179 */
drh3a4c1412004-05-09 20:40:11 +00004180 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00004181 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00004182 pBt = pPage->pBt;
drh14acc042001-06-10 19:56:58 +00004183 pParent = pPage->pParent;
drh43605152004-05-29 21:46:49 +00004184 sqlite3pager_write(pParent->aData);
4185 assert( pParent );
4186 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
drh2e38c322004-09-03 18:38:44 +00004187
drh615ae552005-01-16 23:21:00 +00004188#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004189 /*
4190 ** A special case: If a new entry has just been inserted into a
4191 ** table (that is, a btree with integer keys and all data at the leaves)
drh09d0deb2005-08-02 17:13:09 +00004192 ** and the new entry is the right-most entry in the tree (it has the
drhf222e712005-01-14 22:55:49 +00004193 ** largest key) then use the special balance_quick() routine for
4194 ** balancing. balance_quick() is much faster and results in a tighter
4195 ** packing of data in the common case.
4196 */
danielk1977ac245ec2005-01-14 13:50:11 +00004197 if( pPage->leaf &&
4198 pPage->intKey &&
4199 pPage->leafData &&
4200 pPage->nOverflow==1 &&
4201 pPage->aOvfl[0].idx==pPage->nCell &&
danielk1977ac11ee62005-01-15 12:45:51 +00004202 pPage->pParent->pgno!=1 &&
danielk1977ac245ec2005-01-14 13:50:11 +00004203 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4204 ){
danielk1977ac11ee62005-01-15 12:45:51 +00004205 /*
4206 ** TODO: Check the siblings to the left of pPage. It may be that
4207 ** they are not full and no new page is required.
4208 */
danielk1977ac245ec2005-01-14 13:50:11 +00004209 return balance_quick(pPage, pParent);
4210 }
4211#endif
4212
drh2e38c322004-09-03 18:38:44 +00004213 /*
drh4b70f112004-05-02 21:12:19 +00004214 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00004215 ** to pPage. The "idx" variable is the index of that cell. If pPage
4216 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00004217 */
drhbb49aba2003-01-04 18:53:27 +00004218 if( pParent->idxShift ){
drha34b6762004-05-07 13:30:42 +00004219 Pgno pgno;
drh4b70f112004-05-02 21:12:19 +00004220 pgno = pPage->pgno;
drha34b6762004-05-07 13:30:42 +00004221 assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00004222 for(idx=0; idx<pParent->nCell; idx++){
drh43605152004-05-29 21:46:49 +00004223 if( get4byte(findCell(pParent, idx))==pgno ){
drhbb49aba2003-01-04 18:53:27 +00004224 break;
4225 }
drh8b2f49b2001-06-08 00:21:52 +00004226 }
drh4b70f112004-05-02 21:12:19 +00004227 assert( idx<pParent->nCell
drh43605152004-05-29 21:46:49 +00004228 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
drhbb49aba2003-01-04 18:53:27 +00004229 }else{
4230 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00004231 }
drh8b2f49b2001-06-08 00:21:52 +00004232
4233 /*
drh14acc042001-06-10 19:56:58 +00004234 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00004235 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00004236 */
drh14acc042001-06-10 19:56:58 +00004237 nOld = nNew = 0;
drha34b6762004-05-07 13:30:42 +00004238 sqlite3pager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00004239
4240 /*
drh4b70f112004-05-02 21:12:19 +00004241 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00004242 ** the siblings. An attempt is made to find NN siblings on either
4243 ** side of pPage. More siblings are taken from one side, however, if
  ** there are fewer than NN siblings on the other side. If pParent
4245 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00004246 */
drhc3b70572003-01-04 19:44:07 +00004247 nxDiv = idx - NN;
4248 if( nxDiv + NB > pParent->nCell ){
4249 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00004250 }
drhc3b70572003-01-04 19:44:07 +00004251 if( nxDiv<0 ){
4252 nxDiv = 0;
4253 }
drh8b2f49b2001-06-08 00:21:52 +00004254 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00004255 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00004256 if( k<pParent->nCell ){
drh43605152004-05-29 21:46:49 +00004257 apDiv[i] = findCell(pParent, k);
drh8b2f49b2001-06-08 00:21:52 +00004258 nDiv++;
drha34b6762004-05-07 13:30:42 +00004259 assert( !pParent->leaf );
drh43605152004-05-29 21:46:49 +00004260 pgnoOld[i] = get4byte(apDiv[i]);
drh14acc042001-06-10 19:56:58 +00004261 }else if( k==pParent->nCell ){
drh43605152004-05-29 21:46:49 +00004262 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
drh14acc042001-06-10 19:56:58 +00004263 }else{
4264 break;
drh8b2f49b2001-06-08 00:21:52 +00004265 }
drhde647132004-05-07 17:57:49 +00004266 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00004267 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00004268 apOld[i]->idxParent = k;
drh91025292004-05-03 19:49:32 +00004269 apCopy[i] = 0;
4270 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00004271 nOld++;
danielk1977634f2982005-03-28 08:44:07 +00004272 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
drh8b2f49b2001-06-08 00:21:52 +00004273 }
4274
drh8d97f1f2005-05-05 18:14:13 +00004275 /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
4276 ** alignment */
4277 nMaxCells = (nMaxCells + 1)&~1;
4278
drh8b2f49b2001-06-08 00:21:52 +00004279 /*
danielk1977634f2982005-03-28 08:44:07 +00004280 ** Allocate space for memory structures
4281 */
4282 apCell = sqliteMallocRaw(
4283 nMaxCells*sizeof(u8*) /* apCell */
4284 + nMaxCells*sizeof(int) /* szCell */
drhc96d8532005-05-03 12:30:33 +00004285 + ROUND8(sizeof(MemPage))*NB /* aCopy */
drh07d183d2005-05-01 22:52:42 +00004286 + pBt->pageSize*(5+NB) /* aSpace */
drhc96d8532005-05-03 12:30:33 +00004287 + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */
danielk1977634f2982005-03-28 08:44:07 +00004288 );
4289 if( apCell==0 ){
4290 rc = SQLITE_NOMEM;
4291 goto balance_cleanup;
4292 }
4293 szCell = (int*)&apCell[nMaxCells];
4294 aCopy[0] = (u8*)&szCell[nMaxCells];
drhc96d8532005-05-03 12:30:33 +00004295 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00004296 for(i=1; i<NB; i++){
drhc96d8532005-05-03 12:30:33 +00004297 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4298 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00004299 }
drhc96d8532005-05-03 12:30:33 +00004300 aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4301 assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00004302#ifndef SQLITE_OMIT_AUTOVACUUM
4303 if( pBt->autoVacuum ){
drh07d183d2005-05-01 22:52:42 +00004304 aFrom = &aSpace[5*pBt->pageSize];
danielk1977634f2982005-03-28 08:44:07 +00004305 }
4306#endif
4307
4308 /*
drh14acc042001-06-10 19:56:58 +00004309 ** Make copies of the content of pPage and its siblings into aOld[].
4310 ** The rest of this function will use data from the copies rather
4311 ** that the original pages since the original pages will be in the
4312 ** process of being overwritten.
4313 */
4314 for(i=0; i<nOld; i++){
drh07d183d2005-05-01 22:52:42 +00004315 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize];
drh07d183d2005-05-01 22:52:42 +00004316 p->aData = &((u8*)p)[-pBt->pageSize];
4317 memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
4318 /* The memcpy() above changes the value of p->aData so we have to
4319 ** set it again. */
drh07d183d2005-05-01 22:52:42 +00004320 p->aData = &((u8*)p)[-pBt->pageSize];
drh14acc042001-06-10 19:56:58 +00004321 }
4322
4323 /*
4324 ** Load pointers to all cells on sibling pages and the divider cells
4325 ** into the local apCell[] array. Make copies of the divider cells
  ** into space obtained from aSpace[] and remove the divider cells
4327 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00004328 **
4329 ** If the siblings are on leaf pages, then the child pointers of the
4330 ** divider cells are stripped from the cells before they are copied
drh96f5b762004-05-16 16:24:36 +00004331 ** into aSpace[]. In this way, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00004332 ** child pointers. If siblings are not leaves, then all cell in
4333 ** apCell[] include child pointers. Either way, all cells in apCell[]
4334 ** are alike.
drh96f5b762004-05-16 16:24:36 +00004335 **
4336 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
4337 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
drh8b2f49b2001-06-08 00:21:52 +00004338 */
4339 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00004340 leafCorrection = pPage->leaf*4;
drh8b18dd42004-05-12 19:18:15 +00004341 leafData = pPage->leafData && pPage->leaf;
drh8b2f49b2001-06-08 00:21:52 +00004342 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00004343 MemPage *pOld = apCopy[i];
drh43605152004-05-29 21:46:49 +00004344 int limit = pOld->nCell+pOld->nOverflow;
4345 for(j=0; j<limit; j++){
danielk1977634f2982005-03-28 08:44:07 +00004346 assert( nCell<nMaxCells );
drh43605152004-05-29 21:46:49 +00004347 apCell[nCell] = findOverflowCell(pOld, j);
4348 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
danielk1977ac11ee62005-01-15 12:45:51 +00004349#ifndef SQLITE_OMIT_AUTOVACUUM
4350 if( pBt->autoVacuum ){
4351 int a;
4352 aFrom[nCell] = i;
4353 for(a=0; a<pOld->nOverflow; a++){
4354 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
4355 aFrom[nCell] = 0xFF;
4356 break;
4357 }
4358 }
4359 }
4360#endif
drh14acc042001-06-10 19:56:58 +00004361 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00004362 }
4363 if( i<nOld-1 ){
drh43605152004-05-29 21:46:49 +00004364 int sz = cellSizePtr(pParent, apDiv[i]);
drh8b18dd42004-05-12 19:18:15 +00004365 if( leafData ){
drh96f5b762004-05-16 16:24:36 +00004366 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
4367 ** are duplicates of keys on the child pages. We need to remove
4368 ** the divider cells from pParent, but the dividers cells are not
4369 ** added to apCell[] because they are duplicates of child cells.
4370 */
drh8b18dd42004-05-12 19:18:15 +00004371 dropCell(pParent, nxDiv, sz);
drh4b70f112004-05-02 21:12:19 +00004372 }else{
drhb6f41482004-05-14 01:58:11 +00004373 u8 *pTemp;
danielk1977634f2982005-03-28 08:44:07 +00004374 assert( nCell<nMaxCells );
drhb6f41482004-05-14 01:58:11 +00004375 szCell[nCell] = sz;
4376 pTemp = &aSpace[iSpace];
4377 iSpace += sz;
drh07d183d2005-05-01 22:52:42 +00004378 assert( iSpace<=pBt->pageSize*5 );
drhb6f41482004-05-14 01:58:11 +00004379 memcpy(pTemp, apDiv[i], sz);
4380 apCell[nCell] = pTemp+leafCorrection;
danielk1977ac11ee62005-01-15 12:45:51 +00004381#ifndef SQLITE_OMIT_AUTOVACUUM
4382 if( pBt->autoVacuum ){
4383 aFrom[nCell] = 0xFF;
4384 }
4385#endif
drhb6f41482004-05-14 01:58:11 +00004386 dropCell(pParent, nxDiv, sz);
drh8b18dd42004-05-12 19:18:15 +00004387 szCell[nCell] -= leafCorrection;
drh43605152004-05-29 21:46:49 +00004388 assert( get4byte(pTemp)==pgnoOld[i] );
drh8b18dd42004-05-12 19:18:15 +00004389 if( !pOld->leaf ){
4390 assert( leafCorrection==0 );
4391 /* The right pointer of the child page pOld becomes the left
4392 ** pointer of the divider cell */
drh43605152004-05-29 21:46:49 +00004393 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
drh8b18dd42004-05-12 19:18:15 +00004394 }else{
4395 assert( leafCorrection==4 );
4396 }
4397 nCell++;
drh4b70f112004-05-02 21:12:19 +00004398 }
drh8b2f49b2001-06-08 00:21:52 +00004399 }
4400 }
4401
4402 /*
drh6019e162001-07-02 17:51:45 +00004403 ** Figure out the number of pages needed to hold all nCell cells.
4404 ** Store this number in "k". Also compute szNew[] which is the total
4405 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00004406 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00004407 ** cntNew[k] should equal nCell.
4408 **
drh96f5b762004-05-16 16:24:36 +00004409 ** Values computed by this block:
4410 **
4411 ** k: The total number of sibling pages
4412 ** szNew[i]: Spaced used on the i-th sibling page.
4413 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
4414 ** the right of the i-th sibling page.
4415 ** usableSpace: Number of bytes of space available on each sibling.
4416 **
drh8b2f49b2001-06-08 00:21:52 +00004417 */
drh43605152004-05-29 21:46:49 +00004418 usableSpace = pBt->usableSize - 12 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00004419 for(subtotal=k=i=0; i<nCell; i++){
danielk1977634f2982005-03-28 08:44:07 +00004420 assert( i<nMaxCells );
drh43605152004-05-29 21:46:49 +00004421 subtotal += szCell[i] + 2;
drh4b70f112004-05-02 21:12:19 +00004422 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00004423 szNew[k] = subtotal - szCell[i];
4424 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00004425 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00004426 subtotal = 0;
4427 k++;
4428 }
4429 }
4430 szNew[k] = subtotal;
4431 cntNew[k] = nCell;
4432 k++;
drh96f5b762004-05-16 16:24:36 +00004433
4434 /*
4435 ** The packing computed by the previous block is biased toward the siblings
4436 ** on the left side. The left siblings are always nearly full, while the
4437 ** right-most sibling might be nearly empty. This block of code attempts
4438 ** to adjust the packing of siblings to get a better balance.
4439 **
4440 ** This adjustment is more than an optimization. The packing above might
4441 ** be so out of balance as to be illegal. For example, the right-most
4442 ** sibling might be completely empty. This adjustment is not optional.
4443 */
drh6019e162001-07-02 17:51:45 +00004444 for(i=k-1; i>0; i--){
drh96f5b762004-05-16 16:24:36 +00004445 int szRight = szNew[i]; /* Size of sibling on the right */
4446 int szLeft = szNew[i-1]; /* Size of sibling on the left */
4447 int r; /* Index of right-most cell in left sibling */
4448 int d; /* Index of first cell to the left of right sibling */
4449
4450 r = cntNew[i-1] - 1;
4451 d = r + 1 - leafData;
danielk1977634f2982005-03-28 08:44:07 +00004452 assert( d<nMaxCells );
4453 assert( r<nMaxCells );
drh43605152004-05-29 21:46:49 +00004454 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
4455 szRight += szCell[d] + 2;
4456 szLeft -= szCell[r] + 2;
drh6019e162001-07-02 17:51:45 +00004457 cntNew[i-1]--;
drh96f5b762004-05-16 16:24:36 +00004458 r = cntNew[i-1] - 1;
4459 d = r + 1 - leafData;
drh6019e162001-07-02 17:51:45 +00004460 }
drh96f5b762004-05-16 16:24:36 +00004461 szNew[i] = szRight;
4462 szNew[i-1] = szLeft;
drh6019e162001-07-02 17:51:45 +00004463 }
drh09d0deb2005-08-02 17:13:09 +00004464
  /* Either we found one or more cells (cntNew[0]>0) or we are a
  ** virtual root page. A virtual root page is when the real root
4467 ** page is page 1 and we are the only child of that page.
4468 */
4469 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
drh8b2f49b2001-06-08 00:21:52 +00004470
4471 /*
drh6b308672002-07-08 02:16:37 +00004472 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00004473 */
drh4b70f112004-05-02 21:12:19 +00004474 assert( pPage->pgno>1 );
4475 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00004476 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00004477 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00004478 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00004479 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00004480 pgnoNew[i] = pgnoOld[i];
4481 apOld[i] = 0;
danielk197728129562005-01-11 10:25:06 +00004482 rc = sqlite3pager_write(pNew->aData);
4483 if( rc ) goto balance_cleanup;
drh6b308672002-07-08 02:16:37 +00004484 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00004485 rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
drh6b308672002-07-08 02:16:37 +00004486 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00004487 apNew[i] = pNew;
drh6b308672002-07-08 02:16:37 +00004488 }
drh14acc042001-06-10 19:56:58 +00004489 nNew++;
drhda200cc2004-05-09 11:51:38 +00004490 zeroPage(pNew, pageFlags);
drh8b2f49b2001-06-08 00:21:52 +00004491 }
4492
danielk1977299b1872004-11-22 10:02:10 +00004493 /* Free any old pages that were not reused as new pages.
4494 */
4495 while( i<nOld ){
4496 rc = freePage(apOld[i]);
4497 if( rc ) goto balance_cleanup;
4498 releasePage(apOld[i]);
4499 apOld[i] = 0;
4500 i++;
4501 }
4502
drh8b2f49b2001-06-08 00:21:52 +00004503 /*
  ** Put the new pages in ascending order. This helps to
4505 ** keep entries in the disk file in order so that a scan
4506 ** of the table is a linear scan through the file. That
4507 ** in turn helps the operating system to deliver pages
4508 ** from the disk more rapidly.
4509 **
4510 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00004511 ** n is never more than NB (a small constant), that should
4512 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00004513 **
drhc3b70572003-01-04 19:44:07 +00004514 ** When NB==3, this one optimization makes the database
4515 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00004516 */
4517 for(i=0; i<k-1; i++){
4518 int minV = pgnoNew[i];
4519 int minI = i;
4520 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00004521 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00004522 minI = j;
4523 minV = pgnoNew[j];
4524 }
4525 }
4526 if( minI>i ){
4527 int t;
4528 MemPage *pT;
4529 t = pgnoNew[i];
4530 pT = apNew[i];
4531 pgnoNew[i] = pgnoNew[minI];
4532 apNew[i] = apNew[minI];
4533 pgnoNew[minI] = t;
4534 apNew[minI] = pT;
4535 }
4536 }
drha2fce642004-06-05 00:01:44 +00004537 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
drh24cd67e2004-05-10 16:18:47 +00004538 pgnoOld[0],
4539 nOld>=2 ? pgnoOld[1] : 0,
4540 nOld>=3 ? pgnoOld[2] : 0,
drh10c0fa62004-05-18 12:50:17 +00004541 pgnoNew[0], szNew[0],
4542 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
4543 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
drha2fce642004-06-05 00:01:44 +00004544 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
4545 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
drh24cd67e2004-05-10 16:18:47 +00004546
drhf9ffac92002-03-02 19:00:31 +00004547 /*
drh14acc042001-06-10 19:56:58 +00004548 ** Evenly distribute the data in apCell[] across the new pages.
4549 ** Insert divider cells into pParent as necessary.
4550 */
4551 j = 0;
4552 for(i=0; i<nNew; i++){
danielk1977ac11ee62005-01-15 12:45:51 +00004553 /* Assemble the new sibling page. */
drh14acc042001-06-10 19:56:58 +00004554 MemPage *pNew = apNew[i];
drh19642e52005-03-29 13:17:45 +00004555 assert( j<nMaxCells );
drh4b70f112004-05-02 21:12:19 +00004556 assert( pNew->pgno==pgnoNew[i] );
drhfa1a98a2004-05-14 19:08:17 +00004557 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
drh09d0deb2005-08-02 17:13:09 +00004558 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
drh43605152004-05-29 21:46:49 +00004559 assert( pNew->nOverflow==0 );
danielk1977ac11ee62005-01-15 12:45:51 +00004560
4561#ifndef SQLITE_OMIT_AUTOVACUUM
4562 /* If this is an auto-vacuum database, update the pointer map entries
4563 ** that point to the siblings that were rearranged. These can be: left
4564 ** children of cells, the right-child of the page, or overflow pages
4565 ** pointed to by cells.
4566 */
4567 if( pBt->autoVacuum ){
4568 for(k=j; k<cntNew[i]; k++){
danielk1977634f2982005-03-28 08:44:07 +00004569 assert( k<nMaxCells );
danielk1977ac11ee62005-01-15 12:45:51 +00004570 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
danielk197779a40da2005-01-16 08:00:01 +00004571 rc = ptrmapPutOvfl(pNew, k-j);
4572 if( rc!=SQLITE_OK ){
4573 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00004574 }
4575 }
4576 }
4577 }
4578#endif
4579
4580 j = cntNew[i];
4581
4582 /* If the sibling page assembled above was not the right-most sibling,
4583 ** insert a divider cell into the parent page.
4584 */
drh14acc042001-06-10 19:56:58 +00004585 if( i<nNew-1 && j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00004586 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00004587 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00004588 int sz;
danielk1977634f2982005-03-28 08:44:07 +00004589
4590 assert( j<nMaxCells );
drh8b18dd42004-05-12 19:18:15 +00004591 pCell = apCell[j];
4592 sz = szCell[j] + leafCorrection;
drh4b70f112004-05-02 21:12:19 +00004593 if( !pNew->leaf ){
drh43605152004-05-29 21:46:49 +00004594 memcpy(&pNew->aData[8], pCell, 4);
drh24cd67e2004-05-10 16:18:47 +00004595 pTemp = 0;
drh8b18dd42004-05-12 19:18:15 +00004596 }else if( leafData ){
danielk1977ac11ee62005-01-15 12:45:51 +00004597 /* If the tree is a leaf-data tree, and the siblings are leaves,
4598 ** then there is no divider cell in apCell[]. Instead, the divider
4599 ** cell consists of the integer key for the right-most cell of
4600 ** the sibling-page assembled above only.
4601 */
drh6f11bef2004-05-13 01:12:56 +00004602 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00004603 j--;
drh43605152004-05-29 21:46:49 +00004604 parseCellPtr(pNew, apCell[j], &info);
drhb6f41482004-05-14 01:58:11 +00004605 pCell = &aSpace[iSpace];
drh6f11bef2004-05-13 01:12:56 +00004606 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
drhb6f41482004-05-14 01:58:11 +00004607 iSpace += sz;
drh07d183d2005-05-01 22:52:42 +00004608 assert( iSpace<=pBt->pageSize*5 );
drh8b18dd42004-05-12 19:18:15 +00004609 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00004610 }else{
4611 pCell -= 4;
drhb6f41482004-05-14 01:58:11 +00004612 pTemp = &aSpace[iSpace];
4613 iSpace += sz;
drh07d183d2005-05-01 22:52:42 +00004614 assert( iSpace<=pBt->pageSize*5 );
drh4b70f112004-05-02 21:12:19 +00004615 }
danielk1977a3ad5e72005-01-07 08:56:44 +00004616 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
danielk1977e80463b2004-11-03 03:01:16 +00004617 if( rc!=SQLITE_OK ) goto balance_cleanup;
drh43605152004-05-29 21:46:49 +00004618 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
danielk1977ac11ee62005-01-15 12:45:51 +00004619#ifndef SQLITE_OMIT_AUTOVACUUM
4620 /* If this is an auto-vacuum database, and not a leaf-data tree,
4621 ** then update the pointer map with an entry for the overflow page
4622 ** that the cell just inserted points to (if any).
4623 */
4624 if( pBt->autoVacuum && !leafData ){
danielk197779a40da2005-01-16 08:00:01 +00004625 rc = ptrmapPutOvfl(pParent, nxDiv);
4626 if( rc!=SQLITE_OK ){
4627 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00004628 }
4629 }
4630#endif
drh14acc042001-06-10 19:56:58 +00004631 j++;
4632 nxDiv++;
4633 }
4634 }
drh6019e162001-07-02 17:51:45 +00004635 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00004636 if( (pageFlags & PTF_LEAF)==0 ){
drh43605152004-05-29 21:46:49 +00004637 memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
drh14acc042001-06-10 19:56:58 +00004638 }
drh43605152004-05-29 21:46:49 +00004639 if( nxDiv==pParent->nCell+pParent->nOverflow ){
drh4b70f112004-05-02 21:12:19 +00004640 /* Right-most sibling is the right-most child of pParent */
drh43605152004-05-29 21:46:49 +00004641 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
drh4b70f112004-05-02 21:12:19 +00004642 }else{
4643 /* Right-most sibling is the left child of the first entry in pParent
4644 ** past the right-most divider entry */
drh43605152004-05-29 21:46:49 +00004645 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00004646 }
4647
4648 /*
4649 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00004650 */
4651 for(i=0; i<nNew; i++){
danielk1977afcdd022004-10-31 16:25:42 +00004652 rc = reparentChildPages(apNew[i]);
4653 if( rc!=SQLITE_OK ) goto balance_cleanup;
drh8b2f49b2001-06-08 00:21:52 +00004654 }
danielk1977afcdd022004-10-31 16:25:42 +00004655 rc = reparentChildPages(pParent);
4656 if( rc!=SQLITE_OK ) goto balance_cleanup;
drh8b2f49b2001-06-08 00:21:52 +00004657
4658 /*
drh3a4c1412004-05-09 20:40:11 +00004659 ** Balance the parent page. Note that the current page (pPage) might
danielk1977ac11ee62005-01-15 12:45:51 +00004660 ** have been added to the freelist so it might no longer be initialized.
drh3a4c1412004-05-09 20:40:11 +00004661 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00004662 */
drhda200cc2004-05-09 11:51:38 +00004663 assert( pParent->isInit );
drh3a4c1412004-05-09 20:40:11 +00004664 /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
4665 /* pageIntegrity(pPage); // No! pPage might have been added to freelist */
danielk1977ac245ec2005-01-14 13:50:11 +00004666 rc = balance(pParent, 0);
drhda200cc2004-05-09 11:51:38 +00004667
drh8b2f49b2001-06-08 00:21:52 +00004668 /*
drh14acc042001-06-10 19:56:58 +00004669 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00004670 */
drh14acc042001-06-10 19:56:58 +00004671balance_cleanup:
drh2e38c322004-09-03 18:38:44 +00004672 sqliteFree(apCell);
drh8b2f49b2001-06-08 00:21:52 +00004673 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00004674 releasePage(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00004675 }
drh14acc042001-06-10 19:56:58 +00004676 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00004677 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00004678 }
drh91025292004-05-03 19:49:32 +00004679 releasePage(pParent);
drh3a4c1412004-05-09 20:40:11 +00004680 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
4681 pPage->pgno, nOld, nNew, nCell));
drh8b2f49b2001-06-08 00:21:52 +00004682 return rc;
4683}
4684
4685/*
drh43605152004-05-29 21:46:49 +00004686** This routine is called for the root page of a btree when the root
4687** page contains no cells. This is an opportunity to make the tree
4688** shallower by one level.
4689*/
4690static int balance_shallower(MemPage *pPage){
4691 MemPage *pChild; /* The only child page of pPage */
4692 Pgno pgnoChild; /* Page number for pChild */
drh2e38c322004-09-03 18:38:44 +00004693 int rc = SQLITE_OK; /* Return code from subprocedures */
danielk1977aef0bf62005-12-30 16:28:01 +00004694 BtShared *pBt; /* The main BTree structure */
drh2e38c322004-09-03 18:38:44 +00004695 int mxCellPerPage; /* Maximum number of cells per page */
4696 u8 **apCell; /* All cells from pages being balanced */
4697 int *szCell; /* Local size of all cells */
drh43605152004-05-29 21:46:49 +00004698
4699 assert( pPage->pParent==0 );
4700 assert( pPage->nCell==0 );
drh2e38c322004-09-03 18:38:44 +00004701 pBt = pPage->pBt;
4702 mxCellPerPage = MX_CELL(pBt);
4703 apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
4704 if( apCell==0 ) return SQLITE_NOMEM;
4705 szCell = (int*)&apCell[mxCellPerPage];
drh43605152004-05-29 21:46:49 +00004706 if( pPage->leaf ){
4707 /* The table is completely empty */
4708 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
4709 }else{
4710 /* The root page is empty but has one child. Transfer the
4711 ** information from that one child into the root page if it
4712 ** will fit. This reduces the depth of the tree by one.
4713 **
4714 ** If the root page is page 1, it has less space available than
4715 ** its child (due to the 100 byte header that occurs at the beginning
4716 ** of the database fle), so it might not be able to hold all of the
4717 ** information currently contained in the child. If this is the
4718 ** case, then do not do the transfer. Leave page 1 empty except
4719 ** for the right-pointer to the child page. The child page becomes
4720 ** the virtual root of the tree.
4721 */
4722 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4723 assert( pgnoChild>0 );
4724 assert( pgnoChild<=sqlite3pager_pagecount(pPage->pBt->pPager) );
4725 rc = getPage(pPage->pBt, pgnoChild, &pChild);
drh2e38c322004-09-03 18:38:44 +00004726 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00004727 if( pPage->pgno==1 ){
4728 rc = initPage(pChild, pPage);
drh2e38c322004-09-03 18:38:44 +00004729 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00004730 assert( pChild->nOverflow==0 );
4731 if( pChild->nFree>=100 ){
4732 /* The child information will fit on the root page, so do the
4733 ** copy */
4734 int i;
4735 zeroPage(pPage, pChild->aData[0]);
4736 for(i=0; i<pChild->nCell; i++){
4737 apCell[i] = findCell(pChild,i);
4738 szCell[i] = cellSizePtr(pChild, apCell[i]);
4739 }
4740 assemblePage(pPage, pChild->nCell, apCell, szCell);
danielk1977ae825582004-11-23 09:06:55 +00004741 /* Copy the right-pointer of the child to the parent. */
4742 put4byte(&pPage->aData[pPage->hdrOffset+8],
4743 get4byte(&pChild->aData[pChild->hdrOffset+8]));
drh43605152004-05-29 21:46:49 +00004744 freePage(pChild);
4745 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
4746 }else{
4747 /* The child has more information that will fit on the root.
4748 ** The tree is already balanced. Do nothing. */
4749 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
4750 }
4751 }else{
4752 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
4753 pPage->isInit = 0;
4754 pPage->pParent = 0;
4755 rc = initPage(pPage, 0);
4756 assert( rc==SQLITE_OK );
4757 freePage(pChild);
4758 TRACE(("BALANCE: transfer child %d into root %d\n",
4759 pChild->pgno, pPage->pgno));
4760 }
danielk1977afcdd022004-10-31 16:25:42 +00004761 rc = reparentChildPages(pPage);
danielk1977ac11ee62005-01-15 12:45:51 +00004762 assert( pPage->nOverflow==0 );
4763#ifndef SQLITE_OMIT_AUTOVACUUM
4764 if( pBt->autoVacuum ){
danielk1977aac0a382005-01-16 11:07:06 +00004765 int i;
danielk1977ac11ee62005-01-15 12:45:51 +00004766 for(i=0; i<pPage->nCell; i++){
danielk197779a40da2005-01-16 08:00:01 +00004767 rc = ptrmapPutOvfl(pPage, i);
4768 if( rc!=SQLITE_OK ){
4769 goto end_shallow_balance;
danielk1977ac11ee62005-01-15 12:45:51 +00004770 }
4771 }
4772 }
4773#endif
danielk1977afcdd022004-10-31 16:25:42 +00004774 if( rc!=SQLITE_OK ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00004775 releasePage(pChild);
4776 }
drh2e38c322004-09-03 18:38:44 +00004777end_shallow_balance:
4778 sqliteFree(apCell);
4779 return rc;
drh43605152004-05-29 21:46:49 +00004780}
4781
4782
4783/*
4784** The root page is overfull
4785**
4786** When this happens, Create a new child page and copy the
4787** contents of the root into the child. Then make the root
4788** page an empty page with rightChild pointing to the new
4789** child. Finally, call balance_internal() on the new child
4790** to cause it to split.
4791*/
4792static int balance_deeper(MemPage *pPage){
4793 int rc; /* Return value from subprocedures */
4794 MemPage *pChild; /* Pointer to a new child page */
4795 Pgno pgnoChild; /* Page number of the new child page */
danielk1977aef0bf62005-12-30 16:28:01 +00004796 BtShared *pBt; /* The BTree */
drh43605152004-05-29 21:46:49 +00004797 int usableSize; /* Total usable size of a page */
4798 u8 *data; /* Content of the parent page */
4799 u8 *cdata; /* Content of the child page */
4800 int hdr; /* Offset to page header in parent */
4801 int brk; /* Offset to content of first cell in parent */
4802
4803 assert( pPage->pParent==0 );
4804 assert( pPage->nOverflow>0 );
4805 pBt = pPage->pBt;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004806 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
drh43605152004-05-29 21:46:49 +00004807 if( rc ) return rc;
4808 assert( sqlite3pager_iswriteable(pChild->aData) );
4809 usableSize = pBt->usableSize;
4810 data = pPage->aData;
4811 hdr = pPage->hdrOffset;
4812 brk = get2byte(&data[hdr+5]);
4813 cdata = pChild->aData;
4814 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
4815 memcpy(&cdata[brk], &data[brk], usableSize-brk);
danielk1977c7dc7532004-11-17 10:22:03 +00004816 assert( pChild->isInit==0 );
drh43605152004-05-29 21:46:49 +00004817 rc = initPage(pChild, pPage);
danielk19776b456a22005-03-21 04:04:02 +00004818 if( rc ) goto balancedeeper_out;
drh43605152004-05-29 21:46:49 +00004819 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
4820 pChild->nOverflow = pPage->nOverflow;
4821 if( pChild->nOverflow ){
4822 pChild->nFree = 0;
4823 }
4824 assert( pChild->nCell==pPage->nCell );
4825 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
4826 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
4827 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
danielk19774e17d142005-01-16 09:06:33 +00004828#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977ac11ee62005-01-15 12:45:51 +00004829 if( pBt->autoVacuum ){
4830 int i;
4831 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
danielk19776b456a22005-03-21 04:04:02 +00004832 if( rc ) goto balancedeeper_out;
danielk1977ac11ee62005-01-15 12:45:51 +00004833 for(i=0; i<pChild->nCell; i++){
danielk197779a40da2005-01-16 08:00:01 +00004834 rc = ptrmapPutOvfl(pChild, i);
4835 if( rc!=SQLITE_OK ){
4836 return rc;
danielk1977ac11ee62005-01-15 12:45:51 +00004837 }
4838 }
4839 }
danielk19774e17d142005-01-16 09:06:33 +00004840#endif
drh43605152004-05-29 21:46:49 +00004841 rc = balance_nonroot(pChild);
danielk19776b456a22005-03-21 04:04:02 +00004842
4843balancedeeper_out:
drh43605152004-05-29 21:46:49 +00004844 releasePage(pChild);
4845 return rc;
4846}
4847
4848/*
4849** Decide if the page pPage needs to be balanced. If balancing is
4850** required, call the appropriate balancing routine.
4851*/
danielk1977ac245ec2005-01-14 13:50:11 +00004852static int balance(MemPage *pPage, int insert){
drh43605152004-05-29 21:46:49 +00004853 int rc = SQLITE_OK;
4854 if( pPage->pParent==0 ){
4855 if( pPage->nOverflow>0 ){
4856 rc = balance_deeper(pPage);
4857 }
danielk1977687566d2004-11-02 12:56:41 +00004858 if( rc==SQLITE_OK && pPage->nCell==0 ){
drh43605152004-05-29 21:46:49 +00004859 rc = balance_shallower(pPage);
4860 }
4861 }else{
danielk1977ac245ec2005-01-14 13:50:11 +00004862 if( pPage->nOverflow>0 ||
4863 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
drh43605152004-05-29 21:46:49 +00004864 rc = balance_nonroot(pPage);
4865 }
4866 }
4867 return rc;
4868}
4869
4870/*
drh8dcd7ca2004-08-08 19:43:29 +00004871** This routine checks all cursors that point to table pgnoRoot.
4872** If any of those cursors other than pExclude were opened with
drhf74b8d92002-09-01 23:20:45 +00004873** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
drh8dcd7ca2004-08-08 19:43:29 +00004874** cursors that point to pgnoRoot were opened with wrFlag==1
drhf74b8d92002-09-01 23:20:45 +00004875** then this routine returns SQLITE_OK.
danielk1977299b1872004-11-22 10:02:10 +00004876**
4877** In addition to checking for read-locks (where a read-lock
4878** means a cursor opened with wrFlag==0) this routine also moves
4879** all cursors other than pExclude so that they are pointing to the
4880** first Cell on root page. This is necessary because an insert
4881** or delete might change the number of cells on a page or delete
4882** a page entirely and we do not want to leave any cursors
4883** pointing to non-existant pages or cells.
drhf74b8d92002-09-01 23:20:45 +00004884*/
danielk1977aef0bf62005-12-30 16:28:01 +00004885static int checkReadLocks(BtShared *pBt, Pgno pgnoRoot, BtCursor *pExclude){
danielk1977299b1872004-11-22 10:02:10 +00004886 BtCursor *p;
4887 for(p=pBt->pCursor; p; p=p->pNext){
4888 if( p->pgnoRoot!=pgnoRoot || p==pExclude ) continue;
4889 if( p->wrFlag==0 ) return SQLITE_LOCKED;
4890 if( p->pPage->pgno!=p->pgnoRoot ){
4891 moveToRoot(p);
4892 }
4893 }
drhf74b8d92002-09-01 23:20:45 +00004894 return SQLITE_OK;
4895}
4896
4897/*
drh3b7511c2001-05-26 13:15:44 +00004898** Insert a new record into the BTree. The key is given by (pKey,nKey)
4899** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00004900** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00004901** is left pointing at a random location.
4902**
4903** For an INTKEY table, only the nKey value of the key is used. pKey is
4904** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00004905*/
drh3aac2dd2004-04-26 14:10:20 +00004906int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00004907 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00004908 const void *pKey, i64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00004909 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00004910){
drh3b7511c2001-05-26 13:15:44 +00004911 int rc;
4912 int loc;
drh14acc042001-06-10 19:56:58 +00004913 int szNew;
drh3b7511c2001-05-26 13:15:44 +00004914 MemPage *pPage;
danielk1977aef0bf62005-12-30 16:28:01 +00004915 BtShared *pBt = pCur->pBtree->pBt;
drha34b6762004-05-07 13:30:42 +00004916 unsigned char *oldCell;
drh2e38c322004-09-03 18:38:44 +00004917 unsigned char *newCell = 0;
drh3b7511c2001-05-26 13:15:44 +00004918
danielk1977aef0bf62005-12-30 16:28:01 +00004919 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00004920 /* Must start a transaction before doing an insert */
4921 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00004922 }
drhf74b8d92002-09-01 23:20:45 +00004923 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00004924 if( !pCur->wrFlag ){
4925 return SQLITE_PERM; /* Cursor not open for writing */
4926 }
drh8dcd7ca2004-08-08 19:43:29 +00004927 if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
drhf74b8d92002-09-01 23:20:45 +00004928 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
4929 }
drh3aac2dd2004-04-26 14:10:20 +00004930 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00004931 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00004932 pPage = pCur->pPage;
drh4a1c3802004-05-12 15:15:47 +00004933 assert( pPage->intKey || nKey>=0 );
drh8b18dd42004-05-12 19:18:15 +00004934 assert( pPage->leaf || !pPage->leafData );
drh3a4c1412004-05-09 20:40:11 +00004935 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
4936 pCur->pgnoRoot, nKey, nData, pPage->pgno,
4937 loc==0 ? "overwrite" : "new entry"));
drh7aa128d2002-06-21 13:09:16 +00004938 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00004939 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00004940 if( rc ) return rc;
drh2e38c322004-09-03 18:38:44 +00004941 newCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
4942 if( newCell==0 ) return SQLITE_NOMEM;
drha34b6762004-05-07 13:30:42 +00004943 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
drh2e38c322004-09-03 18:38:44 +00004944 if( rc ) goto end_insert;
drh43605152004-05-29 21:46:49 +00004945 assert( szNew==cellSizePtr(pPage, newCell) );
drh2e38c322004-09-03 18:38:44 +00004946 assert( szNew<=MX_CELL_SIZE(pBt) );
drhf328bc82004-05-10 23:29:49 +00004947 if( loc==0 && pCur->isValid ){
drha34b6762004-05-07 13:30:42 +00004948 int szOld;
4949 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00004950 oldCell = findCell(pPage, pCur->idx);
drh4b70f112004-05-02 21:12:19 +00004951 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00004952 memcpy(newCell, oldCell, 4);
drh4b70f112004-05-02 21:12:19 +00004953 }
drh43605152004-05-29 21:46:49 +00004954 szOld = cellSizePtr(pPage, oldCell);
drh4b70f112004-05-02 21:12:19 +00004955 rc = clearCell(pPage, oldCell);
drh2e38c322004-09-03 18:38:44 +00004956 if( rc ) goto end_insert;
drh4b70f112004-05-02 21:12:19 +00004957 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00004958 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00004959 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00004960 pCur->idx++;
drh271efa52004-05-30 19:19:05 +00004961 pCur->info.nSize = 0;
drh14acc042001-06-10 19:56:58 +00004962 }else{
drh4b70f112004-05-02 21:12:19 +00004963 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00004964 }
danielk1977a3ad5e72005-01-07 08:56:44 +00004965 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
danielk1977e80463b2004-11-03 03:01:16 +00004966 if( rc!=SQLITE_OK ) goto end_insert;
danielk1977ac245ec2005-01-14 13:50:11 +00004967 rc = balance(pPage, 1);
drh23e11ca2004-05-04 17:27:28 +00004968 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
drh3fc190c2001-09-14 03:24:23 +00004969 /* fflush(stdout); */
danielk1977299b1872004-11-22 10:02:10 +00004970 if( rc==SQLITE_OK ){
4971 moveToRoot(pCur);
4972 }
drh2e38c322004-09-03 18:38:44 +00004973end_insert:
4974 sqliteFree(newCell);
drh5e2f8b92001-05-28 00:41:15 +00004975 return rc;
4976}
4977
4978/*
drh4b70f112004-05-02 21:12:19 +00004979** Delete the entry that the cursor is pointing to. The cursor
4980** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00004981*/
drh3aac2dd2004-04-26 14:10:20 +00004982int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00004983 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00004984 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00004985 int rc;
danielk1977cfe9a692004-06-16 12:00:29 +00004986 Pgno pgnoChild = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00004987 BtShared *pBt = pCur->pBtree->pBt;
drh8b2f49b2001-06-08 00:21:52 +00004988
drh7aa128d2002-06-21 13:09:16 +00004989 assert( pPage->isInit );
danielk1977aef0bf62005-12-30 16:28:01 +00004990 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00004991 /* Must start a transaction before doing a delete */
4992 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00004993 }
drhf74b8d92002-09-01 23:20:45 +00004994 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00004995 if( pCur->idx >= pPage->nCell ){
4996 return SQLITE_ERROR; /* The cursor is not pointing to anything */
4997 }
drhecdc7532001-09-23 02:35:53 +00004998 if( !pCur->wrFlag ){
4999 return SQLITE_PERM; /* Did not open this cursor for writing */
5000 }
drh8dcd7ca2004-08-08 19:43:29 +00005001 if( checkReadLocks(pBt, pCur->pgnoRoot, pCur) ){
drhf74b8d92002-09-01 23:20:45 +00005002 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5003 }
drha34b6762004-05-07 13:30:42 +00005004 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00005005 if( rc ) return rc;
danielk1977e6efa742004-11-10 11:55:10 +00005006
5007 /* Locate the cell within it's page and leave pCell pointing to the
5008 ** data. The clearCell() call frees any overflow pages associated with the
5009 ** cell. The cell itself is still intact.
5010 */
danielk1977299b1872004-11-22 10:02:10 +00005011 pCell = findCell(pPage, pCur->idx);
drh4b70f112004-05-02 21:12:19 +00005012 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005013 pgnoChild = get4byte(pCell);
drh4b70f112004-05-02 21:12:19 +00005014 }
danielk197728129562005-01-11 10:25:06 +00005015 rc = clearCell(pPage, pCell);
5016 if( rc ) return rc;
danielk1977e6efa742004-11-10 11:55:10 +00005017
drh4b70f112004-05-02 21:12:19 +00005018 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00005019 /*
drh5e00f6c2001-09-13 13:46:56 +00005020 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00005021 ** do something we will leave a hole on an internal page.
5022 ** We have to fill the hole by moving in a cell from a leaf. The
5023 ** next Cell after the one to be deleted is guaranteed to exist and
danielk1977299b1872004-11-22 10:02:10 +00005024 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00005025 */
drh14acc042001-06-10 19:56:58 +00005026 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00005027 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00005028 int szNext;
danielk1977299b1872004-11-22 10:02:10 +00005029 int notUsed;
danielk19776b456a22005-03-21 04:04:02 +00005030 unsigned char *tempCell = 0;
drh8b18dd42004-05-12 19:18:15 +00005031 assert( !pPage->leafData );
drh14acc042001-06-10 19:56:58 +00005032 getTempCursor(pCur, &leafCur);
danielk1977299b1872004-11-22 10:02:10 +00005033 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00005034 if( rc!=SQLITE_OK ){
drhee696e22004-08-30 16:52:17 +00005035 if( rc!=SQLITE_NOMEM ){
drh49285702005-09-17 15:20:26 +00005036 rc = SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00005037 }
drh5e2f8b92001-05-28 00:41:15 +00005038 }
danielk19776b456a22005-03-21 04:04:02 +00005039 if( rc==SQLITE_OK ){
5040 rc = sqlite3pager_write(leafCur.pPage->aData);
5041 }
5042 if( rc==SQLITE_OK ){
5043 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5044 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
5045 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5046 pNext = findCell(leafCur.pPage, leafCur.idx);
5047 szNext = cellSizePtr(leafCur.pPage, pNext);
5048 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5049 tempCell = sqliteMallocRaw( MX_CELL_SIZE(pBt) );
5050 if( tempCell==0 ){
5051 rc = SQLITE_NOMEM;
5052 }
5053 }
5054 if( rc==SQLITE_OK ){
5055 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
5056 }
5057 if( rc==SQLITE_OK ){
5058 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
5059 rc = balance(pPage, 0);
5060 }
5061 if( rc==SQLITE_OK ){
5062 dropCell(leafCur.pPage, leafCur.idx, szNext);
5063 rc = balance(leafCur.pPage, 0);
5064 }
drh2e38c322004-09-03 18:38:44 +00005065 sqliteFree(tempCell);
drh8c42ca92001-06-22 19:15:00 +00005066 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00005067 }else{
danielk1977299b1872004-11-22 10:02:10 +00005068 TRACE(("DELETE: table=%d delete from leaf %d\n",
5069 pCur->pgnoRoot, pPage->pgno));
5070 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
danielk1977ac245ec2005-01-14 13:50:11 +00005071 rc = balance(pPage, 0);
drh5e2f8b92001-05-28 00:41:15 +00005072 }
danielk19776b456a22005-03-21 04:04:02 +00005073 if( rc==SQLITE_OK ){
5074 moveToRoot(pCur);
5075 }
drh5e2f8b92001-05-28 00:41:15 +00005076 return rc;
drh3b7511c2001-05-26 13:15:44 +00005077}
drh8b2f49b2001-06-08 00:21:52 +00005078
5079/*
drhc6b52df2002-01-04 03:09:29 +00005080** Create a new BTree table. Write into *piTable the page
5081** number for the root page of the new table.
5082**
drhab01f612004-05-22 02:55:23 +00005083** The type of type is determined by the flags parameter. Only the
5084** following values of flags are currently in use. Other values for
5085** flags might not work:
5086**
5087** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
5088** BTREE_ZERODATA Used for SQL indices
drh8b2f49b2001-06-08 00:21:52 +00005089*/
danielk1977aef0bf62005-12-30 16:28:01 +00005090int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
5091 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00005092 MemPage *pRoot;
5093 Pgno pgnoRoot;
5094 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00005095 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005096 /* Must start a transaction first */
5097 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00005098 }
danielk197728129562005-01-11 10:25:06 +00005099 assert( !pBt->readOnly );
danielk1977e6efa742004-11-10 11:55:10 +00005100
5101 /* It is illegal to create a table if any cursors are open on the
5102 ** database. This is because in auto-vacuum mode the backend may
5103 ** need to move a database page to make room for the new root-page.
5104 ** If an open cursor was using the page a problem would occur.
5105 */
5106 if( pBt->pCursor ){
5107 return SQLITE_LOCKED;
5108 }
5109
danielk1977003ba062004-11-04 02:57:33 +00005110#ifdef SQLITE_OMIT_AUTOVACUUM
danielk1977cb1a7eb2004-11-05 12:27:02 +00005111 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
drh8b2f49b2001-06-08 00:21:52 +00005112 if( rc ) return rc;
danielk1977003ba062004-11-04 02:57:33 +00005113#else
danielk1977687566d2004-11-02 12:56:41 +00005114 if( pBt->autoVacuum ){
danielk1977003ba062004-11-04 02:57:33 +00005115 Pgno pgnoMove; /* Move a page here to make room for the root-page */
5116 MemPage *pPageMove; /* The page to move to. */
5117
danielk1977003ba062004-11-04 02:57:33 +00005118 /* Read the value of meta[3] from the database to determine where the
5119 ** root page of the new table should go. meta[3] is the largest root-page
5120 ** created so far, so the new root-page is (meta[3]+1).
5121 */
danielk1977aef0bf62005-12-30 16:28:01 +00005122 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00005123 if( rc!=SQLITE_OK ) return rc;
5124 pgnoRoot++;
5125
danielk1977599fcba2004-11-08 07:13:13 +00005126 /* The new root-page may not be allocated on a pointer-map page, or the
5127 ** PENDING_BYTE page.
5128 */
drh42cac6d2004-11-20 20:31:11 +00005129 if( pgnoRoot==PTRMAP_PAGENO(pBt->usableSize, pgnoRoot) ||
danielk1977599fcba2004-11-08 07:13:13 +00005130 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
danielk1977003ba062004-11-04 02:57:33 +00005131 pgnoRoot++;
5132 }
5133 assert( pgnoRoot>=3 );
5134
5135 /* Allocate a page. The page that currently resides at pgnoRoot will
5136 ** be moved to the allocated page (unless the allocated page happens
5137 ** to reside at pgnoRoot).
5138 */
danielk1977cb1a7eb2004-11-05 12:27:02 +00005139 rc = allocatePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
danielk1977003ba062004-11-04 02:57:33 +00005140 if( rc!=SQLITE_OK ){
danielk1977687566d2004-11-02 12:56:41 +00005141 return rc;
5142 }
danielk1977003ba062004-11-04 02:57:33 +00005143
5144 if( pgnoMove!=pgnoRoot ){
5145 u8 eType;
5146 Pgno iPtrPage;
5147
5148 releasePage(pPageMove);
5149 rc = getPage(pBt, pgnoRoot, &pRoot);
5150 if( rc!=SQLITE_OK ){
5151 return rc;
5152 }
5153 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
drhccae6022005-02-26 17:31:26 +00005154 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00005155 releasePage(pRoot);
5156 return rc;
5157 }
drhccae6022005-02-26 17:31:26 +00005158 assert( eType!=PTRMAP_ROOTPAGE );
5159 assert( eType!=PTRMAP_FREEPAGE );
danielk19775fd057a2005-03-09 13:09:43 +00005160 rc = sqlite3pager_write(pRoot->aData);
5161 if( rc!=SQLITE_OK ){
5162 releasePage(pRoot);
5163 return rc;
5164 }
danielk1977003ba062004-11-04 02:57:33 +00005165 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove);
5166 releasePage(pRoot);
5167 if( rc!=SQLITE_OK ){
5168 return rc;
5169 }
5170 rc = getPage(pBt, pgnoRoot, &pRoot);
5171 if( rc!=SQLITE_OK ){
5172 return rc;
5173 }
5174 rc = sqlite3pager_write(pRoot->aData);
5175 if( rc!=SQLITE_OK ){
5176 releasePage(pRoot);
5177 return rc;
5178 }
5179 }else{
5180 pRoot = pPageMove;
5181 }
5182
danielk197742741be2005-01-08 12:42:39 +00005183 /* Update the pointer-map and meta-data with the new root-page number. */
danielk1977003ba062004-11-04 02:57:33 +00005184 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
5185 if( rc ){
5186 releasePage(pRoot);
5187 return rc;
5188 }
danielk1977aef0bf62005-12-30 16:28:01 +00005189 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00005190 if( rc ){
5191 releasePage(pRoot);
5192 return rc;
5193 }
danielk197742741be2005-01-08 12:42:39 +00005194
danielk1977003ba062004-11-04 02:57:33 +00005195 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00005196 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1, 0);
danielk1977003ba062004-11-04 02:57:33 +00005197 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00005198 }
5199#endif
drha34b6762004-05-07 13:30:42 +00005200 assert( sqlite3pager_iswriteable(pRoot->aData) );
drhde647132004-05-07 17:57:49 +00005201 zeroPage(pRoot, flags | PTF_LEAF);
drha34b6762004-05-07 13:30:42 +00005202 sqlite3pager_unref(pRoot->aData);
drh8b2f49b2001-06-08 00:21:52 +00005203 *piTable = (int)pgnoRoot;
5204 return SQLITE_OK;
5205}
5206
5207/*
5208** Erase the given database page and all its children. Return
5209** the page to the freelist.
5210*/
drh4b70f112004-05-02 21:12:19 +00005211static int clearDatabasePage(
danielk1977aef0bf62005-12-30 16:28:01 +00005212 BtShared *pBt, /* The BTree that contains the table */
drh4b70f112004-05-02 21:12:19 +00005213 Pgno pgno, /* Page number to clear */
5214 MemPage *pParent, /* Parent page. NULL for the root */
5215 int freePageFlag /* Deallocate page if true */
5216){
danielk19776b456a22005-03-21 04:04:02 +00005217 MemPage *pPage = 0;
drh8b2f49b2001-06-08 00:21:52 +00005218 int rc;
drh4b70f112004-05-02 21:12:19 +00005219 unsigned char *pCell;
5220 int i;
drh8b2f49b2001-06-08 00:21:52 +00005221
danielk1977a1cb1832005-02-12 08:59:55 +00005222 if( pgno>sqlite3pager_pagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00005223 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00005224 }
5225
drhde647132004-05-07 17:57:49 +00005226 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
danielk19776b456a22005-03-21 04:04:02 +00005227 if( rc ) goto cleardatabasepage_out;
drha34b6762004-05-07 13:30:42 +00005228 rc = sqlite3pager_write(pPage->aData);
danielk19776b456a22005-03-21 04:04:02 +00005229 if( rc ) goto cleardatabasepage_out;
drh4b70f112004-05-02 21:12:19 +00005230 for(i=0; i<pPage->nCell; i++){
drh43605152004-05-29 21:46:49 +00005231 pCell = findCell(pPage, i);
drh4b70f112004-05-02 21:12:19 +00005232 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005233 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
danielk19776b456a22005-03-21 04:04:02 +00005234 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00005235 }
drh4b70f112004-05-02 21:12:19 +00005236 rc = clearCell(pPage, pCell);
danielk19776b456a22005-03-21 04:04:02 +00005237 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00005238 }
drha34b6762004-05-07 13:30:42 +00005239 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005240 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
danielk19776b456a22005-03-21 04:04:02 +00005241 if( rc ) goto cleardatabasepage_out;
drh2aa679f2001-06-25 02:11:07 +00005242 }
5243 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00005244 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00005245 }else{
drh3a4c1412004-05-09 20:40:11 +00005246 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00005247 }
danielk19776b456a22005-03-21 04:04:02 +00005248
5249cleardatabasepage_out:
drh4b70f112004-05-02 21:12:19 +00005250 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00005251 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005252}
5253
5254/*
drhab01f612004-05-22 02:55:23 +00005255** Delete all information from a single table in the database. iTable is
5256** the page number of the root of the table. After this routine returns,
5257** the root page is empty, but still exists.
5258**
5259** This routine will fail with SQLITE_LOCKED if there are any open
5260** read cursors on the table. Open write cursors are moved to the
5261** root of the table.
drh8b2f49b2001-06-08 00:21:52 +00005262*/
danielk1977aef0bf62005-12-30 16:28:01 +00005263int sqlite3BtreeClearTable(Btree *p, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00005264 int rc;
drhf74b8d92002-09-01 23:20:45 +00005265 BtCursor *pCur;
danielk1977aef0bf62005-12-30 16:28:01 +00005266 BtShared *pBt = p->pBt;
5267 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005268 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00005269 }
drhf74b8d92002-09-01 23:20:45 +00005270 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
5271 if( pCur->pgnoRoot==(Pgno)iTable ){
5272 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
5273 moveToRoot(pCur);
5274 }
drhecdc7532001-09-23 02:35:53 +00005275 }
drha34b6762004-05-07 13:30:42 +00005276 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
danielk197771fd80b2005-12-16 06:54:01 +00005277#if 0
drh8b2f49b2001-06-08 00:21:52 +00005278 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00005279 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00005280 }
danielk197771fd80b2005-12-16 06:54:01 +00005281#endif
drh8c42ca92001-06-22 19:15:00 +00005282 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005283}
5284
5285/*
5286** Erase all information in a table and add the root of the table to
5287** the freelist. Except, the root of the principle table (the one on
drhab01f612004-05-22 02:55:23 +00005288** page 1) is never added to the freelist.
5289**
5290** This routine will fail with SQLITE_LOCKED if there are any open
5291** cursors on the table.
drh205f48e2004-11-05 00:43:11 +00005292**
5293** If AUTOVACUUM is enabled and the page at iTable is not the last
5294** root page in the database file, then the last root page
5295** in the database file is moved into the slot formerly occupied by
5296** iTable and that last slot formerly occupied by the last root page
5297** is added to the freelist instead of iTable. In this say, all
5298** root pages are kept at the beginning of the database file, which
5299** is necessary for AUTOVACUUM to work right. *piMoved is set to the
5300** page number that used to be the last root page in the file before
5301** the move. If no page gets moved, *piMoved is set to 0.
5302** The last root page is recorded in meta[3] and the value of
5303** meta[3] is updated by this procedure.
drh8b2f49b2001-06-08 00:21:52 +00005304*/
danielk1977aef0bf62005-12-30 16:28:01 +00005305int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
drh8b2f49b2001-06-08 00:21:52 +00005306 int rc;
danielk1977a0bf2652004-11-04 14:30:04 +00005307 MemPage *pPage = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00005308 BtShared *pBt = p->pBt;
danielk1977a0bf2652004-11-04 14:30:04 +00005309
danielk1977aef0bf62005-12-30 16:28:01 +00005310 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005311 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00005312 }
danielk1977a0bf2652004-11-04 14:30:04 +00005313
danielk1977e6efa742004-11-10 11:55:10 +00005314 /* It is illegal to drop a table if any cursors are open on the
5315 ** database. This is because in auto-vacuum mode the backend may
5316 ** need to move another root-page to fill a gap left by the deleted
5317 ** root page. If an open cursor was using this page a problem would
5318 ** occur.
5319 */
5320 if( pBt->pCursor ){
5321 return SQLITE_LOCKED;
drh5df72a52002-06-06 23:16:05 +00005322 }
danielk1977a0bf2652004-11-04 14:30:04 +00005323
drha34b6762004-05-07 13:30:42 +00005324 rc = getPage(pBt, (Pgno)iTable, &pPage);
drh2aa679f2001-06-25 02:11:07 +00005325 if( rc ) return rc;
danielk1977aef0bf62005-12-30 16:28:01 +00005326 rc = sqlite3BtreeClearTable(p, iTable);
danielk19776b456a22005-03-21 04:04:02 +00005327 if( rc ){
5328 releasePage(pPage);
5329 return rc;
5330 }
danielk1977a0bf2652004-11-04 14:30:04 +00005331
drh205f48e2004-11-05 00:43:11 +00005332 *piMoved = 0;
danielk1977a0bf2652004-11-04 14:30:04 +00005333
drh4b70f112004-05-02 21:12:19 +00005334 if( iTable>1 ){
danielk1977a0bf2652004-11-04 14:30:04 +00005335#ifdef SQLITE_OMIT_AUTOVACUUM
drha34b6762004-05-07 13:30:42 +00005336 rc = freePage(pPage);
danielk1977a0bf2652004-11-04 14:30:04 +00005337 releasePage(pPage);
5338#else
5339 if( pBt->autoVacuum ){
5340 Pgno maxRootPgno;
danielk1977aef0bf62005-12-30 16:28:01 +00005341 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00005342 if( rc!=SQLITE_OK ){
5343 releasePage(pPage);
5344 return rc;
5345 }
5346
5347 if( iTable==maxRootPgno ){
5348 /* If the table being dropped is the table with the largest root-page
5349 ** number in the database, put the root page on the free list.
5350 */
5351 rc = freePage(pPage);
5352 releasePage(pPage);
5353 if( rc!=SQLITE_OK ){
5354 return rc;
5355 }
5356 }else{
5357 /* The table being dropped does not have the largest root-page
5358 ** number in the database. So move the page that does into the
5359 ** gap left by the deleted root-page.
5360 */
5361 MemPage *pMove;
5362 releasePage(pPage);
5363 rc = getPage(pBt, maxRootPgno, &pMove);
5364 if( rc!=SQLITE_OK ){
5365 return rc;
5366 }
5367 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable);
5368 releasePage(pMove);
5369 if( rc!=SQLITE_OK ){
5370 return rc;
5371 }
5372 rc = getPage(pBt, maxRootPgno, &pMove);
5373 if( rc!=SQLITE_OK ){
5374 return rc;
5375 }
5376 rc = freePage(pMove);
5377 releasePage(pMove);
5378 if( rc!=SQLITE_OK ){
5379 return rc;
5380 }
5381 *piMoved = maxRootPgno;
5382 }
5383
danielk1977599fcba2004-11-08 07:13:13 +00005384 /* Set the new 'max-root-page' value in the database header. This
5385 ** is the old value less one, less one more if that happens to
5386 ** be a root-page number, less one again if that is the
5387 ** PENDING_BYTE_PAGE.
5388 */
danielk197787a6e732004-11-05 12:58:25 +00005389 maxRootPgno--;
danielk1977599fcba2004-11-08 07:13:13 +00005390 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
5391 maxRootPgno--;
5392 }
drh42cac6d2004-11-20 20:31:11 +00005393 if( maxRootPgno==PTRMAP_PAGENO(pBt->usableSize, maxRootPgno) ){
danielk197787a6e732004-11-05 12:58:25 +00005394 maxRootPgno--;
5395 }
danielk1977599fcba2004-11-08 07:13:13 +00005396 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
5397
danielk1977aef0bf62005-12-30 16:28:01 +00005398 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00005399 }else{
5400 rc = freePage(pPage);
5401 releasePage(pPage);
5402 }
5403#endif
drh2aa679f2001-06-25 02:11:07 +00005404 }else{
danielk1977a0bf2652004-11-04 14:30:04 +00005405 /* If sqlite3BtreeDropTable was called on page 1. */
drha34b6762004-05-07 13:30:42 +00005406 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
danielk1977a0bf2652004-11-04 14:30:04 +00005407 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00005408 }
drh8b2f49b2001-06-08 00:21:52 +00005409 return rc;
5410}
5411
drh001bbcb2003-03-19 03:14:00 +00005412
drh8b2f49b2001-06-08 00:21:52 +00005413/*
drh23e11ca2004-05-04 17:27:28 +00005414** Read the meta-information out of a database file. Meta[0]
5415** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00005416** through meta[15] are available for use by higher layers. Meta[0]
5417** is read-only, the others are read/write.
5418**
5419** The schema layer numbers meta values differently. At the schema
5420** layer (and the SetCookie and ReadCookie opcodes) the number of
5421** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00005422*/
danielk1977aef0bf62005-12-30 16:28:01 +00005423int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00005424 int rc;
drh4b70f112004-05-02 21:12:19 +00005425 unsigned char *pP1;
danielk1977aef0bf62005-12-30 16:28:01 +00005426 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00005427
drh23e11ca2004-05-04 17:27:28 +00005428 assert( idx>=0 && idx<=15 );
drha34b6762004-05-07 13:30:42 +00005429 rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00005430 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00005431 *pMeta = get4byte(&pP1[36 + idx*4]);
drha34b6762004-05-07 13:30:42 +00005432 sqlite3pager_unref(pP1);
drhae157872004-08-14 19:20:09 +00005433
danielk1977599fcba2004-11-08 07:13:13 +00005434 /* If autovacuumed is disabled in this build but we are trying to
5435 ** access an autovacuumed database, then make the database readonly.
5436 */
danielk1977003ba062004-11-04 02:57:33 +00005437#ifdef SQLITE_OMIT_AUTOVACUUM
drhae157872004-08-14 19:20:09 +00005438 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
danielk1977003ba062004-11-04 02:57:33 +00005439#endif
drhae157872004-08-14 19:20:09 +00005440
drh8b2f49b2001-06-08 00:21:52 +00005441 return SQLITE_OK;
5442}
5443
5444/*
drh23e11ca2004-05-04 17:27:28 +00005445** Write meta-information back into the database. Meta[0] is
5446** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00005447*/
danielk1977aef0bf62005-12-30 16:28:01 +00005448int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
5449 BtShared *pBt = p->pBt;
drh4b70f112004-05-02 21:12:19 +00005450 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00005451 int rc;
drh23e11ca2004-05-04 17:27:28 +00005452 assert( idx>=1 && idx<=15 );
danielk1977aef0bf62005-12-30 16:28:01 +00005453 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005454 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00005455 }
drhde647132004-05-07 17:57:49 +00005456 assert( pBt->pPage1!=0 );
5457 pP1 = pBt->pPage1->aData;
drha34b6762004-05-07 13:30:42 +00005458 rc = sqlite3pager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00005459 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00005460 put4byte(&pP1[36 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00005461 return SQLITE_OK;
5462}
drh8c42ca92001-06-22 19:15:00 +00005463
drhf328bc82004-05-10 23:29:49 +00005464/*
5465** Return the flag byte at the beginning of the page that the cursor
5466** is currently pointing to.
5467*/
5468int sqlite3BtreeFlags(BtCursor *pCur){
5469 MemPage *pPage = pCur->pPage;
5470 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
5471}
5472
danielk1977b5402fb2005-01-12 07:15:04 +00005473#ifdef SQLITE_DEBUG
drh8c42ca92001-06-22 19:15:00 +00005474/*
5475** Print a disassembly of the given page on standard output. This routine
5476** is used for debugging and testing only.
5477*/
danielk1977aef0bf62005-12-30 16:28:01 +00005478static int btreePageDump(BtShared *pBt, int pgno, int recursive, MemPage *pParent){
drh8c42ca92001-06-22 19:15:00 +00005479 int rc;
5480 MemPage *pPage;
drhc8629a12004-05-08 20:07:40 +00005481 int i, j, c;
drh8c42ca92001-06-22 19:15:00 +00005482 int nFree;
5483 u16 idx;
drhab9f7f12004-05-08 10:56:11 +00005484 int hdr;
drh43605152004-05-29 21:46:49 +00005485 int nCell;
drha2fce642004-06-05 00:01:44 +00005486 int isInit;
drhab9f7f12004-05-08 10:56:11 +00005487 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00005488 char range[20];
5489 unsigned char payload[20];
drhab9f7f12004-05-08 10:56:11 +00005490
drh4b70f112004-05-02 21:12:19 +00005491 rc = getPage(pBt, (Pgno)pgno, &pPage);
drha2fce642004-06-05 00:01:44 +00005492 isInit = pPage->isInit;
5493 if( pPage->isInit==0 ){
danielk1977c7dc7532004-11-17 10:22:03 +00005494 initPage(pPage, pParent);
drha2fce642004-06-05 00:01:44 +00005495 }
drh8c42ca92001-06-22 19:15:00 +00005496 if( rc ){
5497 return rc;
5498 }
drhab9f7f12004-05-08 10:56:11 +00005499 hdr = pPage->hdrOffset;
5500 data = pPage->aData;
drhc8629a12004-05-08 20:07:40 +00005501 c = data[hdr];
drh8b18dd42004-05-12 19:18:15 +00005502 pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
drhc8629a12004-05-08 20:07:40 +00005503 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh8b18dd42004-05-12 19:18:15 +00005504 pPage->leafData = (c & PTF_LEAFDATA)!=0;
drhc8629a12004-05-08 20:07:40 +00005505 pPage->leaf = (c & PTF_LEAF)!=0;
drh8b18dd42004-05-12 19:18:15 +00005506 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
drh43605152004-05-29 21:46:49 +00005507 nCell = get2byte(&data[hdr+3]);
drhfe63d1c2004-09-08 20:13:04 +00005508 sqlite3DebugPrintf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno,
drh43605152004-05-29 21:46:49 +00005509 data[hdr], data[hdr+7],
drhda200cc2004-05-09 11:51:38 +00005510 (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
drhab9f7f12004-05-08 10:56:11 +00005511 assert( hdr == (pgno==1 ? 100 : 0) );
drh43605152004-05-29 21:46:49 +00005512 idx = hdr + 12 - pPage->leaf*4;
5513 for(i=0; i<nCell; i++){
drh6f11bef2004-05-13 01:12:56 +00005514 CellInfo info;
drh4b70f112004-05-02 21:12:19 +00005515 Pgno child;
drh43605152004-05-29 21:46:49 +00005516 unsigned char *pCell;
drh6f11bef2004-05-13 01:12:56 +00005517 int sz;
drh43605152004-05-29 21:46:49 +00005518 int addr;
drh6f11bef2004-05-13 01:12:56 +00005519
drh43605152004-05-29 21:46:49 +00005520 addr = get2byte(&data[idx + 2*i]);
5521 pCell = &data[addr];
5522 parseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00005523 sz = info.nSize;
drh43605152004-05-29 21:46:49 +00005524 sprintf(range,"%d..%d", addr, addr+sz-1);
drh4b70f112004-05-02 21:12:19 +00005525 if( pPage->leaf ){
5526 child = 0;
5527 }else{
drh43605152004-05-29 21:46:49 +00005528 child = get4byte(pCell);
drh4b70f112004-05-02 21:12:19 +00005529 }
drh6f11bef2004-05-13 01:12:56 +00005530 sz = info.nData;
5531 if( !pPage->intKey ) sz += info.nKey;
drh8c42ca92001-06-22 19:15:00 +00005532 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
drh6f11bef2004-05-13 01:12:56 +00005533 memcpy(payload, &pCell[info.nHeader], sz);
drh8c42ca92001-06-22 19:15:00 +00005534 for(j=0; j<sz; j++){
5535 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
5536 }
5537 payload[sz] = 0;
drhfe63d1c2004-09-08 20:13:04 +00005538 sqlite3DebugPrintf(
drh6f11bef2004-05-13 01:12:56 +00005539 "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n",
5540 i, range, child, info.nKey, info.nData, payload
drh8c42ca92001-06-22 19:15:00 +00005541 );
drh8c42ca92001-06-22 19:15:00 +00005542 }
drh4b70f112004-05-02 21:12:19 +00005543 if( !pPage->leaf ){
drhfe63d1c2004-09-08 20:13:04 +00005544 sqlite3DebugPrintf("right_child: %d\n", get4byte(&data[hdr+8]));
drh4b70f112004-05-02 21:12:19 +00005545 }
drh8c42ca92001-06-22 19:15:00 +00005546 nFree = 0;
5547 i = 0;
drhab9f7f12004-05-08 10:56:11 +00005548 idx = get2byte(&data[hdr+1]);
drhb6f41482004-05-14 01:58:11 +00005549 while( idx>0 && idx<pPage->pBt->usableSize ){
drhab9f7f12004-05-08 10:56:11 +00005550 int sz = get2byte(&data[idx+2]);
drh4b70f112004-05-02 21:12:19 +00005551 sprintf(range,"%d..%d", idx, idx+sz-1);
5552 nFree += sz;
drhfe63d1c2004-09-08 20:13:04 +00005553 sqlite3DebugPrintf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00005554 i, range, sz, nFree);
drhab9f7f12004-05-08 10:56:11 +00005555 idx = get2byte(&data[idx]);
drh2aa679f2001-06-25 02:11:07 +00005556 i++;
drh8c42ca92001-06-22 19:15:00 +00005557 }
5558 if( idx!=0 ){
drhfe63d1c2004-09-08 20:13:04 +00005559 sqlite3DebugPrintf("ERROR: next freeblock index out of range: %d\n", idx);
drh8c42ca92001-06-22 19:15:00 +00005560 }
drha34b6762004-05-07 13:30:42 +00005561 if( recursive && !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005562 for(i=0; i<nCell; i++){
5563 unsigned char *pCell = findCell(pPage, i);
danielk1977c7dc7532004-11-17 10:22:03 +00005564 btreePageDump(pBt, get4byte(pCell), 1, pPage);
drha34b6762004-05-07 13:30:42 +00005565 idx = get2byte(pCell);
drh6019e162001-07-02 17:51:45 +00005566 }
danielk1977c7dc7532004-11-17 10:22:03 +00005567 btreePageDump(pBt, get4byte(&data[hdr+8]), 1, pPage);
drh6019e162001-07-02 17:51:45 +00005568 }
drha2fce642004-06-05 00:01:44 +00005569 pPage->isInit = isInit;
drhab9f7f12004-05-08 10:56:11 +00005570 sqlite3pager_unref(data);
drh3644f082004-05-10 18:45:09 +00005571 fflush(stdout);
drh8c42ca92001-06-22 19:15:00 +00005572 return SQLITE_OK;
5573}
danielk1977aef0bf62005-12-30 16:28:01 +00005574int sqlite3BtreePageDump(Btree *p, int pgno, int recursive){
5575 return btreePageDump(p->pBt, pgno, recursive, 0);
danielk1977c7dc7532004-11-17 10:22:03 +00005576}
drhaaab5722002-02-19 13:39:21 +00005577#endif
drh8c42ca92001-06-22 19:15:00 +00005578
drhaaab5722002-02-19 13:39:21 +00005579#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00005580/*
drh2aa679f2001-06-25 02:11:07 +00005581** Fill aResult[] with information about the entry and page that the
5582** cursor is pointing to.
5583**
5584** aResult[0] = The page number
5585** aResult[1] = The entry number
5586** aResult[2] = Total number of entries on this page
drh3e27c022004-07-23 00:01:38 +00005587** aResult[3] = Cell size (local payload + header)
drh2aa679f2001-06-25 02:11:07 +00005588** aResult[4] = Number of free bytes on this page
5589** aResult[5] = Number of free blocks on the page
drh3e27c022004-07-23 00:01:38 +00005590** aResult[6] = Total payload size (local + overflow)
5591** aResult[7] = Header size in bytes
5592** aResult[8] = Local payload size
5593** aResult[9] = Parent page number
drh5eddca62001-06-30 21:53:53 +00005594**
5595** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00005596*/
drh3e27c022004-07-23 00:01:38 +00005597int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult, int upCnt){
drh2aa679f2001-06-25 02:11:07 +00005598 int cnt, idx;
5599 MemPage *pPage = pCur->pPage;
drh3e27c022004-07-23 00:01:38 +00005600 BtCursor tmpCur;
drhda200cc2004-05-09 11:51:38 +00005601
5602 pageIntegrity(pPage);
drh4b70f112004-05-02 21:12:19 +00005603 assert( pPage->isInit );
drh3e27c022004-07-23 00:01:38 +00005604 getTempCursor(pCur, &tmpCur);
5605 while( upCnt-- ){
5606 moveToParent(&tmpCur);
5607 }
5608 pPage = tmpCur.pPage;
5609 pageIntegrity(pPage);
drha34b6762004-05-07 13:30:42 +00005610 aResult[0] = sqlite3pager_pagenumber(pPage->aData);
drh91025292004-05-03 19:49:32 +00005611 assert( aResult[0]==pPage->pgno );
drh3e27c022004-07-23 00:01:38 +00005612 aResult[1] = tmpCur.idx;
drh2aa679f2001-06-25 02:11:07 +00005613 aResult[2] = pPage->nCell;
drh3e27c022004-07-23 00:01:38 +00005614 if( tmpCur.idx>=0 && tmpCur.idx<pPage->nCell ){
5615 getCellInfo(&tmpCur);
5616 aResult[3] = tmpCur.info.nSize;
5617 aResult[6] = tmpCur.info.nData;
5618 aResult[7] = tmpCur.info.nHeader;
5619 aResult[8] = tmpCur.info.nLocal;
drh2aa679f2001-06-25 02:11:07 +00005620 }else{
5621 aResult[3] = 0;
5622 aResult[6] = 0;
drh3e27c022004-07-23 00:01:38 +00005623 aResult[7] = 0;
5624 aResult[8] = 0;
drh2aa679f2001-06-25 02:11:07 +00005625 }
5626 aResult[4] = pPage->nFree;
5627 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00005628 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhb6f41482004-05-14 01:58:11 +00005629 while( idx>0 && idx<pPage->pBt->usableSize ){
drh2aa679f2001-06-25 02:11:07 +00005630 cnt++;
drh4b70f112004-05-02 21:12:19 +00005631 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00005632 }
5633 aResult[5] = cnt;
drh3e27c022004-07-23 00:01:38 +00005634 if( pPage->pParent==0 || isRootPage(pPage) ){
5635 aResult[9] = 0;
5636 }else{
5637 aResult[9] = pPage->pParent->pgno;
5638 }
5639 releaseTempCursor(&tmpCur);
drh8c42ca92001-06-22 19:15:00 +00005640 return SQLITE_OK;
5641}
drhaaab5722002-02-19 13:39:21 +00005642#endif
drhdd793422001-06-28 01:54:48 +00005643
drhdd793422001-06-28 01:54:48 +00005644/*
drh5eddca62001-06-30 21:53:53 +00005645** Return the pager associated with a BTree. This routine is used for
5646** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00005647*/
danielk1977aef0bf62005-12-30 16:28:01 +00005648Pager *sqlite3BtreePager(Btree *p){
5649 return p->pBt->pPager;
drhdd793422001-06-28 01:54:48 +00005650}
drh5eddca62001-06-30 21:53:53 +00005651
5652/*
5653** This structure is passed around through all the sanity checking routines
5654** in order to keep track of some global state information.
5655*/
drhaaab5722002-02-19 13:39:21 +00005656typedef struct IntegrityCk IntegrityCk;
5657struct IntegrityCk {
danielk1977aef0bf62005-12-30 16:28:01 +00005658 BtShared *pBt; /* The tree being checked out */
drh100569d2001-10-02 13:01:48 +00005659 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
5660 int nPage; /* Number of pages in the database */
5661 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00005662 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00005663};
5664
drhb7f91642004-10-31 02:22:47 +00005665#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00005666/*
5667** Append a message to the error message string.
5668*/
drh2e38c322004-09-03 18:38:44 +00005669static void checkAppendMsg(
5670 IntegrityCk *pCheck,
5671 char *zMsg1,
5672 const char *zFormat,
5673 ...
5674){
5675 va_list ap;
5676 char *zMsg2;
5677 va_start(ap, zFormat);
5678 zMsg2 = sqlite3VMPrintf(zFormat, ap);
5679 va_end(ap);
5680 if( zMsg1==0 ) zMsg1 = "";
drh5eddca62001-06-30 21:53:53 +00005681 if( pCheck->zErrMsg ){
5682 char *zOld = pCheck->zErrMsg;
5683 pCheck->zErrMsg = 0;
danielk19774adee202004-05-08 08:23:19 +00005684 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00005685 sqliteFree(zOld);
5686 }else{
danielk19774adee202004-05-08 08:23:19 +00005687 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00005688 }
drh2e38c322004-09-03 18:38:44 +00005689 sqliteFree(zMsg2);
drh5eddca62001-06-30 21:53:53 +00005690}
drhb7f91642004-10-31 02:22:47 +00005691#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00005692
drhb7f91642004-10-31 02:22:47 +00005693#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00005694/*
5695** Add 1 to the reference count for page iPage. If this is the second
5696** reference to the page, add an error message to pCheck->zErrMsg.
5697** Return 1 if there are 2 ore more references to the page and 0 if
5698** if this is the first reference to the page.
5699**
5700** Also check that the page number is in bounds.
5701*/
drhaaab5722002-02-19 13:39:21 +00005702static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00005703 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00005704 if( iPage>pCheck->nPage || iPage<0 ){
drh2e38c322004-09-03 18:38:44 +00005705 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
drh5eddca62001-06-30 21:53:53 +00005706 return 1;
5707 }
5708 if( pCheck->anRef[iPage]==1 ){
drh2e38c322004-09-03 18:38:44 +00005709 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00005710 return 1;
5711 }
5712 return (pCheck->anRef[iPage]++)>1;
5713}
5714
danielk1977afcdd022004-10-31 16:25:42 +00005715#ifndef SQLITE_OMIT_AUTOVACUUM
5716/*
5717** Check that the entry in the pointer-map for page iChild maps to
5718** page iParent, pointer type ptrType. If not, append an error message
5719** to pCheck.
5720*/
5721static void checkPtrmap(
5722 IntegrityCk *pCheck, /* Integrity check context */
5723 Pgno iChild, /* Child page number */
5724 u8 eType, /* Expected pointer map type */
5725 Pgno iParent, /* Expected pointer map parent page number */
5726 char *zContext /* Context description (used for error msg) */
5727){
5728 int rc;
5729 u8 ePtrmapType;
5730 Pgno iPtrmapParent;
5731
5732 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
5733 if( rc!=SQLITE_OK ){
5734 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
5735 return;
5736 }
5737
5738 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
5739 checkAppendMsg(pCheck, zContext,
5740 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
5741 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
5742 }
5743}
5744#endif
5745
drh5eddca62001-06-30 21:53:53 +00005746/*
5747** Check the integrity of the freelist or of an overflow page list.
5748** Verify that the number of pages on the list is N.
5749*/
drh30e58752002-03-02 20:41:57 +00005750static void checkList(
5751 IntegrityCk *pCheck, /* Integrity checking context */
5752 int isFreeList, /* True for a freelist. False for overflow page list */
5753 int iPage, /* Page number for first page in the list */
5754 int N, /* Expected number of pages in the list */
5755 char *zContext /* Context for error messages */
5756){
5757 int i;
drh3a4c1412004-05-09 20:40:11 +00005758 int expected = N;
5759 int iFirst = iPage;
drh30e58752002-03-02 20:41:57 +00005760 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00005761 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00005762 if( iPage<1 ){
drh2e38c322004-09-03 18:38:44 +00005763 checkAppendMsg(pCheck, zContext,
5764 "%d of %d pages missing from overflow list starting at %d",
drh3a4c1412004-05-09 20:40:11 +00005765 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00005766 break;
5767 }
5768 if( checkRef(pCheck, iPage, zContext) ) break;
drha34b6762004-05-07 13:30:42 +00005769 if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
drh2e38c322004-09-03 18:38:44 +00005770 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00005771 break;
5772 }
drh30e58752002-03-02 20:41:57 +00005773 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00005774 int n = get4byte(&pOvfl[4]);
danielk1977687566d2004-11-02 12:56:41 +00005775#ifndef SQLITE_OMIT_AUTOVACUUM
5776 if( pCheck->pBt->autoVacuum ){
5777 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
5778 }
5779#endif
drh855eb1c2004-08-31 13:45:11 +00005780 if( n>pCheck->pBt->usableSize/4-8 ){
drh2e38c322004-09-03 18:38:44 +00005781 checkAppendMsg(pCheck, zContext,
5782 "freelist leaf count too big on page %d", iPage);
drhee696e22004-08-30 16:52:17 +00005783 N--;
5784 }else{
5785 for(i=0; i<n; i++){
danielk1977687566d2004-11-02 12:56:41 +00005786 Pgno iFreePage = get4byte(&pOvfl[8+i*4]);
5787#ifndef SQLITE_OMIT_AUTOVACUUM
5788 if( pCheck->pBt->autoVacuum ){
5789 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
5790 }
5791#endif
5792 checkRef(pCheck, iFreePage, zContext);
drhee696e22004-08-30 16:52:17 +00005793 }
5794 N -= n;
drh30e58752002-03-02 20:41:57 +00005795 }
drh30e58752002-03-02 20:41:57 +00005796 }
danielk1977afcdd022004-10-31 16:25:42 +00005797#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00005798 else{
5799 /* If this database supports auto-vacuum and iPage is not the last
5800 ** page in this overflow list, check that the pointer-map entry for
5801 ** the following page matches iPage.
5802 */
5803 if( pCheck->pBt->autoVacuum && N>0 ){
5804 i = get4byte(pOvfl);
5805 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
5806 }
danielk1977afcdd022004-10-31 16:25:42 +00005807 }
5808#endif
drh4b70f112004-05-02 21:12:19 +00005809 iPage = get4byte(pOvfl);
drha34b6762004-05-07 13:30:42 +00005810 sqlite3pager_unref(pOvfl);
drh5eddca62001-06-30 21:53:53 +00005811 }
5812}
drhb7f91642004-10-31 02:22:47 +00005813#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00005814
drhb7f91642004-10-31 02:22:47 +00005815#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00005816/*
5817** Do various sanity checks on a single page of a tree. Return
5818** the tree depth. Root pages return 0. Parents of root pages
5819** return 1, and so forth.
5820**
5821** These checks are done:
5822**
5823** 1. Make sure that cells and freeblocks do not overlap
5824** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00005825** NO 2. Make sure cell keys are in order.
5826** NO 3. Make sure no key is less than or equal to zLowerBound.
5827** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00005828** 5. Check the integrity of overflow pages.
5829** 6. Recursively call checkTreePage on all children.
5830** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00005831** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00005832** the root of the tree.
5833*/
5834static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00005835 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00005836 int iPage, /* Page number of the page to check */
5837 MemPage *pParent, /* Parent page */
5838 char *zParentContext, /* Parent context */
5839 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00005840 int nLower, /* Number of characters in zLowerBound */
5841 char *zUpperBound, /* All keys should be less than this, if not NULL */
5842 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00005843){
5844 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00005845 int i, rc, depth, d2, pgno, cnt;
drh43605152004-05-29 21:46:49 +00005846 int hdr, cellStart;
5847 int nCell;
drhda200cc2004-05-09 11:51:38 +00005848 u8 *data;
danielk1977aef0bf62005-12-30 16:28:01 +00005849 BtShared *pBt;
drh4f26bb62005-09-08 14:17:20 +00005850 int usableSize;
drh5eddca62001-06-30 21:53:53 +00005851 char zContext[100];
drh2e38c322004-09-03 18:38:44 +00005852 char *hit;
drh5eddca62001-06-30 21:53:53 +00005853
danielk1977ef73ee92004-11-06 12:26:07 +00005854 sprintf(zContext, "Page %d: ", iPage);
5855
drh5eddca62001-06-30 21:53:53 +00005856 /* Check that the page exists
5857 */
drhd9cb6ac2005-10-20 07:28:17 +00005858 pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00005859 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00005860 if( iPage==0 ) return 0;
5861 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh4b70f112004-05-02 21:12:19 +00005862 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh2e38c322004-09-03 18:38:44 +00005863 checkAppendMsg(pCheck, zContext,
5864 "unable to get the page. error code=%d", rc);
drh5eddca62001-06-30 21:53:53 +00005865 return 0;
5866 }
drh4b70f112004-05-02 21:12:19 +00005867 if( (rc = initPage(pPage, pParent))!=0 ){
drh2e38c322004-09-03 18:38:44 +00005868 checkAppendMsg(pCheck, zContext, "initPage() returns error code %d", rc);
drh91025292004-05-03 19:49:32 +00005869 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00005870 return 0;
5871 }
5872
5873 /* Check out all the cells.
5874 */
5875 depth = 0;
drh5eddca62001-06-30 21:53:53 +00005876 for(i=0; i<pPage->nCell; i++){
drh6f11bef2004-05-13 01:12:56 +00005877 u8 *pCell;
5878 int sz;
5879 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00005880
5881 /* Check payload overflow pages
5882 */
drh3a4c1412004-05-09 20:40:11 +00005883 sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
drh43605152004-05-29 21:46:49 +00005884 pCell = findCell(pPage,i);
5885 parseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00005886 sz = info.nData;
5887 if( !pPage->intKey ) sz += info.nKey;
5888 if( sz>info.nLocal ){
drhb6f41482004-05-14 01:58:11 +00005889 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
danielk1977afcdd022004-10-31 16:25:42 +00005890 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
5891#ifndef SQLITE_OMIT_AUTOVACUUM
5892 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00005893 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
danielk1977afcdd022004-10-31 16:25:42 +00005894 }
5895#endif
5896 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00005897 }
5898
5899 /* Check sanity of left child page.
5900 */
drhda200cc2004-05-09 11:51:38 +00005901 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005902 pgno = get4byte(pCell);
danielk1977afcdd022004-10-31 16:25:42 +00005903#ifndef SQLITE_OMIT_AUTOVACUUM
5904 if( pBt->autoVacuum ){
5905 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
5906 }
5907#endif
drhda200cc2004-05-09 11:51:38 +00005908 d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
5909 if( i>0 && d2!=depth ){
5910 checkAppendMsg(pCheck, zContext, "Child page depth differs");
5911 }
5912 depth = d2;
drh5eddca62001-06-30 21:53:53 +00005913 }
drh5eddca62001-06-30 21:53:53 +00005914 }
drhda200cc2004-05-09 11:51:38 +00005915 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005916 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drhda200cc2004-05-09 11:51:38 +00005917 sprintf(zContext, "On page %d at right child: ", iPage);
danielk1977afcdd022004-10-31 16:25:42 +00005918#ifndef SQLITE_OMIT_AUTOVACUUM
5919 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00005920 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
danielk1977afcdd022004-10-31 16:25:42 +00005921 }
5922#endif
drhda200cc2004-05-09 11:51:38 +00005923 checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
5924 }
drh5eddca62001-06-30 21:53:53 +00005925
5926 /* Check for complete coverage of the page
5927 */
drhda200cc2004-05-09 11:51:38 +00005928 data = pPage->aData;
5929 hdr = pPage->hdrOffset;
drh2e38c322004-09-03 18:38:44 +00005930 hit = sqliteMalloc( usableSize );
5931 if( hit ){
5932 memset(hit, 1, get2byte(&data[hdr+5]));
5933 nCell = get2byte(&data[hdr+3]);
5934 cellStart = hdr + 12 - 4*pPage->leaf;
5935 for(i=0; i<nCell; i++){
5936 int pc = get2byte(&data[cellStart+i*2]);
5937 int size = cellSizePtr(pPage, &data[pc]);
5938 int j;
danielk19777701e812005-01-10 12:59:51 +00005939 if( (pc+size-1)>=usableSize || pc<0 ){
5940 checkAppendMsg(pCheck, 0,
5941 "Corruption detected in cell %d on page %d",i,iPage,0);
5942 }else{
5943 for(j=pc+size-1; j>=pc; j--) hit[j]++;
5944 }
drh2e38c322004-09-03 18:38:44 +00005945 }
5946 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
5947 cnt++){
5948 int size = get2byte(&data[i+2]);
5949 int j;
danielk19777701e812005-01-10 12:59:51 +00005950 if( (i+size-1)>=usableSize || i<0 ){
5951 checkAppendMsg(pCheck, 0,
5952 "Corruption detected in cell %d on page %d",i,iPage,0);
5953 }else{
5954 for(j=i+size-1; j>=i; j--) hit[j]++;
5955 }
drh2e38c322004-09-03 18:38:44 +00005956 i = get2byte(&data[i]);
5957 }
5958 for(i=cnt=0; i<usableSize; i++){
5959 if( hit[i]==0 ){
5960 cnt++;
5961 }else if( hit[i]>1 ){
5962 checkAppendMsg(pCheck, 0,
5963 "Multiple uses for byte %d of page %d", i, iPage);
5964 break;
5965 }
5966 }
5967 if( cnt!=data[hdr+7] ){
5968 checkAppendMsg(pCheck, 0,
5969 "Fragmented space is %d byte reported as %d on page %d",
5970 cnt, data[hdr+7], iPage);
drh5eddca62001-06-30 21:53:53 +00005971 }
5972 }
drh2e38c322004-09-03 18:38:44 +00005973 sqliteFree(hit);
drh6019e162001-07-02 17:51:45 +00005974
drh4b70f112004-05-02 21:12:19 +00005975 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00005976 return depth+1;
drh5eddca62001-06-30 21:53:53 +00005977}
drhb7f91642004-10-31 02:22:47 +00005978#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00005979
drhb7f91642004-10-31 02:22:47 +00005980#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00005981/*
5982** This routine does a complete check of the given BTree file. aRoot[] is
5983** an array of pages numbers were each page number is the root page of
5984** a table. nRoot is the number of entries in aRoot.
5985**
5986** If everything checks out, this routine returns NULL. If something is
5987** amiss, an error message is written into memory obtained from malloc()
5988** and a pointer to that error message is returned. The calling function
5989** is responsible for freeing the error message when it is done.
5990*/
danielk1977aef0bf62005-12-30 16:28:01 +00005991char *sqlite3BtreeIntegrityCheck(Btree *p, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00005992 int i;
5993 int nRef;
drhaaab5722002-02-19 13:39:21 +00005994 IntegrityCk sCheck;
danielk1977aef0bf62005-12-30 16:28:01 +00005995 BtShared *pBt = p->pBt;
drh5eddca62001-06-30 21:53:53 +00005996
drha34b6762004-05-07 13:30:42 +00005997 nRef = *sqlite3pager_stats(pBt->pPager);
danielk1977aef0bf62005-12-30 16:28:01 +00005998 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
drhefc251d2001-07-01 22:12:01 +00005999 return sqliteStrDup("Unable to acquire a read lock on the database");
6000 }
drh5eddca62001-06-30 21:53:53 +00006001 sCheck.pBt = pBt;
6002 sCheck.pPager = pBt->pPager;
drha34b6762004-05-07 13:30:42 +00006003 sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00006004 if( sCheck.nPage==0 ){
6005 unlockBtreeIfUnused(pBt);
6006 return 0;
6007 }
drh8c1238a2003-01-02 14:43:55 +00006008 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
danielk1977ac245ec2005-01-14 13:50:11 +00006009 if( !sCheck.anRef ){
6010 unlockBtreeIfUnused(pBt);
6011 return sqlite3MPrintf("Unable to malloc %d bytes",
6012 (sCheck.nPage+1)*sizeof(sCheck.anRef[0]));
6013 }
drhda200cc2004-05-09 11:51:38 +00006014 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh42cac6d2004-11-20 20:31:11 +00006015 i = PENDING_BYTE_PAGE(pBt);
drh1f595712004-06-15 01:40:29 +00006016 if( i<=sCheck.nPage ){
6017 sCheck.anRef[i] = 1;
6018 }
drh5eddca62001-06-30 21:53:53 +00006019 sCheck.zErrMsg = 0;
6020
6021 /* Check the integrity of the freelist
6022 */
drha34b6762004-05-07 13:30:42 +00006023 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6024 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00006025
6026 /* Check all the tables.
6027 */
6028 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00006029 if( aRoot[i]==0 ) continue;
danielk1977687566d2004-11-02 12:56:41 +00006030#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006031 if( pBt->autoVacuum && aRoot[i]>1 ){
6032 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6033 }
6034#endif
drh1bffb9c2002-02-03 17:37:36 +00006035 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00006036 }
6037
6038 /* Make sure every page in the file is referenced
6039 */
6040 for(i=1; i<=sCheck.nPage; i++){
danielk1977afcdd022004-10-31 16:25:42 +00006041#ifdef SQLITE_OMIT_AUTOVACUUM
drh5eddca62001-06-30 21:53:53 +00006042 if( sCheck.anRef[i]==0 ){
drh2e38c322004-09-03 18:38:44 +00006043 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
drh5eddca62001-06-30 21:53:53 +00006044 }
danielk1977afcdd022004-10-31 16:25:42 +00006045#else
6046 /* If the database supports auto-vacuum, make sure no tables contain
6047 ** references to pointer-map pages.
6048 */
6049 if( sCheck.anRef[i]==0 &&
drh42cac6d2004-11-20 20:31:11 +00006050 (PTRMAP_PAGENO(pBt->usableSize, i)!=i || !pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006051 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6052 }
6053 if( sCheck.anRef[i]!=0 &&
drh42cac6d2004-11-20 20:31:11 +00006054 (PTRMAP_PAGENO(pBt->usableSize, i)==i && pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006055 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6056 }
6057#endif
drh5eddca62001-06-30 21:53:53 +00006058 }
6059
6060 /* Make sure this analysis did not leave any unref() pages
6061 */
drh5e00f6c2001-09-13 13:46:56 +00006062 unlockBtreeIfUnused(pBt);
drha34b6762004-05-07 13:30:42 +00006063 if( nRef != *sqlite3pager_stats(pBt->pPager) ){
drh2e38c322004-09-03 18:38:44 +00006064 checkAppendMsg(&sCheck, 0,
drh5eddca62001-06-30 21:53:53 +00006065 "Outstanding page count goes from %d to %d during this analysis",
drha34b6762004-05-07 13:30:42 +00006066 nRef, *sqlite3pager_stats(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00006067 );
drh5eddca62001-06-30 21:53:53 +00006068 }
6069
6070 /* Clean up and report errors.
6071 */
6072 sqliteFree(sCheck.anRef);
6073 return sCheck.zErrMsg;
6074}
drhb7f91642004-10-31 02:22:47 +00006075#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
paulb95a8862003-04-01 21:16:41 +00006076
drh73509ee2003-04-06 20:44:45 +00006077/*
6078** Return the full pathname of the underlying database file.
6079*/
danielk1977aef0bf62005-12-30 16:28:01 +00006080const char *sqlite3BtreeGetFilename(Btree *p){
6081 assert( p->pBt->pPager!=0 );
6082 return sqlite3pager_filename(p->pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00006083}
6084
6085/*
danielk19775865e3d2004-06-14 06:03:57 +00006086** Return the pathname of the directory that contains the database file.
6087*/
danielk1977aef0bf62005-12-30 16:28:01 +00006088const char *sqlite3BtreeGetDirname(Btree *p){
6089 assert( p->pBt->pPager!=0 );
6090 return sqlite3pager_dirname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00006091}
6092
6093/*
6094** Return the pathname of the journal file for this database. The return
6095** value of this routine is the same regardless of whether the journal file
6096** has been created or not.
6097*/
danielk1977aef0bf62005-12-30 16:28:01 +00006098const char *sqlite3BtreeGetJournalname(Btree *p){
6099 assert( p->pBt->pPager!=0 );
6100 return sqlite3pager_journalname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00006101}
6102
drhb7f91642004-10-31 02:22:47 +00006103#ifndef SQLITE_OMIT_VACUUM
danielk19775865e3d2004-06-14 06:03:57 +00006104/*
drhf7c57532003-04-25 13:22:51 +00006105** Copy the complete content of pBtFrom into pBtTo. A transaction
6106** must be active for both files.
6107**
6108** The size of file pBtFrom may be reduced by this operation.
drh43605152004-05-29 21:46:49 +00006109** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00006110*/
danielk1977aef0bf62005-12-30 16:28:01 +00006111int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
drhf7c57532003-04-25 13:22:51 +00006112 int rc = SQLITE_OK;
drh50f2f432005-09-16 11:32:18 +00006113 Pgno i, nPage, nToPage, iSkip;
drhf7c57532003-04-25 13:22:51 +00006114
danielk1977aef0bf62005-12-30 16:28:01 +00006115 BtShared *pBtTo = pTo->pBt;
6116 BtShared *pBtFrom = pFrom->pBt;
6117
6118 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
danielk1977ee5741e2004-05-31 10:01:34 +00006119 return SQLITE_ERROR;
6120 }
drhf7c57532003-04-25 13:22:51 +00006121 if( pBtTo->pCursor ) return SQLITE_BUSY;
drha34b6762004-05-07 13:30:42 +00006122 nToPage = sqlite3pager_pagecount(pBtTo->pPager);
6123 nPage = sqlite3pager_pagecount(pBtFrom->pPager);
drh50f2f432005-09-16 11:32:18 +00006124 iSkip = PENDING_BYTE_PAGE(pBtTo);
danielk1977369f27e2004-06-15 11:40:04 +00006125 for(i=1; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00006126 void *pPage;
drh50f2f432005-09-16 11:32:18 +00006127 if( i==iSkip ) continue;
drha34b6762004-05-07 13:30:42 +00006128 rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
drhf7c57532003-04-25 13:22:51 +00006129 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00006130 rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
drh2e6d11b2003-04-25 15:37:57 +00006131 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00006132 sqlite3pager_unref(pPage);
drhf7c57532003-04-25 13:22:51 +00006133 }
drh2e6d11b2003-04-25 15:37:57 +00006134 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
6135 void *pPage;
drh49285702005-09-17 15:20:26 +00006136 if( i==iSkip ) continue;
drha34b6762004-05-07 13:30:42 +00006137 rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
drh2e6d11b2003-04-25 15:37:57 +00006138 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00006139 rc = sqlite3pager_write(pPage);
6140 sqlite3pager_unref(pPage);
6141 sqlite3pager_dont_write(pBtTo->pPager, i);
drh2e6d11b2003-04-25 15:37:57 +00006142 }
6143 if( !rc && nPage<nToPage ){
drha34b6762004-05-07 13:30:42 +00006144 rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
drh2e6d11b2003-04-25 15:37:57 +00006145 }
drhf7c57532003-04-25 13:22:51 +00006146 if( rc ){
danielk1977aef0bf62005-12-30 16:28:01 +00006147 sqlite3BtreeRollback(pTo);
drhf7c57532003-04-25 13:22:51 +00006148 }
6149 return rc;
drh73509ee2003-04-06 20:44:45 +00006150}
drhb7f91642004-10-31 02:22:47 +00006151#endif /* SQLITE_OMIT_VACUUM */
danielk19771d850a72004-05-31 08:26:49 +00006152
6153/*
6154** Return non-zero if a transaction is active.
6155*/
danielk1977aef0bf62005-12-30 16:28:01 +00006156int sqlite3BtreeIsInTrans(Btree *p){
6157 return (p && (p->inTrans==TRANS_WRITE));
danielk19771d850a72004-05-31 08:26:49 +00006158}
6159
6160/*
6161** Return non-zero if a statement transaction is active.
6162*/
danielk1977aef0bf62005-12-30 16:28:01 +00006163int sqlite3BtreeIsInStmt(Btree *p){
6164 return (p->pBt && p->pBt->inStmt);
danielk19771d850a72004-05-31 08:26:49 +00006165}
danielk197713adf8a2004-06-03 16:08:41 +00006166
6167/*
6168** This call is a no-op if no write-transaction is currently active on pBt.
6169**
6170** Otherwise, sync the database file for the btree pBt. zMaster points to
6171** the name of a master journal file that should be written into the
6172** individual journal file, or is NULL, indicating no master journal file
6173** (single database transaction).
6174**
6175** When this is called, the master journal should already have been
6176** created, populated with this journal pointer and synced to disk.
6177**
6178** Once this is routine has returned, the only thing required to commit
6179** the write-transaction for this database file is to delete the journal.
6180*/
danielk1977aef0bf62005-12-30 16:28:01 +00006181int sqlite3BtreeSync(Btree *p, const char *zMaster){
6182 if( p->inTrans==TRANS_WRITE ){
6183 BtShared *pBt = p->pBt;
danielk1977687566d2004-11-02 12:56:41 +00006184#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977d761c0c2004-11-05 16:37:02 +00006185 Pgno nTrunc = 0;
danielk1977687566d2004-11-02 12:56:41 +00006186 if( pBt->autoVacuum ){
danielk1977d761c0c2004-11-05 16:37:02 +00006187 int rc = autoVacuumCommit(pBt, &nTrunc);
danielk1977687566d2004-11-02 12:56:41 +00006188 if( rc!=SQLITE_OK ) return rc;
6189 }
danielk1977d761c0c2004-11-05 16:37:02 +00006190 return sqlite3pager_sync(pBt->pPager, zMaster, nTrunc);
danielk1977687566d2004-11-02 12:56:41 +00006191#endif
danielk1977d761c0c2004-11-05 16:37:02 +00006192 return sqlite3pager_sync(pBt->pPager, zMaster, 0);
danielk197713adf8a2004-06-03 16:08:41 +00006193 }
6194 return SQLITE_OK;
6195}
danielk1977aef0bf62005-12-30 16:28:01 +00006196
6197#ifndef SQLITE_OMIT_SHARED_CACHE
6198/*
6199** Enable the shared pager and schema features.
6200*/
6201int sqlite3_enable_shared_cache(int enable){
6202 SqliteTsd *pTsd = sqlite3Tsd();
6203 if( pTsd->pPager ){
6204 return SQLITE_MISUSE;
6205 }
6206 pTsd->useSharedData = enable;
6207 return SQLITE_OK;
6208}
6209#endif
6210