blob: 54e13ad082f2c51b9452fc24703558f0abc38679 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drh6d2fb152004-05-14 16:50:06 +000012** $Id: btree.c,v 1.137 2004/05/14 16:50:06 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
56** page contain a special header that describes the file. The format
57** of that header is as follows:
58**
59** OFFSET SIZE DESCRIPTION
drhde647132004-05-07 17:57:49 +000060** 0 16 Header string: "SQLite format 3\000"
drh9e572e62004-04-23 23:43:10 +000061** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
drh6f11bef2004-05-13 01:12:56 +000064** 20 1 Bytes of unused space at the end of each page
65** 21 1 Max embedded payload fraction
66** 22 1 Min embedded payload fraction
67** 23 1 Min leaf payload fraction
68** 24 4 File change counter
69** 28 4 Reserved for future use
drh9e572e62004-04-23 23:43:10 +000070** 32 4 First freelist page
71** 36 4 Number of freelist pages in the file
72** 40 60 15 4-byte meta values passed to higher layers
73**
74** All of the integer values are big-endian (most significant byte first).
drh6f11bef2004-05-13 01:12:56 +000075**
76** The file change counter is incremented every time the database is more
77** than once within the same second. This counter, together with the
78** modification time of the file, allows other processes to know
79** when the file has changed and thus when they need to flush their
80** cache.
81**
82** The max embedded payload fraction is the amount of the total usable
83** space in a page that can be consumed by a single cell for standard
84** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
85** is to limit the maximum cell size so that at least 4 cells will fit
86** on one pages. Thus the default max embedded payload fraction is 64.
87**
88** If the payload for a cell is larger than the max payload, then extra
89** payload is spilled to overflow pages. Once an overflow page is allocated,
90** as many bytes as possible are moved into the overflow pages without letting
91** the cell size drop below the min embedded payload fraction.
92**
93** The min leaf payload fraction is like the min embedded payload fraction
94** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
95** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
96** not specified in the header.
drh9e572e62004-04-23 23:43:10 +000097**
98** Each btree page begins with a header described below. Note that the
99** header for page one begins at byte 100. For all other btree pages, the
100** header begins on byte zero.
101**
102** OFFSET SIZE DESCRIPTION
drh6f11bef2004-05-13 01:12:56 +0000103** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
drh9e572e62004-04-23 23:43:10 +0000104** 1 2 byte offset to the first freeblock
105** 3 2 byte offset to the first cell
106** 5 1 number of fragmented free bytes
107** 6 4 Right child (the Ptr(N+1) value). Omitted if leaf
108**
109** The flags define the format of this btree page. The leaf flag means that
110** this page has no children. The zerodata flag means that this page carries
111** only keys and no data. The intkey flag means that the key is a single
112** variable length integer at the beginning of the payload.
113**
114** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
115** byte are used. The integer consists of all bytes that have bit 8 set and
drh6f11bef2004-05-13 01:12:56 +0000116** the first byte with bit 8 clear. The most significant byte of the integer
117** appears first.
drh9e572e62004-04-23 23:43:10 +0000118**
119** 0x00 becomes 0x00000000
drh6f11bef2004-05-13 01:12:56 +0000120** 0x7f becomes 0x0000007f
121** 0x81 0x00 becomes 0x00000080
122** 0x82 0x00 becomes 0x00000100
123** 0x80 0x7f becomes 0x0000007f
124** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678
drh9e572e62004-04-23 23:43:10 +0000125** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
126**
127** Variable length integers are used for rowids and to hold the number of
128** bytes of key and data in a btree cell.
129**
130** Unused space within a btree page is collected into a linked list of
131** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
132** to the first freeblock is given in the header. Freeblocks occur in
133** increasing order. Because a freeblock is 4 bytes in size, the minimum
134** size allocation on a btree page is 4 bytes. Because a freeblock must be
135** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
136** exist on the freeblock chain. The total number of such fragmented bytes
137** is recorded in the page header at offset 5.
138**
139** SIZE DESCRIPTION
140** 2 Byte offset of the next freeblock
141** 2 Bytes in this freeblock
142**
143** Cells are of variable length. The first cell begins on the byte defined
144** in the page header. Cells do not necessarily occur in order - they can
145** skip around on the page.
146**
147** SIZE DESCRIPTION
148** 2 Byte offset of the next cell. 0 if this is the last cell
drh3aac2dd2004-04-26 14:10:20 +0000149** 4 Page number of the left child. Omitted if leaf flag is set.
150** var Number of bytes of data. Omitted if the zerodata flag is set.
151** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000152** * Payload
153** 4 First page of the overflow chain. Omitted if no overflow
154**
155** Overflow pages form a linked list. Each page except the last is completely
156** filled with data (pagesize - 4 bytes). The last page can have as little
157** as 1 byte of data.
158**
159** SIZE DESCRIPTION
160** 4 Page number of next overflow page
161** * Data
162**
163** Freelist pages come in two subtypes: trunk pages and leaf pages. The
164** file header points to first in a linked list of trunk page. Each trunk
165** page points to multiple leaf pages. The content of a leaf page is
166** unspecified. A trunk page looks like this:
167**
168** SIZE DESCRIPTION
169** 4 Page number of next trunk page
170** 4 Number of leaf pointers on this page
171** * zero or more pages numbers of leaves
drha059ad02001-04-17 20:09:11 +0000172*/
173#include "sqliteInt.h"
174#include "pager.h"
175#include "btree.h"
176#include <assert.h>
177
drh4b70f112004-05-02 21:12:19 +0000178
179/* Maximum page size. The upper bound on this value is 65536 (a limit
180** imposed by the 2-byte offset at the beginning of each cell.) The
181** maximum page size determines the amount of stack space allocated
182** by many of the routines in this module. On embedded architectures
183** or any machine where memory and especially stack memory is limited,
184** one may wish to chose a smaller value for the maximum page size.
185*/
186#ifndef MX_PAGE_SIZE
187# define MX_PAGE_SIZE 1024
188#endif
189
drh4b70f112004-05-02 21:12:19 +0000190/* The following value is the maximum cell size assuming a maximum page
191** size give above.
192*/
drh6f11bef2004-05-13 01:12:56 +0000193#define MX_CELL_SIZE (MX_PAGE_SIZE-10)
drh4b70f112004-05-02 21:12:19 +0000194
195/* The maximum number of cells on a single page of the database. This
196** assumes a minimum cell size of 3 bytes. Such small cells will be
197** exceedingly rare, but they are possible.
198*/
199#define MX_CELL ((MX_PAGE_SIZE-10)/3)
200
paulb95a8862003-04-01 21:16:41 +0000201/* Forward declarations */
drh3aac2dd2004-04-26 14:10:20 +0000202typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000203
drh8c42ca92001-06-22 19:15:00 +0000204/*
drhbd03cae2001-06-02 02:40:57 +0000205** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000206** SQLite database in order to identify the file as a real database.
drhde647132004-05-07 17:57:49 +0000207** 123456789 123456 */
208static const char zMagicHeader[] = "SQLite format 3";
drh08ed44e2001-04-29 23:32:55 +0000209
210/*
drh4b70f112004-05-02 21:12:19 +0000211** Page type flags. An ORed combination of these flags appear as the
212** first byte of every BTree page.
drh8c42ca92001-06-22 19:15:00 +0000213*/
drhde647132004-05-07 17:57:49 +0000214#define PTF_INTKEY 0x01
drh9e572e62004-04-23 23:43:10 +0000215#define PTF_ZERODATA 0x02
drh8b18dd42004-05-12 19:18:15 +0000216#define PTF_LEAFDATA 0x04
217#define PTF_LEAF 0x08
drh8c42ca92001-06-22 19:15:00 +0000218
219/*
drh9e572e62004-04-23 23:43:10 +0000220** As each page of the file is loaded into memory, an instance of the following
221** structure is appended and initialized to zero. This structure stores
222** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000223**
drh72f82862001-05-24 21:06:34 +0000224** The pParent field points back to the parent page. This allows us to
225** walk up the BTree from any leaf to the root. Care must be taken to
226** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000227** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000228*/
229struct MemPage {
drhde647132004-05-07 17:57:49 +0000230 u32 notUsed;
drh3aac2dd2004-04-26 14:10:20 +0000231 u8 isInit; /* True if previously initialized */
drh9e572e62004-04-23 23:43:10 +0000232 u8 idxShift; /* True if Cell indices have changed */
drh3aac2dd2004-04-26 14:10:20 +0000233 u8 isOverfull; /* Some aCell[] do not fit on page */
drh9e572e62004-04-23 23:43:10 +0000234 u8 intKey; /* True if intkey flag is set */
235 u8 leaf; /* True if leaf flag is set */
drh8b18dd42004-05-12 19:18:15 +0000236 u8 zeroData; /* True if table stores keys only */
237 u8 leafData; /* True if tables stores data on leaves only */
238 u8 hasData; /* True if this page stores data */
drh9e572e62004-04-23 23:43:10 +0000239 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
drhda200cc2004-05-09 11:51:38 +0000240 u8 needRelink; /* True if need to run relinkCellList() */
drh3aac2dd2004-04-26 14:10:20 +0000241 int idxParent; /* Index in pParent->aCell[] of this node */
drh9e572e62004-04-23 23:43:10 +0000242 int nFree; /* Number of free bytes on the page */
drh306dc212001-05-21 13:45:10 +0000243 int nCell; /* Number of entries on this page */
drh4b70f112004-05-02 21:12:19 +0000244 int nCellAlloc; /* Number of slots allocated in aCell[] */
drh9e572e62004-04-23 23:43:10 +0000245 unsigned char **aCell; /* Pointer to start of each cell */
drhc8629a12004-05-08 20:07:40 +0000246 struct Btree *pBt; /* Pointer back to BTree structure */
247
248 unsigned char *aData; /* Pointer back to the start of the page */
249 Pgno pgno; /* Page number for this page */
250 MemPage *pParent; /* The parent of this page. NULL for root */
drh8c42ca92001-06-22 19:15:00 +0000251};
drh7e3b0a02001-04-28 16:52:40 +0000252
253/*
drh3b7511c2001-05-26 13:15:44 +0000254** The in-memory image of a disk page has the auxiliary information appended
255** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
256** that extra information.
257*/
drh3aac2dd2004-04-26 14:10:20 +0000258#define EXTRA_SIZE sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000259
260/*
drha059ad02001-04-17 20:09:11 +0000261** Everything we need to know about an open database
262*/
263struct Btree {
264 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000265 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000266 MemPage *pPage1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000267 u8 inTrans; /* True if a transaction is in progress */
drh3aac2dd2004-04-26 14:10:20 +0000268 u8 inStmt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000269 u8 readOnly; /* True if the underlying file is readonly */
drhb6f41482004-05-14 01:58:11 +0000270 int pageSize; /* Total number of bytes on a page */
271 int usableSize; /* Number of usable bytes on each page */
drh6f11bef2004-05-13 01:12:56 +0000272 int maxLocal; /* Maximum local payload in non-LEAFDATA tables */
273 int minLocal; /* Minimum local payload in non-LEAFDATA tables */
274 int maxLeaf; /* Maximum local payload in a LEAFDATA table */
275 int minLeaf; /* Minimum local payload in a LEAFDATA table */
276 u8 maxEmbedFrac; /* Maximum payload as % of total page size */
277 u8 minEmbedFrac; /* Minimum payload as % of total page size */
278 u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
drha059ad02001-04-17 20:09:11 +0000279};
280typedef Btree Bt;
281
drh365d68f2001-05-11 11:02:46 +0000282/*
283** A cursor is a pointer to a particular entry in the BTree.
284** The entry is identified by its MemPage and the index in
drha34b6762004-05-07 13:30:42 +0000285** MemPage.aCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000286*/
drh72f82862001-05-24 21:06:34 +0000287struct BtCursor {
drh5e2f8b92001-05-28 00:41:15 +0000288 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000289 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000290 BtCursor *pShared; /* Loop of cursors with the same root page */
drh3aac2dd2004-04-26 14:10:20 +0000291 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
292 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000293 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000294 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000295 int idx; /* Index of the entry in pPage->aCell[] */
drhecdc7532001-09-23 02:35:53 +0000296 u8 wrFlag; /* True if writable */
drh23e11ca2004-05-04 17:27:28 +0000297 u8 iMatch; /* compare result from last sqlite3BtreeMoveto() */
drhc39e0002004-05-07 23:50:57 +0000298 u8 isValid; /* TRUE if points to a valid entry */
299 u8 status; /* Set to SQLITE_ABORT if cursors is invalidated */
drh365d68f2001-05-11 11:02:46 +0000300};
drh7e3b0a02001-04-28 16:52:40 +0000301
drha059ad02001-04-17 20:09:11 +0000302/*
drh6f11bef2004-05-13 01:12:56 +0000303** An instance of the following structure is used to hold information
304** about a cell. The parseCell() function fills the structure in.
305*/
306typedef struct CellInfo CellInfo;
307struct CellInfo {
308 i64 nKey; /* The key for INTKEY tables, or number of bytes in key */
309 u32 nData; /* Number of bytes of data */
310 int nHeader; /* Size of the header in bytes */
311 int nLocal; /* Amount of payload held locally */
312 int iOverflow; /* Offset to overflow page number. Zero if none */
313 int nSize; /* Size of the cell */
314};
315
316/*
drh3aac2dd2004-04-26 14:10:20 +0000317** Read or write a two-, four-, and eight-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000318*/
drh9e572e62004-04-23 23:43:10 +0000319static u32 get2byte(unsigned char *p){
320 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000321}
drh9e572e62004-04-23 23:43:10 +0000322static u32 get4byte(unsigned char *p){
323 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
324}
drh9e572e62004-04-23 23:43:10 +0000325static void put2byte(unsigned char *p, u32 v){
326 p[0] = v>>8;
327 p[1] = v;
328}
329static void put4byte(unsigned char *p, u32 v){
330 p[0] = v>>24;
331 p[1] = v>>16;
332 p[2] = v>>8;
333 p[3] = v;
334}
drh6f11bef2004-05-13 01:12:56 +0000335
drh9e572e62004-04-23 23:43:10 +0000336/*
drh6d2fb152004-05-14 16:50:06 +0000337** Routines to read and write variable-length integers.
drh9e572e62004-04-23 23:43:10 +0000338*/
drh6d2fb152004-05-14 16:50:06 +0000339#define getVarint sqlite3GetVarint
340#define getVarint32 sqlite3GetVarint32
341#define putVarint sqlite3PutVarint
drh0d316a42002-08-11 20:10:47 +0000342
343/*
drh3aac2dd2004-04-26 14:10:20 +0000344** Parse a cell header and fill in the CellInfo structure.
345*/
drh6f11bef2004-05-13 01:12:56 +0000346static void parseCell(
drh3aac2dd2004-04-26 14:10:20 +0000347 MemPage *pPage, /* Page containing the cell */
348 unsigned char *pCell, /* The cell */
drh6f11bef2004-05-13 01:12:56 +0000349 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000350){
351 int n;
drh6f11bef2004-05-13 01:12:56 +0000352 int nPayload;
353 Btree *pBt;
354 int minLocal, maxLocal;
drh3aac2dd2004-04-26 14:10:20 +0000355 if( pPage->leaf ){
356 n = 2;
357 }else{
358 n = 6;
359 }
drh8b18dd42004-05-12 19:18:15 +0000360 if( pPage->hasData ){
drh6f11bef2004-05-13 01:12:56 +0000361 n += getVarint32(&pCell[n], &pInfo->nData);
drh8b18dd42004-05-12 19:18:15 +0000362 }else{
drh6f11bef2004-05-13 01:12:56 +0000363 pInfo->nData = 0;
drh3aac2dd2004-04-26 14:10:20 +0000364 }
drh6f11bef2004-05-13 01:12:56 +0000365 n += getVarint(&pCell[n], &pInfo->nKey);
366 pInfo->nHeader = n;
367 nPayload = pInfo->nData;
368 if( !pPage->intKey ){
369 nPayload += pInfo->nKey;
370 }
371 pBt = pPage->pBt;
372 if( pPage->leafData ){
373 minLocal = pBt->minLeaf;
drhb6f41482004-05-14 01:58:11 +0000374 maxLocal = pBt->usableSize - 23;
drh6f11bef2004-05-13 01:12:56 +0000375 }else{
376 minLocal = pBt->minLocal;
377 maxLocal = pBt->maxLocal;
378 }
379 if( nPayload<=maxLocal ){
380 pInfo->nLocal = nPayload;
381 pInfo->iOverflow = 0;
382 pInfo->nSize = nPayload + n;
383 }else{
drhb6f41482004-05-14 01:58:11 +0000384 int surplus = minLocal + (nPayload - minLocal)%(pBt->usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +0000385 if( surplus <= maxLocal ){
386 pInfo->nLocal = surplus;
387 }else{
388 pInfo->nLocal = minLocal;
389 }
390 pInfo->iOverflow = pInfo->nLocal + n;
391 pInfo->nSize = pInfo->iOverflow + 4;
392 }
drh3aac2dd2004-04-26 14:10:20 +0000393}
394
395/*
drh3b7511c2001-05-26 13:15:44 +0000396** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000397** database page. The number returned includes the Cell header,
398** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000399** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000400** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000401*/
drh9e572e62004-04-23 23:43:10 +0000402static int cellSize(MemPage *pPage, unsigned char *pCell){
drh6f11bef2004-05-13 01:12:56 +0000403 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +0000404
drh6f11bef2004-05-13 01:12:56 +0000405 parseCell(pPage, pCell, &info);
406 return info.nSize;
drh3b7511c2001-05-26 13:15:44 +0000407}
408
409/*
drhda200cc2004-05-09 11:51:38 +0000410** Do sanity checking on a page. Throw an exception if anything is
411** not right.
412**
413** This routine is used for internal error checking only. It is omitted
414** from most builds.
415*/
416#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
417static void _pageIntegrity(MemPage *pPage){
drhb6f41482004-05-14 01:58:11 +0000418 int usableSize;
drhda200cc2004-05-09 11:51:38 +0000419 u8 *data;
420 int i, idx, c, pc, hdr, nFree;
421 u8 used[MX_PAGE_SIZE];
422
drhb6f41482004-05-14 01:58:11 +0000423 usableSize = pPage->pBt->usableSize;
424 assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] );
drhda200cc2004-05-09 11:51:38 +0000425 hdr = pPage->hdrOffset;
426 assert( hdr==(pPage->pgno==1 ? 100 : 0) );
427 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
428 c = pPage->aData[hdr];
429 if( pPage->isInit ){
430 assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
431 assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
drh8b18dd42004-05-12 19:18:15 +0000432 assert( pPage->leafData == ((c & PTF_LEAFDATA)!=0) );
433 assert( pPage->intKey == ((c & (PTF_INTKEY|PTF_LEAFDATA))!=0) );
434 assert( pPage->hasData ==
435 !(pPage->zeroData || (!pPage->leaf && pPage->leafData)) );
drhda200cc2004-05-09 11:51:38 +0000436 }
437 data = pPage->aData;
drhb6f41482004-05-14 01:58:11 +0000438 memset(used, 0, usableSize);
drhda200cc2004-05-09 11:51:38 +0000439 for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
440 nFree = 0;
441 pc = get2byte(&data[hdr+1]);
442 while( pc ){
443 int size;
drhb6f41482004-05-14 01:58:11 +0000444 assert( pc>0 && pc<usableSize-4 );
drhda200cc2004-05-09 11:51:38 +0000445 size = get2byte(&data[pc+2]);
drhb6f41482004-05-14 01:58:11 +0000446 assert( pc+size<=usableSize );
drhda200cc2004-05-09 11:51:38 +0000447 nFree += size;
448 for(i=pc; i<pc+size; i++){
449 assert( used[i]==0 );
450 used[i] = 1;
451 }
452 pc = get2byte(&data[pc]);
453 }
454 assert( pPage->isInit==0 || pPage->nFree==nFree+data[hdr+5] );
455 idx = 0;
456 pc = get2byte(&data[hdr+3]);
457 while( pc ){
458 int size;
459 assert( pPage->isInit==0 || idx<pPage->nCell );
drhb6f41482004-05-14 01:58:11 +0000460 assert( pc>0 && pc<usableSize-4 );
drhda200cc2004-05-09 11:51:38 +0000461 assert( pPage->isInit==0 || pPage->aCell[idx]==&data[pc] );
462 size = cellSize(pPage, &data[pc]);
drhb6f41482004-05-14 01:58:11 +0000463 assert( pc+size<=usableSize );
drhda200cc2004-05-09 11:51:38 +0000464 for(i=pc; i<pc+size; i++){
465 assert( used[i]==0 );
466 used[i] = 1;
467 }
468 pc = get2byte(&data[pc]);
469 idx++;
470 }
471 assert( idx==pPage->nCell );
472 nFree = 0;
drhb6f41482004-05-14 01:58:11 +0000473 for(i=0; i<usableSize; i++){
drhda200cc2004-05-09 11:51:38 +0000474 assert( used[i]<=1 );
475 if( used[i]==0 ) nFree++;
476 }
477 assert( nFree==data[hdr+5] );
478}
479#define pageIntegrity(X) _pageIntegrity(X)
480#else
481# define pageIntegrity(X)
482#endif
483
484/*
drh72f82862001-05-24 21:06:34 +0000485** Defragment the page given. All Cells are moved to the
486** beginning of the page and all free space is collected
487** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000488*/
drh9e572e62004-04-23 23:43:10 +0000489static void defragmentPage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000490 int pc, i, n, addr;
491 int start, hdr, size;
drh9e572e62004-04-23 23:43:10 +0000492 int leftover;
493 unsigned char *oldPage;
drh23e11ca2004-05-04 17:27:28 +0000494 unsigned char newPage[MX_PAGE_SIZE];
drh2af926b2001-05-15 00:39:25 +0000495
drha34b6762004-05-07 13:30:42 +0000496 assert( sqlite3pager_iswriteable(pPage->aData) );
drh9e572e62004-04-23 23:43:10 +0000497 assert( pPage->pBt!=0 );
drhb6f41482004-05-14 01:58:11 +0000498 assert( pPage->pBt->usableSize <= MX_PAGE_SIZE );
drhda200cc2004-05-09 11:51:38 +0000499 assert( !pPage->needRelink );
500 assert( !pPage->isOverfull );
drh9e572e62004-04-23 23:43:10 +0000501 oldPage = pPage->aData;
502 hdr = pPage->hdrOffset;
drh3aac2dd2004-04-26 14:10:20 +0000503 addr = 3+hdr;
drh9e572e62004-04-23 23:43:10 +0000504 n = 6+hdr;
505 if( !pPage->leaf ){
506 n += 4;
drh2af926b2001-05-15 00:39:25 +0000507 }
drhc39e0002004-05-07 23:50:57 +0000508 memcpy(&newPage[hdr], &oldPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000509 start = n;
drh3aac2dd2004-04-26 14:10:20 +0000510 pc = get2byte(&oldPage[addr]);
drh9e572e62004-04-23 23:43:10 +0000511 i = 0;
512 while( pc>0 ){
drhb6f41482004-05-14 01:58:11 +0000513 assert( n<pPage->pBt->usableSize );
drh9e572e62004-04-23 23:43:10 +0000514 size = cellSize(pPage, &oldPage[pc]);
515 memcpy(&newPage[n], &oldPage[pc], size);
drh3aac2dd2004-04-26 14:10:20 +0000516 put2byte(&newPage[addr],n);
drhda200cc2004-05-09 11:51:38 +0000517 assert( pPage->aCell[i]==&oldPage[pc] );
drhc39e0002004-05-07 23:50:57 +0000518 pPage->aCell[i++] = &oldPage[n];
drhda200cc2004-05-09 11:51:38 +0000519 addr = n;
drh9e572e62004-04-23 23:43:10 +0000520 n += size;
drh9e572e62004-04-23 23:43:10 +0000521 pc = get2byte(&oldPage[pc]);
drh2aa679f2001-06-25 02:11:07 +0000522 }
drhc39e0002004-05-07 23:50:57 +0000523 assert( i==pPage->nCell );
drhb6f41482004-05-14 01:58:11 +0000524 leftover = pPage->pBt->usableSize - n;
drh9e572e62004-04-23 23:43:10 +0000525 assert( leftover>=0 );
526 assert( pPage->nFree==leftover );
527 if( leftover<4 ){
528 oldPage[hdr+5] = leftover;
529 leftover = 0;
drhb6f41482004-05-14 01:58:11 +0000530 n = pPage->pBt->usableSize;
drh9e572e62004-04-23 23:43:10 +0000531 }
drhc39e0002004-05-07 23:50:57 +0000532 memcpy(&oldPage[hdr], &newPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000533 if( leftover==0 ){
drhc39e0002004-05-07 23:50:57 +0000534 put2byte(&oldPage[hdr+1], 0);
drh9e572e62004-04-23 23:43:10 +0000535 }else if( leftover>=4 ){
drhc39e0002004-05-07 23:50:57 +0000536 put2byte(&oldPage[hdr+1], n);
drh9e572e62004-04-23 23:43:10 +0000537 put2byte(&oldPage[n], 0);
538 put2byte(&oldPage[n+2], leftover);
539 memset(&oldPage[n+4], 0, leftover-4);
540 }
drhc39e0002004-05-07 23:50:57 +0000541 oldPage[hdr+5] = 0;
drh365d68f2001-05-11 11:02:46 +0000542}
543
drha059ad02001-04-17 20:09:11 +0000544/*
drh9e572e62004-04-23 23:43:10 +0000545** Allocate nByte bytes of space on a page. If nByte is less than
546** 4 it is rounded up to 4.
drhbd03cae2001-06-02 02:40:57 +0000547**
drh9e572e62004-04-23 23:43:10 +0000548** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000549** the new allocation. Or return 0 if there is not enough free
550** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000551**
drh72f82862001-05-24 21:06:34 +0000552** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000553** nBytes of contiguous free space, then this routine automatically
554** calls defragementPage() to consolidate all free space before
555** allocating the new chunk.
drh9e572e62004-04-23 23:43:10 +0000556**
557** Algorithm: Carve a piece off of the first freeblock that is
558** nByte in size or that larger.
drh7e3b0a02001-04-28 16:52:40 +0000559*/
drh9e572e62004-04-23 23:43:10 +0000560static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000561 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000562 int size;
drh24cd67e2004-05-10 16:18:47 +0000563 int nFrag;
drh9e572e62004-04-23 23:43:10 +0000564 unsigned char *data;
drh44ce7e22003-06-17 02:57:17 +0000565#ifndef NDEBUG
566 int cnt = 0;
567#endif
drh72f82862001-05-24 21:06:34 +0000568
drh9e572e62004-04-23 23:43:10 +0000569 data = pPage->aData;
drha34b6762004-05-07 13:30:42 +0000570 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000571 assert( pPage->pBt );
572 if( nByte<4 ) nByte = 4;
drh14acc042001-06-10 19:56:58 +0000573 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
drh9e572e62004-04-23 23:43:10 +0000574 hdr = pPage->hdrOffset;
drh24cd67e2004-05-10 16:18:47 +0000575 nFrag = data[hdr+5];
576 if( nFrag>=60 || nFrag>pPage->nFree-nByte ){
drh9e572e62004-04-23 23:43:10 +0000577 defragmentPage(pPage);
drh2af926b2001-05-15 00:39:25 +0000578 }
drh3aac2dd2004-04-26 14:10:20 +0000579 addr = hdr+1;
580 pc = get2byte(&data[addr]);
581 assert( addr<pc );
drhb6f41482004-05-14 01:58:11 +0000582 assert( pc<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000583 while( (size = get2byte(&data[pc+2]))<nByte ){
584 addr = pc;
585 pc = get2byte(&data[addr]);
drhb6f41482004-05-14 01:58:11 +0000586 assert( pc<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000587 assert( pc>=addr+size+4 || pc==0 );
drh9e572e62004-04-23 23:43:10 +0000588 if( pc==0 ){
589 assert( (cnt++)==0 );
590 defragmentPage(pPage);
591 assert( data[hdr+5]==0 );
drh3aac2dd2004-04-26 14:10:20 +0000592 addr = pPage->hdrOffset+1;
593 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000594 }
595 }
596 assert( pc>0 && size>=nByte );
drhb6f41482004-05-14 01:58:11 +0000597 assert( pc+size<=pPage->pBt->usableSize );
drh9e572e62004-04-23 23:43:10 +0000598 if( size>nByte+4 ){
drhde647132004-05-07 17:57:49 +0000599 int newStart = pc+nByte;
600 put2byte(&data[addr], newStart);
601 put2byte(&data[newStart], get2byte(&data[pc]));
602 put2byte(&data[newStart+2], size-nByte);
drh2af926b2001-05-15 00:39:25 +0000603 }else{
drh3aac2dd2004-04-26 14:10:20 +0000604 put2byte(&data[addr], get2byte(&data[pc]));
drh9e572e62004-04-23 23:43:10 +0000605 data[hdr+5] += size-nByte;
drh2af926b2001-05-15 00:39:25 +0000606 }
607 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000608 assert( pPage->nFree>=0 );
609 return pc;
drh7e3b0a02001-04-28 16:52:40 +0000610}
611
612/*
drh9e572e62004-04-23 23:43:10 +0000613** Return a section of the pPage->aData to the freelist.
614** The first byte of the new free block is pPage->aDisk[start]
615** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000616**
617** Most of the effort here is involved in coalesing adjacent
618** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000619*/
drh9e572e62004-04-23 23:43:10 +0000620static void freeSpace(MemPage *pPage, int start, int size){
621 int end = start + size; /* End of the segment being freed */
drha34b6762004-05-07 13:30:42 +0000622 int addr, pbegin;
drh9e572e62004-04-23 23:43:10 +0000623#ifndef NDEBUG
624 int tsize = 0; /* Total size of all freeblocks */
625#endif
626 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000627
drh9e572e62004-04-23 23:43:10 +0000628 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000629 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000630 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
drhb6f41482004-05-14 01:58:11 +0000631 assert( end<=pPage->pBt->usableSize );
drh9e572e62004-04-23 23:43:10 +0000632 if( size<4 ) size = 4;
633
634 /* Add the space back into the linked list of freeblocks */
drh3aac2dd2004-04-26 14:10:20 +0000635 addr = pPage->hdrOffset + 1;
636 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drhb6f41482004-05-14 01:58:11 +0000637 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000638 assert( pbegin>addr );
639 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000640 }
drhb6f41482004-05-14 01:58:11 +0000641 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000642 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000643 put2byte(&data[addr], start);
644 put2byte(&data[start], pbegin);
645 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000646 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000647
648 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000649 addr = pPage->hdrOffset + 1;
650 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000651 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000652 assert( pbegin>addr );
drhb6f41482004-05-14 01:58:11 +0000653 assert( pbegin<pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +0000654 pnext = get2byte(&data[pbegin]);
655 psize = get2byte(&data[pbegin+2]);
656 if( pbegin + psize + 3 >= pnext && pnext>0 ){
657 int frag = pnext - (pbegin+psize);
658 assert( frag<=data[pPage->hdrOffset+5] );
659 data[pPage->hdrOffset+5] -= frag;
660 put2byte(&data[pbegin], get2byte(&data[pnext]));
661 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
662 }else{
663 assert( (tsize += psize)>0 );
drh3aac2dd2004-04-26 14:10:20 +0000664 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000665 }
666 }
667 assert( tsize+data[pPage->hdrOffset+5]==pPage->nFree );
drh7e3b0a02001-04-28 16:52:40 +0000668}
669
drh9e572e62004-04-23 23:43:10 +0000670/*
drh4b70f112004-05-02 21:12:19 +0000671** Resize the aCell[] array of the given page so that it is able to
672** hold at least nNewSz entries.
673**
674** Return SQLITE_OK or SQLITE_NOMEM.
675*/
676static int resizeCellArray(MemPage *pPage, int nNewSz){
drha34b6762004-05-07 13:30:42 +0000677 if( pPage->nCellAlloc<nNewSz ){
drh4b70f112004-05-02 21:12:19 +0000678 pPage->aCell = sqliteRealloc(pPage->aCell, nNewSz*sizeof(pPage->aCell[0]) );
danielk197724b03fd2004-05-10 10:34:34 +0000679 if( sqlite3_malloc_failed ) return SQLITE_NOMEM;
drha34b6762004-05-07 13:30:42 +0000680 pPage->nCellAlloc = nNewSz;
drh4b70f112004-05-02 21:12:19 +0000681 }
682 return SQLITE_OK;
683}
684
685/*
drh7e3b0a02001-04-28 16:52:40 +0000686** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000687**
drhbd03cae2001-06-02 02:40:57 +0000688** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +0000689** is the parent of the page being initialized. The root of a
690** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000691**
drh72f82862001-05-24 21:06:34 +0000692** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000693** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000694** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
695** guarantee that the page is well-formed. It only shows that
696** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000697*/
drh9e572e62004-04-23 23:43:10 +0000698static int initPage(
drh3aac2dd2004-04-26 14:10:20 +0000699 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +0000700 MemPage *pParent /* The parent. Might be NULL */
701){
drh3aac2dd2004-04-26 14:10:20 +0000702 int c, pc, i, hdr;
drha34b6762004-05-07 13:30:42 +0000703 unsigned char *data;
drhb6f41482004-05-14 01:58:11 +0000704 int usableSize;
drh10617cd2004-05-14 15:27:27 +0000705 /* int sumCell = 0; // Total size of all cells */
706
drh2af926b2001-05-15 00:39:25 +0000707
drh3aac2dd2004-04-26 14:10:20 +0000708 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000709 assert( pParent==0 || pParent->pBt==pPage->pBt );
drha34b6762004-05-07 13:30:42 +0000710 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
drhde647132004-05-07 17:57:49 +0000711 assert( pPage->aData == &((unsigned char*)pPage)[-pPage->pBt->pageSize] );
drhda200cc2004-05-09 11:51:38 +0000712 assert( pPage->pParent==0 || pPage->pParent==pParent );
drh10617cd2004-05-14 15:27:27 +0000713 assert( pPage->pParent==pParent || !pPage->isInit );
714 if( pPage->isInit ) return SQLITE_OK;
drhda200cc2004-05-09 11:51:38 +0000715 if( pPage->pParent==0 && pParent!=0 ){
716 pPage->pParent = pParent;
drha34b6762004-05-07 13:30:42 +0000717 sqlite3pager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +0000718 }
drh4b70f112004-05-02 21:12:19 +0000719 pPage->nCell = pPage->nCellAlloc = 0;
drhde647132004-05-07 17:57:49 +0000720 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
721 hdr = pPage->hdrOffset;
drha34b6762004-05-07 13:30:42 +0000722 data = pPage->aData;
723 c = data[hdr];
drh8b18dd42004-05-12 19:18:15 +0000724 pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
drh9e572e62004-04-23 23:43:10 +0000725 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh8b18dd42004-05-12 19:18:15 +0000726 pPage->leafData = (c & PTF_LEAFDATA)!=0;
drh4b70f112004-05-02 21:12:19 +0000727 pPage->leaf = (c & PTF_LEAF)!=0;
drh8b18dd42004-05-12 19:18:15 +0000728 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
drhc8629a12004-05-08 20:07:40 +0000729 pPage->isOverfull = 0;
drhda200cc2004-05-09 11:51:38 +0000730 pPage->needRelink = 0;
drhc8629a12004-05-08 20:07:40 +0000731 pPage->idxShift = 0;
drhb6f41482004-05-14 01:58:11 +0000732 usableSize = pPage->pBt->usableSize;
drh2af926b2001-05-15 00:39:25 +0000733
drh9e572e62004-04-23 23:43:10 +0000734 /* Initialize the cell count and cell pointers */
735 pc = get2byte(&data[hdr+3]);
736 while( pc>0 ){
drhb6f41482004-05-14 01:58:11 +0000737 if( pc>=usableSize ) return SQLITE_CORRUPT;
738 if( pPage->nCell>usableSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000739 pPage->nCell++;
740 pc = get2byte(&data[pc]);
741 }
drh4b70f112004-05-02 21:12:19 +0000742 if( resizeCellArray(pPage, pPage->nCell) ){
drh9e572e62004-04-23 23:43:10 +0000743 return SQLITE_NOMEM;
744 }
745 pc = get2byte(&data[hdr+3]);
746 for(i=0; pc>0; i++){
747 pPage->aCell[i] = &data[pc];
drh10617cd2004-05-14 15:27:27 +0000748 /* sumCell += cellSize(pPage, &data[pc]); */
drhde647132004-05-07 17:57:49 +0000749 pc = get2byte(&data[pc]);
drh9e572e62004-04-23 23:43:10 +0000750 }
751
752 /* Compute the total free space on the page */
753 pPage->nFree = data[hdr+5];
754 pc = get2byte(&data[hdr+1]);
755 while( pc>0 ){
756 int next, size;
drhb6f41482004-05-14 01:58:11 +0000757 if( pc>=usableSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000758 next = get2byte(&data[pc]);
759 size = get2byte(&data[pc+2]);
drh3aac2dd2004-04-26 14:10:20 +0000760 if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000761 pPage->nFree += size;
762 pc = next;
763 }
drhb6f41482004-05-14 01:58:11 +0000764 if( pPage->nFree>=usableSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000765
766 /* Sanity check: Cells and freespace and header must sum to the size
drh10617cd2004-05-14 15:27:27 +0000767 ** a page.
drhb6f41482004-05-14 01:58:11 +0000768 if( sumCell+pPage->nFree+hdr+10-pPage->leaf*4 != usableSize ){
drha34b6762004-05-07 13:30:42 +0000769 return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000770 }
drh10617cd2004-05-14 15:27:27 +0000771 */
drh9e572e62004-04-23 23:43:10 +0000772
drhde647132004-05-07 17:57:49 +0000773 pPage->isInit = 1;
drhda200cc2004-05-09 11:51:38 +0000774 pageIntegrity(pPage);
drh9e572e62004-04-23 23:43:10 +0000775 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000776}
777
778/*
drh8b2f49b2001-06-08 00:21:52 +0000779** Set up a raw page so that it looks like a database page holding
780** no entries.
drhbd03cae2001-06-02 02:40:57 +0000781*/
drh9e572e62004-04-23 23:43:10 +0000782static void zeroPage(MemPage *pPage, int flags){
783 unsigned char *data = pPage->aData;
784 Btree *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +0000785 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +0000786 int first;
787
drhda200cc2004-05-09 11:51:38 +0000788 assert( sqlite3pager_pagenumber(data)==pPage->pgno );
789 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000790 assert( sqlite3pager_iswriteable(data) );
drhb6f41482004-05-14 01:58:11 +0000791 memset(&data[hdr], 0, pBt->usableSize - hdr);
drh9e572e62004-04-23 23:43:10 +0000792 data[hdr] = flags;
drhde647132004-05-07 17:57:49 +0000793 first = hdr + 6 + 4*((flags&PTF_LEAF)==0);
drh9e572e62004-04-23 23:43:10 +0000794 put2byte(&data[hdr+1], first);
drhb6f41482004-05-14 01:58:11 +0000795 put2byte(&data[first+2], pBt->usableSize - first);
drh9e572e62004-04-23 23:43:10 +0000796 sqliteFree(pPage->aCell);
797 pPage->aCell = 0;
drh8c42ca92001-06-22 19:15:00 +0000798 pPage->nCell = 0;
drhde647132004-05-07 17:57:49 +0000799 pPage->nCellAlloc = 0;
drhb6f41482004-05-14 01:58:11 +0000800 pPage->nFree = pBt->usableSize - first;
drh8b18dd42004-05-12 19:18:15 +0000801 pPage->intKey = (flags & (PTF_INTKEY|PTF_LEAFDATA))!=0;
drh9e572e62004-04-23 23:43:10 +0000802 pPage->zeroData = (flags & PTF_ZERODATA)!=0;
drh8b18dd42004-05-12 19:18:15 +0000803 pPage->leafData = (flags & PTF_LEAFDATA)!=0;
804 pPage->leaf = (flags & PTF_LEAF)!=0;
805 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
drh9e572e62004-04-23 23:43:10 +0000806 pPage->hdrOffset = hdr;
drhda200cc2004-05-09 11:51:38 +0000807 pPage->isOverfull = 0;
808 pPage->needRelink = 0;
809 pPage->idxShift = 0;
810 pPage->isInit = 1;
811 pageIntegrity(pPage);
drhbd03cae2001-06-02 02:40:57 +0000812}
813
814/*
drh3aac2dd2004-04-26 14:10:20 +0000815** Get a page from the pager. Initialize the MemPage.pBt and
816** MemPage.aData elements if needed.
817*/
818static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
819 int rc;
820 unsigned char *aData;
821 MemPage *pPage;
drha34b6762004-05-07 13:30:42 +0000822 rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
drh3aac2dd2004-04-26 14:10:20 +0000823 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +0000824 pPage = (MemPage*)&aData[pBt->pageSize];
drh3aac2dd2004-04-26 14:10:20 +0000825 pPage->aData = aData;
826 pPage->pBt = pBt;
827 pPage->pgno = pgno;
drhde647132004-05-07 17:57:49 +0000828 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
drh3aac2dd2004-04-26 14:10:20 +0000829 *ppPage = pPage;
830 return SQLITE_OK;
831}
832
833/*
drhde647132004-05-07 17:57:49 +0000834** Get a page from the pager and initialize it. This routine
835** is just a convenience wrapper around separate calls to
836** getPage() and initPage().
837*/
838static int getAndInitPage(
839 Btree *pBt, /* The database file */
840 Pgno pgno, /* Number of the page to get */
841 MemPage **ppPage, /* Write the page pointer here */
842 MemPage *pParent /* Parent of the page */
843){
844 int rc;
845 rc = getPage(pBt, pgno, ppPage);
drh10617cd2004-05-14 15:27:27 +0000846 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
drhde647132004-05-07 17:57:49 +0000847 rc = initPage(*ppPage, pParent);
848 }
849 return rc;
850}
851
852/*
drh3aac2dd2004-04-26 14:10:20 +0000853** Release a MemPage. This should be called once for each prior
854** call to getPage.
855*/
drh4b70f112004-05-02 21:12:19 +0000856static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000857 if( pPage ){
858 assert( pPage->aData );
859 assert( pPage->pBt );
860 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000861 sqlite3pager_unref(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +0000862 }
863}
864
865/*
drh72f82862001-05-24 21:06:34 +0000866** This routine is called when the reference count for a page
867** reaches zero. We need to unref the pParent pointer when that
868** happens.
869*/
drhb6f41482004-05-14 01:58:11 +0000870static void pageDestructor(void *pData, int pageSize){
871 MemPage *pPage = (MemPage*)&((char*)pData)[pageSize];
drhda200cc2004-05-09 11:51:38 +0000872 assert( pPage->isInit==0 || pPage->needRelink==0 );
drh72f82862001-05-24 21:06:34 +0000873 if( pPage->pParent ){
874 MemPage *pParent = pPage->pParent;
875 pPage->pParent = 0;
drha34b6762004-05-07 13:30:42 +0000876 releasePage(pParent);
drh72f82862001-05-24 21:06:34 +0000877 }
drh9e572e62004-04-23 23:43:10 +0000878 sqliteFree(pPage->aCell);
879 pPage->aCell = 0;
drh3aac2dd2004-04-26 14:10:20 +0000880 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +0000881}
882
883/*
drh306dc212001-05-21 13:45:10 +0000884** Open a new database.
885**
886** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000887** for accessing the database. We do not open the database file
888** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000889**
890** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000891** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +0000892** database file will be deleted when sqlite3BtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000893*/
drh23e11ca2004-05-04 17:27:28 +0000894int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +0000895 const char *zFilename, /* Name of the file containing the BTree database */
896 Btree **ppBtree, /* Pointer to new Btree object written here */
897 int nCache, /* Number of cache pages */
898 int flags /* Options */
drh6019e162001-07-02 17:51:45 +0000899){
drha059ad02001-04-17 20:09:11 +0000900 Btree *pBt;
drha34b6762004-05-07 13:30:42 +0000901 int rc;
drha059ad02001-04-17 20:09:11 +0000902
drhd62d3d02003-01-24 12:14:20 +0000903 /*
904 ** The following asserts make sure that structures used by the btree are
905 ** the right size. This is to guard against size changes that result
906 ** when compiling on a different architecture.
907 */
drh4a1c3802004-05-12 15:15:47 +0000908 assert( sizeof(i64)==8 );
drh9e572e62004-04-23 23:43:10 +0000909 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +0000910 assert( sizeof(u32)==4 );
911 assert( sizeof(u16)==2 );
912 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +0000913 assert( sizeof(ptr)==sizeof(char*) );
914 assert( sizeof(uptr)==sizeof(ptr) );
915
drha059ad02001-04-17 20:09:11 +0000916 pBt = sqliteMalloc( sizeof(*pBt) );
917 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000918 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000919 return SQLITE_NOMEM;
920 }
drh6019e162001-07-02 17:51:45 +0000921 if( nCache<10 ) nCache = 10;
drha34b6762004-05-07 13:30:42 +0000922 rc = sqlite3pager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
drh3aac2dd2004-04-26 14:10:20 +0000923 (flags & BTREE_OMIT_JOURNAL)==0);
drha059ad02001-04-17 20:09:11 +0000924 if( rc!=SQLITE_OK ){
drha34b6762004-05-07 13:30:42 +0000925 if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000926 sqliteFree(pBt);
927 *ppBtree = 0;
928 return rc;
929 }
drha34b6762004-05-07 13:30:42 +0000930 sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000931 pBt->pCursor = 0;
drha34b6762004-05-07 13:30:42 +0000932 pBt->pPage1 = 0;
933 pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
drh4b70f112004-05-02 21:12:19 +0000934 pBt->pageSize = SQLITE_PAGE_SIZE; /* FIX ME - read from header */
drhb6f41482004-05-14 01:58:11 +0000935 pBt->usableSize = pBt->pageSize;
drh6f11bef2004-05-13 01:12:56 +0000936 pBt->maxEmbedFrac = 64; /* FIX ME - read from header */
937 pBt->minEmbedFrac = 32; /* FIX ME - read from header */
938 pBt->minLeafFrac = 32; /* FIX ME - read from header */
drh3a4c1412004-05-09 20:40:11 +0000939
drha059ad02001-04-17 20:09:11 +0000940 *ppBtree = pBt;
941 return SQLITE_OK;
942}
943
944/*
945** Close an open database and invalidate all cursors.
946*/
drh3aac2dd2004-04-26 14:10:20 +0000947int sqlite3BtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000948 while( pBt->pCursor ){
drh3aac2dd2004-04-26 14:10:20 +0000949 sqlite3BtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000950 }
drha34b6762004-05-07 13:30:42 +0000951 sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000952 sqliteFree(pBt);
953 return SQLITE_OK;
954}
955
956/*
drhda47d772002-12-02 04:25:19 +0000957** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000958**
959** The maximum number of cache pages is set to the absolute
960** value of mxPage. If mxPage is negative, the pager will
961** operate asynchronously - it will not stop to do fsync()s
962** to insure data is written to the disk surface before
963** continuing. Transactions still work if synchronous is off,
964** and the database cannot be corrupted if this program
965** crashes. But if the operating system crashes or there is
966** an abrupt power failure when synchronous is off, the database
967** could be left in an inconsistent and unrecoverable state.
968** Synchronous is on by default so database corruption is not
969** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000970*/
drh23e11ca2004-05-04 17:27:28 +0000971int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
drha34b6762004-05-07 13:30:42 +0000972 sqlite3pager_set_cachesize(pBt->pPager, mxPage);
drhf57b14a2001-09-14 18:54:08 +0000973 return SQLITE_OK;
974}
975
976/*
drh973b6e32003-02-12 14:09:42 +0000977** Change the way data is synced to disk in order to increase or decrease
978** how well the database resists damage due to OS crashes and power
979** failures. Level 1 is the same as asynchronous (no syncs() occur and
980** there is a high probability of damage) Level 2 is the default. There
981** is a very low but non-zero probability of damage. Level 3 reduces the
982** probability of damage to near zero but with a write performance reduction.
983*/
drh3aac2dd2004-04-26 14:10:20 +0000984int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
drha34b6762004-05-07 13:30:42 +0000985 sqlite3pager_set_safety_level(pBt->pPager, level);
drh973b6e32003-02-12 14:09:42 +0000986 return SQLITE_OK;
987}
988
989/*
drha34b6762004-05-07 13:30:42 +0000990** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +0000991** also acquire a readlock on that file.
992**
993** SQLITE_OK is returned on success. If the file is not a
994** well-formed database file, then SQLITE_CORRUPT is returned.
995** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
996** is returned if we run out of memory. SQLITE_PROTOCOL is returned
997** if there is a locking protocol violation.
998*/
999static int lockBtree(Btree *pBt){
1000 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001001 MemPage *pPage1;
drha34b6762004-05-07 13:30:42 +00001002 if( pBt->pPage1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001003 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +00001004 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001005
drh306dc212001-05-21 13:45:10 +00001006
1007 /* Do some checking to help insure the file we opened really is
1008 ** a valid database file.
1009 */
drhb6f41482004-05-14 01:58:11 +00001010 rc = SQLITE_NOTADB;
drha34b6762004-05-07 13:30:42 +00001011 if( sqlite3pager_pagecount(pBt->pPager)>0 ){
drhb6f41482004-05-14 01:58:11 +00001012 u8 *page1 = pPage1->aData;
1013 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00001014 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00001015 }
drhb6f41482004-05-14 01:58:11 +00001016 if( page1[18]>1 || page1[19]>1 ){
1017 goto page1_init_failed;
1018 }
1019 pBt->pageSize = get2byte(&page1[16]);
1020 pBt->usableSize = pBt->pageSize - page1[20];
1021 if( pBt->usableSize<500 ){
1022 goto page1_init_failed;
1023 }
1024 pBt->maxEmbedFrac = page1[21];
1025 pBt->minEmbedFrac = page1[22];
1026 pBt->minLeafFrac = page1[23];
drh306dc212001-05-21 13:45:10 +00001027 }
drhb6f41482004-05-14 01:58:11 +00001028
1029 /* maxLocal is the maximum amount of payload to store locally for
1030 ** a cell. Make sure it is small enough so that at least minFanout
1031 ** cells can will fit on one page. We assume a 10-byte page header.
1032 ** Besides the payload, the cell must store:
1033 ** 2-byte pointer to next cell
1034 ** 4-byte child pointer
1035 ** 9-byte nKey value
1036 ** 4-byte nData value
1037 ** 4-byte overflow page pointer
1038 ** So a cell consists of a header which is as much as 19 bytes long,
1039 ** 0 to N bytes of payload, and an optional 4 byte overflow page pointer.
1040 */
1041 pBt->maxLocal = (pBt->usableSize-10)*pBt->maxEmbedFrac/255 - 23;
1042 pBt->minLocal = (pBt->usableSize-10)*pBt->minEmbedFrac/255 - 23;
1043 pBt->maxLeaf = pBt->usableSize - 33;
1044 pBt->minLeaf = (pBt->usableSize-10)*pBt->minLeafFrac/255 - 23;
1045 if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1046 goto page1_init_failed;
1047 }
1048 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE );
drh3aac2dd2004-04-26 14:10:20 +00001049 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00001050 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00001051
drh72f82862001-05-24 21:06:34 +00001052page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00001053 releasePage(pPage1);
1054 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00001055 return rc;
drh306dc212001-05-21 13:45:10 +00001056}
1057
1058/*
drhb8ca3072001-12-05 00:21:20 +00001059** If there are no outstanding cursors and we are not in the middle
1060** of a transaction but there is a read lock on the database, then
1061** this routine unrefs the first page of the database file which
1062** has the effect of releasing the read lock.
1063**
1064** If there are any outstanding cursors, this routine is a no-op.
1065**
1066** If there is a transaction in progress, this routine is a no-op.
1067*/
1068static void unlockBtreeIfUnused(Btree *pBt){
drh3aac2dd2004-04-26 14:10:20 +00001069 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->pPage1!=0 ){
1070 releasePage(pBt->pPage1);
1071 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +00001072 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001073 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001074 }
1075}
1076
1077/*
drh9e572e62004-04-23 23:43:10 +00001078** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001079** file.
drh8b2f49b2001-06-08 00:21:52 +00001080*/
1081static int newDatabase(Btree *pBt){
drh9e572e62004-04-23 23:43:10 +00001082 MemPage *pP1;
1083 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001084 int rc;
drhde647132004-05-07 17:57:49 +00001085 if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001086 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001087 assert( pP1!=0 );
1088 data = pP1->aData;
drha34b6762004-05-07 13:30:42 +00001089 rc = sqlite3pager_write(data);
drh8b2f49b2001-06-08 00:21:52 +00001090 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001091 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1092 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00001093 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00001094 data[18] = 1;
1095 data[19] = 1;
drhb6f41482004-05-14 01:58:11 +00001096 data[20] = pBt->pageSize - pBt->usableSize;
1097 data[21] = pBt->maxEmbedFrac;
1098 data[22] = pBt->minEmbedFrac;
1099 data[23] = pBt->minLeafFrac;
1100 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00001101 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drh8b2f49b2001-06-08 00:21:52 +00001102 return SQLITE_OK;
1103}
1104
1105/*
drh72f82862001-05-24 21:06:34 +00001106** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001107**
1108** A transaction must be started before attempting any changes
1109** to the database. None of the following routines will work
1110** unless a transaction is started first:
1111**
drh23e11ca2004-05-04 17:27:28 +00001112** sqlite3BtreeCreateTable()
1113** sqlite3BtreeCreateIndex()
1114** sqlite3BtreeClearTable()
1115** sqlite3BtreeDropTable()
1116** sqlite3BtreeInsert()
1117** sqlite3BtreeDelete()
1118** sqlite3BtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +00001119*/
drh3aac2dd2004-04-26 14:10:20 +00001120int sqlite3BtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001121 int rc;
1122 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +00001123 if( pBt->readOnly ) return SQLITE_READONLY;
drh3aac2dd2004-04-26 14:10:20 +00001124 if( pBt->pPage1==0 ){
drh7e3b0a02001-04-28 16:52:40 +00001125 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +00001126 if( rc!=SQLITE_OK ){
1127 return rc;
1128 }
drha059ad02001-04-17 20:09:11 +00001129 }
drha34b6762004-05-07 13:30:42 +00001130 rc = sqlite3pager_begin(pBt->pPage1->aData);
drhf74b8d92002-09-01 23:20:45 +00001131 if( rc==SQLITE_OK ){
1132 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +00001133 }
drhb8ca3072001-12-05 00:21:20 +00001134 if( rc==SQLITE_OK ){
1135 pBt->inTrans = 1;
drh3aac2dd2004-04-26 14:10:20 +00001136 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001137 }else{
1138 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001139 }
drhb8ca3072001-12-05 00:21:20 +00001140 return rc;
drha059ad02001-04-17 20:09:11 +00001141}
1142
1143/*
drh2aa679f2001-06-25 02:11:07 +00001144** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00001145**
1146** This will release the write lock on the database file. If there
1147** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001148*/
drh3aac2dd2004-04-26 14:10:20 +00001149int sqlite3BtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001150 int rc;
drha34b6762004-05-07 13:30:42 +00001151 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +00001152 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001153 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00001154 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001155 return rc;
1156}
1157
1158/*
drhc39e0002004-05-07 23:50:57 +00001159** Invalidate all cursors
1160*/
1161static void invalidateCursors(Btree *pBt){
1162 BtCursor *pCur;
1163 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1164 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001165 if( pPage /* && !pPage->isInit */ ){
1166 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001167 releasePage(pPage);
1168 pCur->pPage = 0;
1169 pCur->isValid = 0;
1170 pCur->status = SQLITE_ABORT;
1171 }
1172 }
1173}
1174
drhda200cc2004-05-09 11:51:38 +00001175#ifdef SQLITE_TEST
1176/*
1177** Print debugging information about all cursors to standard output.
1178*/
1179void sqlite3BtreeCursorList(Btree *pBt){
1180 BtCursor *pCur;
1181 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1182 MemPage *pPage = pCur->pPage;
1183 char *zMode = pCur->wrFlag ? "rw" : "ro";
1184 printf("CURSOR %08x rooted at %4d(%s) currently at %d.%d%s\n",
1185 (int)pCur, pCur->pgnoRoot, zMode,
1186 pPage ? pPage->pgno : 0, pCur->idx,
1187 pCur->isValid ? "" : " eof"
1188 );
1189 }
1190}
1191#endif
1192
drhc39e0002004-05-07 23:50:57 +00001193/*
drhecdc7532001-09-23 02:35:53 +00001194** Rollback the transaction in progress. All cursors will be
1195** invalided by this operation. Any attempt to use a cursor
1196** that was open at the beginning of this operation will result
1197** in an error.
drh5e00f6c2001-09-13 13:46:56 +00001198**
1199** This will release the write lock on the database file. If there
1200** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001201*/
drh3aac2dd2004-04-26 14:10:20 +00001202int sqlite3BtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001203 int rc;
drh24cd67e2004-05-10 16:18:47 +00001204 MemPage *pPage1;
drh7c717f72001-06-24 20:39:41 +00001205 if( pBt->inTrans==0 ) return SQLITE_OK;
1206 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001207 pBt->inStmt = 0;
drh24cd67e2004-05-10 16:18:47 +00001208 if( pBt->readOnly ){
1209 rc = SQLITE_OK;
1210 }else{
1211 rc = sqlite3pager_rollback(pBt->pPager);
1212 /* The rollback may have destroyed the pPage1->aData value. So
1213 ** call getPage() on page 1 again to make sure pPage1->aData is
1214 ** set correctly. */
1215 if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
1216 releasePage(pPage1);
1217 }
1218 }
drhc39e0002004-05-07 23:50:57 +00001219 invalidateCursors(pBt);
drh5e00f6c2001-09-13 13:46:56 +00001220 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001221 return rc;
1222}
1223
1224/*
drh663fc632002-02-02 18:49:19 +00001225** Set the checkpoint for the current transaction. The checkpoint serves
1226** as a sub-transaction that can be rolled back independently of the
1227** main transaction. You must start a transaction before starting a
1228** checkpoint. The checkpoint is ended automatically if the transaction
1229** commits or rolls back.
1230**
1231** Only one checkpoint may be active at a time. It is an error to try
1232** to start a new checkpoint if another checkpoint is already active.
1233*/
drh3aac2dd2004-04-26 14:10:20 +00001234int sqlite3BtreeBeginStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001235 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001236 if( !pBt->inTrans || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00001237 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00001238 }
drha34b6762004-05-07 13:30:42 +00001239 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00001240 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00001241 return rc;
1242}
1243
1244
1245/*
1246** Commit a checkpoint to transaction currently in progress. If no
1247** checkpoint is active, this is a no-op.
1248*/
drh3aac2dd2004-04-26 14:10:20 +00001249int sqlite3BtreeCommitStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001250 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001251 if( pBt->inStmt && !pBt->readOnly ){
drha34b6762004-05-07 13:30:42 +00001252 rc = sqlite3pager_stmt_commit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00001253 }else{
1254 rc = SQLITE_OK;
1255 }
drh3aac2dd2004-04-26 14:10:20 +00001256 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001257 return rc;
1258}
1259
1260/*
1261** Rollback the checkpoint to the current transaction. If there
1262** is no active checkpoint or transaction, this routine is a no-op.
1263**
1264** All cursors will be invalided by this operation. Any attempt
1265** to use a cursor that was open at the beginning of this operation
1266** will result in an error.
1267*/
drh3aac2dd2004-04-26 14:10:20 +00001268int sqlite3BtreeRollbackStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001269 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001270 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drha34b6762004-05-07 13:30:42 +00001271 rc = sqlite3pager_stmt_rollback(pBt->pPager);
drhc39e0002004-05-07 23:50:57 +00001272 invalidateCursors(pBt);
drh3aac2dd2004-04-26 14:10:20 +00001273 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001274 return rc;
1275}
1276
1277/*
drh3aac2dd2004-04-26 14:10:20 +00001278** Default key comparison function to be used if no comparison function
1279** is specified on the sqlite3BtreeCursor() call.
1280*/
1281static int dfltCompare(
1282 void *NotUsed, /* User data is not used */
1283 int n1, const void *p1, /* First key to compare */
1284 int n2, const void *p2 /* Second key to compare */
1285){
1286 int c;
1287 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1288 if( c==0 ){
1289 c = n1 - n2;
1290 }
1291 return c;
1292}
1293
1294/*
drh8b2f49b2001-06-08 00:21:52 +00001295** Create a new cursor for the BTree whose root is on the page
1296** iTable. The act of acquiring a cursor gets a read lock on
1297** the database file.
drh1bee3d72001-10-15 00:44:35 +00001298**
1299** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001300** If wrFlag==1, then the cursor can be used for reading or for
1301** writing if other conditions for writing are also met. These
1302** are the conditions that must be met in order for writing to
1303** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001304**
drhf74b8d92002-09-01 23:20:45 +00001305** 1: The cursor must have been opened with wrFlag==1
1306**
1307** 2: No other cursors may be open with wrFlag==0 on the same table
1308**
1309** 3: The database must be writable (not on read-only media)
1310**
1311** 4: There must be an active transaction.
1312**
1313** Condition 2 warrants further discussion. If any cursor is opened
1314** on a table with wrFlag==0, that prevents all other cursors from
1315** writing to that table. This is a kind of "read-lock". When a cursor
1316** is opened with wrFlag==0 it is guaranteed that the table will not
1317** change as long as the cursor is open. This allows the cursor to
1318** do a sequential scan of the table without having to worry about
1319** entries being inserted or deleted during the scan. Cursors should
1320** be opened with wrFlag==0 only if this read-lock property is needed.
1321** That is to say, cursors should be opened with wrFlag==0 only if they
drh23e11ca2004-05-04 17:27:28 +00001322** intend to use the sqlite3BtreeNext() system call. All other cursors
drhf74b8d92002-09-01 23:20:45 +00001323** should be opened with wrFlag==1 even if they never really intend
1324** to write.
1325**
drh6446c4d2001-12-15 14:22:18 +00001326** No checking is done to make sure that page iTable really is the
1327** root page of a b-tree. If it is not, then the cursor acquired
1328** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00001329**
1330** The comparison function must be logically the same for every cursor
1331** on a particular table. Changing the comparison function will result
1332** in incorrect operations. If the comparison function is NULL, a
1333** default comparison function is used. The comparison function is
1334** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00001335*/
drh3aac2dd2004-04-26 14:10:20 +00001336int sqlite3BtreeCursor(
1337 Btree *pBt, /* The btree */
1338 int iTable, /* Root page of table to open */
1339 int wrFlag, /* 1 to write. 0 read-only */
1340 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1341 void *pArg, /* First arg to xCompare() */
1342 BtCursor **ppCur /* Write new cursor here */
1343){
drha059ad02001-04-17 20:09:11 +00001344 int rc;
drhf74b8d92002-09-01 23:20:45 +00001345 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001346
drha0c9a112004-03-10 13:42:37 +00001347 if( pBt->readOnly && wrFlag ){
1348 *ppCur = 0;
1349 return SQLITE_READONLY;
1350 }
drh4b70f112004-05-02 21:12:19 +00001351 if( pBt->pPage1==0 ){
drha059ad02001-04-17 20:09:11 +00001352 rc = lockBtree(pBt);
1353 if( rc!=SQLITE_OK ){
1354 *ppCur = 0;
1355 return rc;
1356 }
1357 }
1358 pCur = sqliteMalloc( sizeof(*pCur) );
1359 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001360 rc = SQLITE_NOMEM;
1361 goto create_cursor_exception;
1362 }
drh8b2f49b2001-06-08 00:21:52 +00001363 pCur->pgnoRoot = (Pgno)iTable;
drh24cd67e2004-05-10 16:18:47 +00001364 if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
1365 rc = SQLITE_EMPTY;
1366 goto create_cursor_exception;
1367 }
drhde647132004-05-07 17:57:49 +00001368 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00001369 if( rc!=SQLITE_OK ){
1370 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001371 }
drh3aac2dd2004-04-26 14:10:20 +00001372 pCur->xCompare = xCmp ? xCmp : dfltCompare;
1373 pCur->pArg = pArg;
drh14acc042001-06-10 19:56:58 +00001374 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001375 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001376 pCur->idx = 0;
drha059ad02001-04-17 20:09:11 +00001377 pCur->pNext = pBt->pCursor;
1378 if( pCur->pNext ){
1379 pCur->pNext->pPrev = pCur;
1380 }
drh14acc042001-06-10 19:56:58 +00001381 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001382 pRing = pBt->pCursor;
1383 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1384 if( pRing ){
1385 pCur->pShared = pRing->pShared;
1386 pRing->pShared = pCur;
1387 }else{
1388 pCur->pShared = pCur;
1389 }
drha059ad02001-04-17 20:09:11 +00001390 pBt->pCursor = pCur;
drhc39e0002004-05-07 23:50:57 +00001391 pCur->isValid = 0;
1392 pCur->status = SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001393 *ppCur = pCur;
1394 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001395
1396create_cursor_exception:
1397 *ppCur = 0;
1398 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00001399 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001400 sqliteFree(pCur);
1401 }
drh5e00f6c2001-09-13 13:46:56 +00001402 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001403 return rc;
drha059ad02001-04-17 20:09:11 +00001404}
1405
1406/*
drh5e00f6c2001-09-13 13:46:56 +00001407** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001408** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001409*/
drh3aac2dd2004-04-26 14:10:20 +00001410int sqlite3BtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001411 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001412 if( pCur->pPrev ){
1413 pCur->pPrev->pNext = pCur->pNext;
1414 }else{
1415 pBt->pCursor = pCur->pNext;
1416 }
1417 if( pCur->pNext ){
1418 pCur->pNext->pPrev = pCur->pPrev;
1419 }
drh3aac2dd2004-04-26 14:10:20 +00001420 releasePage(pCur->pPage);
drhf74b8d92002-09-01 23:20:45 +00001421 if( pCur->pShared!=pCur ){
1422 BtCursor *pRing = pCur->pShared;
1423 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1424 pRing->pShared = pCur->pShared;
1425 }
drh5e00f6c2001-09-13 13:46:56 +00001426 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001427 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001428 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001429}
1430
drh7e3b0a02001-04-28 16:52:40 +00001431/*
drh5e2f8b92001-05-28 00:41:15 +00001432** Make a temporary cursor by filling in the fields of pTempCur.
1433** The temporary cursor is not on the cursor list for the Btree.
1434*/
drh14acc042001-06-10 19:56:58 +00001435static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001436 memcpy(pTempCur, pCur, sizeof(*pCur));
1437 pTempCur->pNext = 0;
1438 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001439 if( pTempCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001440 sqlite3pager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001441 }
drh5e2f8b92001-05-28 00:41:15 +00001442}
1443
1444/*
drhbd03cae2001-06-02 02:40:57 +00001445** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001446** function above.
1447*/
drh14acc042001-06-10 19:56:58 +00001448static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001449 if( pCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001450 sqlite3pager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001451 }
drh5e2f8b92001-05-28 00:41:15 +00001452}
1453
1454/*
drh3aac2dd2004-04-26 14:10:20 +00001455** Set *pSize to the size of the buffer needed to hold the value of
1456** the key for the current entry. If the cursor is not pointing
1457** to a valid entry, *pSize is set to 0.
1458**
drh4b70f112004-05-02 21:12:19 +00001459** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00001460** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00001461*/
drh4a1c3802004-05-12 15:15:47 +00001462int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
drh2af926b2001-05-15 00:39:25 +00001463 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001464 unsigned char *cell;
drh2af926b2001-05-15 00:39:25 +00001465
drhc39e0002004-05-07 23:50:57 +00001466 if( !pCur->isValid ){
drh72f82862001-05-24 21:06:34 +00001467 *pSize = 0;
1468 }else{
drhc39e0002004-05-07 23:50:57 +00001469 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001470 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001471 assert( pPage!=0 );
1472 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1473 cell = pPage->aCell[pCur->idx];
drh3aac2dd2004-04-26 14:10:20 +00001474 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001475 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001476 cell += 4; /* Skip the child pointer */
1477 }
drh8b18dd42004-05-12 19:18:15 +00001478 if( pPage->hasData ){
drha34b6762004-05-07 13:30:42 +00001479 while( (0x80&*(cell++))!=0 ){} /* Skip the data size number */
drh3aac2dd2004-04-26 14:10:20 +00001480 }
drha34b6762004-05-07 13:30:42 +00001481 getVarint(cell, pSize);
drh72f82862001-05-24 21:06:34 +00001482 }
1483 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001484}
drh2af926b2001-05-15 00:39:25 +00001485
drh72f82862001-05-24 21:06:34 +00001486/*
drh0e1c19e2004-05-11 00:58:56 +00001487** Set *pSize to the number of bytes of data in the entry the
1488** cursor currently points to. Always return SQLITE_OK.
1489** Failure is not possible. If the cursor is not currently
1490** pointing to an entry (which can happen, for example, if
1491** the database is empty) then *pSize is set to 0.
1492*/
1493int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
1494 MemPage *pPage;
1495 unsigned char *cell;
drh0e1c19e2004-05-11 00:58:56 +00001496
1497 if( !pCur->isValid ){
danielk197796fc5fe2004-05-13 11:34:16 +00001498 /* Not pointing at a valid entry - set *pSize to 0. */
drh0e1c19e2004-05-11 00:58:56 +00001499 *pSize = 0;
1500 }else{
danielk197796fc5fe2004-05-13 11:34:16 +00001501 pPage = pCur->pPage;
1502 assert( pPage!=0 );
1503 assert( pPage->isInit );
1504 pageIntegrity(pPage);
1505 if( !pPage->hasData ){
1506 *pSize = 0;
1507 }else{
1508 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1509 cell = pPage->aCell[pCur->idx];
1510 cell += 2; /* Skip the offset to the next cell */
1511 if( !pPage->leaf ){
1512 cell += 4; /* Skip the child pointer */
1513 }
1514 getVarint32(cell, pSize);
drh0e1c19e2004-05-11 00:58:56 +00001515 }
drh0e1c19e2004-05-11 00:58:56 +00001516 }
1517 return SQLITE_OK;
1518}
1519
1520/*
drh72f82862001-05-24 21:06:34 +00001521** Read payload information from the entry that the pCur cursor is
1522** pointing to. Begin reading the payload at "offset" and read
1523** a total of "amt" bytes. Put the result in zBuf.
1524**
1525** This routine does not make a distinction between key and data.
1526** It just reads bytes from the payload area.
1527*/
drh3aac2dd2004-04-26 14:10:20 +00001528static int getPayload(
1529 BtCursor *pCur, /* Cursor pointing to entry to read from */
1530 int offset, /* Begin reading this far into payload */
1531 int amt, /* Read this many bytes */
1532 unsigned char *pBuf, /* Write the bytes into this buffer */
1533 int skipKey /* offset begins at data if this is true */
1534){
1535 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001536 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001537 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001538 MemPage *pPage;
1539 Btree *pBt;
drh6f11bef2004-05-13 01:12:56 +00001540 int ovflSize;
1541 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00001542
drh72f82862001-05-24 21:06:34 +00001543 assert( pCur!=0 && pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001544 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001545 pBt = pCur->pBt;
1546 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001547 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00001548 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1549 aPayload = pPage->aCell[pCur->idx];
drh6f11bef2004-05-13 01:12:56 +00001550 parseCell(pPage, aPayload, &info);
1551 aPayload += info.nHeader;
drh3aac2dd2004-04-26 14:10:20 +00001552 if( pPage->intKey ){
drh6f11bef2004-05-13 01:12:56 +00001553 info.nKey = 0;
drh3aac2dd2004-04-26 14:10:20 +00001554 }
1555 assert( offset>=0 );
1556 if( skipKey ){
drh6f11bef2004-05-13 01:12:56 +00001557 offset += info.nKey;
drh3aac2dd2004-04-26 14:10:20 +00001558 }
drh6f11bef2004-05-13 01:12:56 +00001559 if( offset+amt > info.nKey+info.nData ){
drha34b6762004-05-07 13:30:42 +00001560 return SQLITE_ERROR;
drh3aac2dd2004-04-26 14:10:20 +00001561 }
drh6f11bef2004-05-13 01:12:56 +00001562 if( offset<info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00001563 int a = amt;
drh6f11bef2004-05-13 01:12:56 +00001564 if( a+offset>info.nLocal ){
1565 a = info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00001566 }
drha34b6762004-05-07 13:30:42 +00001567 memcpy(pBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001568 if( a==amt ){
1569 return SQLITE_OK;
1570 }
drh2aa679f2001-06-25 02:11:07 +00001571 offset = 0;
drha34b6762004-05-07 13:30:42 +00001572 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00001573 amt -= a;
drhdd793422001-06-28 01:54:48 +00001574 }else{
drh6f11bef2004-05-13 01:12:56 +00001575 offset -= info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00001576 }
1577 if( amt>0 ){
drh6f11bef2004-05-13 01:12:56 +00001578 nextPage = get4byte(&aPayload[info.nLocal]);
drh2af926b2001-05-15 00:39:25 +00001579 }
drhb6f41482004-05-14 01:58:11 +00001580 ovflSize = pBt->usableSize - 4;
drh2af926b2001-05-15 00:39:25 +00001581 while( amt>0 && nextPage ){
drha34b6762004-05-07 13:30:42 +00001582 rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
drh2af926b2001-05-15 00:39:25 +00001583 if( rc!=0 ){
1584 return rc;
1585 }
drha34b6762004-05-07 13:30:42 +00001586 nextPage = get4byte(aPayload);
drh3aac2dd2004-04-26 14:10:20 +00001587 if( offset<ovflSize ){
drh2af926b2001-05-15 00:39:25 +00001588 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001589 if( a + offset > ovflSize ){
1590 a = ovflSize - offset;
drh2af926b2001-05-15 00:39:25 +00001591 }
drh9b171272004-05-08 02:03:22 +00001592 memcpy(pBuf, &aPayload[offset+4], a);
drh2aa679f2001-06-25 02:11:07 +00001593 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001594 amt -= a;
drha34b6762004-05-07 13:30:42 +00001595 pBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001596 }else{
drh3aac2dd2004-04-26 14:10:20 +00001597 offset -= ovflSize;
drh2af926b2001-05-15 00:39:25 +00001598 }
drha34b6762004-05-07 13:30:42 +00001599 sqlite3pager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00001600 }
drha7fcb052001-12-14 15:09:55 +00001601 if( amt>0 ){
1602 return SQLITE_CORRUPT;
1603 }
1604 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001605}
1606
drh72f82862001-05-24 21:06:34 +00001607/*
drh3aac2dd2004-04-26 14:10:20 +00001608** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001609** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001610** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00001611**
drh3aac2dd2004-04-26 14:10:20 +00001612** Return SQLITE_OK on success or an error code if anything goes
1613** wrong. An error is returned if "offset+amt" is larger than
1614** the available payload.
drh72f82862001-05-24 21:06:34 +00001615*/
drha34b6762004-05-07 13:30:42 +00001616int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh8c1238a2003-01-02 14:43:55 +00001617 assert( amt>=0 );
1618 assert( offset>=0 );
drhc39e0002004-05-07 23:50:57 +00001619 if( pCur->isValid==0 ){
1620 return pCur->status;
drh3aac2dd2004-04-26 14:10:20 +00001621 }
drhc39e0002004-05-07 23:50:57 +00001622 assert( pCur->pPage!=0 );
1623 assert( pCur->pPage->intKey==0 );
1624 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001625 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1626}
1627
1628/*
drh3aac2dd2004-04-26 14:10:20 +00001629** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001630** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001631** begins at "offset".
1632**
1633** Return SQLITE_OK on success or an error code if anything goes
1634** wrong. An error is returned if "offset+amt" is larger than
1635** the available payload.
drh72f82862001-05-24 21:06:34 +00001636*/
drh3aac2dd2004-04-26 14:10:20 +00001637int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhc39e0002004-05-07 23:50:57 +00001638 if( !pCur->isValid ){
1639 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1640 }
drh8c1238a2003-01-02 14:43:55 +00001641 assert( amt>=0 );
1642 assert( offset>=0 );
1643 assert( pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001644 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001645 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00001646}
1647
drh72f82862001-05-24 21:06:34 +00001648/*
drh0e1c19e2004-05-11 00:58:56 +00001649** Return a pointer to payload information from the entry that the
1650** pCur cursor is pointing to. The pointer is to the beginning of
1651** the key if skipKey==0 and it points to the beginning of data if
1652** skipKey==1.
1653**
1654** At least amt bytes of information must be available on the local
1655** page or else this routine returns NULL. If amt<0 then the entire
1656** key/data must be available.
1657**
1658** This routine is an optimization. It is common for the entire key
1659** and data to fit on the local page and for there to be no overflow
1660** pages. When that is so, this routine can be used to access the
1661** key and data without making a copy. If the key and/or data spills
1662** onto overflow pages, then getPayload() must be used to reassembly
1663** the key/data and copy it into a preallocated buffer.
1664**
1665** The pointer returned by this routine looks directly into the cached
1666** page of the database. The data might change or move the next time
1667** any btree routine is called.
1668*/
1669static const unsigned char *fetchPayload(
1670 BtCursor *pCur, /* Cursor pointing to entry to read from */
1671 int amt, /* Amount requested */
1672 int skipKey /* read beginning at data if this is true */
1673){
1674 unsigned char *aPayload;
1675 MemPage *pPage;
1676 Btree *pBt;
drh6f11bef2004-05-13 01:12:56 +00001677 CellInfo info;
drh0e1c19e2004-05-11 00:58:56 +00001678
1679 assert( pCur!=0 && pCur->pPage!=0 );
1680 assert( pCur->isValid );
1681 pBt = pCur->pBt;
1682 pPage = pCur->pPage;
1683 pageIntegrity(pPage);
1684 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1685 aPayload = pPage->aCell[pCur->idx];
drh6f11bef2004-05-13 01:12:56 +00001686 parseCell(pPage, aPayload, &info);
1687 aPayload += info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00001688 if( pPage->intKey ){
drh6f11bef2004-05-13 01:12:56 +00001689 info.nKey = 0;
drh0e1c19e2004-05-11 00:58:56 +00001690 }
drh0e1c19e2004-05-11 00:58:56 +00001691 if( skipKey ){
drh6f11bef2004-05-13 01:12:56 +00001692 aPayload += info.nKey;
1693 info.nLocal -= info.nKey;
1694 if( amt<0 ) amt = info.nData;
1695 assert( amt<=info.nData );
drh0e1c19e2004-05-11 00:58:56 +00001696 }else{
drh6f11bef2004-05-13 01:12:56 +00001697 if( amt<0 ) amt = info.nKey;
1698 assert( amt<=info.nKey );
drh0e1c19e2004-05-11 00:58:56 +00001699 }
drh6f11bef2004-05-13 01:12:56 +00001700 if( amt>info.nLocal ){
drh0e1c19e2004-05-11 00:58:56 +00001701 return 0; /* If any of the data is not local, return nothing */
1702 }
1703 return aPayload;
1704}
1705
1706
1707/*
1708** Return a pointer to the first amt bytes of the key or data
1709** for record that cursor pCur is point to if the entire request
1710** exists in contiguous memory on the main tree page. If any
1711** any part of the request is on an overflow page, return 0.
1712** If pCur is not pointing to a valid entry return 0.
1713**
1714** If amt<0 then return the entire key or data.
1715**
1716** The pointer returned is ephemeral. The key/data may move
1717** or be destroyed on the next call to any Btree routine.
1718**
1719** These routines is used to get quick access to key and data
1720** in the common case where no overflow pages are used.
1721**
1722** It is a fatal error to call these routines with amt values that
1723** are larger than the key/data size.
1724*/
1725const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int amt){
1726 return (const void*)fetchPayload(pCur, amt, 0);
1727}
1728const void *sqlite3BtreeDataFetch(BtCursor *pCur, int amt){
1729 return (const void*)fetchPayload(pCur, amt, 1);
1730}
1731
1732
1733/*
drh8178a752003-01-05 21:41:40 +00001734** Move the cursor down to a new child page. The newPgno argument is the
1735** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001736*/
drh3aac2dd2004-04-26 14:10:20 +00001737static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00001738 int rc;
1739 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00001740 MemPage *pOldPage;
drh0d316a42002-08-11 20:10:47 +00001741 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001742
drhc39e0002004-05-07 23:50:57 +00001743 assert( pCur->isValid );
drhde647132004-05-07 17:57:49 +00001744 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001745 if( rc ) return rc;
drhda200cc2004-05-09 11:51:38 +00001746 pageIntegrity(pNewPage);
drh428ae8c2003-01-04 16:48:09 +00001747 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00001748 pOldPage = pCur->pPage;
1749 pOldPage->idxShift = 0;
1750 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00001751 pCur->pPage = pNewPage;
1752 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001753 if( pNewPage->nCell<1 ){
1754 return SQLITE_CORRUPT;
1755 }
drh72f82862001-05-24 21:06:34 +00001756 return SQLITE_OK;
1757}
1758
1759/*
drh8856d6a2004-04-29 14:42:46 +00001760** Return true if the page is the virtual root of its table.
1761**
1762** The virtual root page is the root page for most tables. But
1763** for the table rooted on page 1, sometime the real root page
1764** is empty except for the right-pointer. In such cases the
1765** virtual root page is the page that the right-pointer of page
1766** 1 is pointing to.
1767*/
1768static int isRootPage(MemPage *pPage){
1769 MemPage *pParent = pPage->pParent;
drhda200cc2004-05-09 11:51:38 +00001770 if( pParent==0 ) return 1;
1771 if( pParent->pgno>1 ) return 0;
1772 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
drh8856d6a2004-04-29 14:42:46 +00001773 return 0;
1774}
1775
1776/*
drh5e2f8b92001-05-28 00:41:15 +00001777** Move the cursor up to the parent page.
1778**
1779** pCur->idx is set to the cell index that contains the pointer
1780** to the page we are coming from. If we are coming from the
1781** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001782** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001783*/
drh8178a752003-01-05 21:41:40 +00001784static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001785 Pgno oldPgno;
1786 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001787 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001788 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001789
drhc39e0002004-05-07 23:50:57 +00001790 assert( pCur->isValid );
drh8178a752003-01-05 21:41:40 +00001791 pPage = pCur->pPage;
1792 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00001793 assert( !isRootPage(pPage) );
drhda200cc2004-05-09 11:51:38 +00001794 pageIntegrity(pPage);
drh8178a752003-01-05 21:41:40 +00001795 pParent = pPage->pParent;
1796 assert( pParent!=0 );
drhda200cc2004-05-09 11:51:38 +00001797 pageIntegrity(pParent);
drh8178a752003-01-05 21:41:40 +00001798 idxParent = pPage->idxParent;
drha34b6762004-05-07 13:30:42 +00001799 sqlite3pager_ref(pParent->aData);
drh3aac2dd2004-04-26 14:10:20 +00001800 oldPgno = pPage->pgno;
1801 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00001802 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001803 assert( pParent->idxShift==0 );
1804 if( pParent->idxShift==0 ){
1805 pCur->idx = idxParent;
1806#ifndef NDEBUG
1807 /* Verify that pCur->idx is the correct index to point back to the child
1808 ** page we just came from
1809 */
drh428ae8c2003-01-04 16:48:09 +00001810 if( pCur->idx<pParent->nCell ){
drha34b6762004-05-07 13:30:42 +00001811 assert( get4byte(&pParent->aCell[idxParent][2])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001812 }else{
drha34b6762004-05-07 13:30:42 +00001813 assert( get4byte(&pParent->aData[pParent->hdrOffset+6])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001814 }
1815#endif
1816 }else{
1817 /* The MemPage.idxShift flag indicates that cell indices might have
1818 ** changed since idxParent was set and hence idxParent might be out
1819 ** of date. So recompute the parent cell index by scanning all cells
1820 ** and locating the one that points to the child we just came from.
1821 */
1822 int i;
1823 pCur->idx = pParent->nCell;
drh428ae8c2003-01-04 16:48:09 +00001824 for(i=0; i<pParent->nCell; i++){
drh3aac2dd2004-04-26 14:10:20 +00001825 if( get4byte(&pParent->aCell[i][2])==oldPgno ){
drh428ae8c2003-01-04 16:48:09 +00001826 pCur->idx = i;
1827 break;
1828 }
drh72f82862001-05-24 21:06:34 +00001829 }
1830 }
1831}
1832
1833/*
1834** Move the cursor to the root page
1835*/
drh5e2f8b92001-05-28 00:41:15 +00001836static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00001837 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00001838 int rc;
drh0d316a42002-08-11 20:10:47 +00001839 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001840
drhde647132004-05-07 17:57:49 +00001841 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
drhc39e0002004-05-07 23:50:57 +00001842 if( rc ){
1843 pCur->isValid = 0;
1844 return rc;
1845 }
drh3aac2dd2004-04-26 14:10:20 +00001846 releasePage(pCur->pPage);
drhda200cc2004-05-09 11:51:38 +00001847 pageIntegrity(pRoot);
drh3aac2dd2004-04-26 14:10:20 +00001848 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00001849 pCur->idx = 0;
drh8856d6a2004-04-29 14:42:46 +00001850 if( pRoot->nCell==0 && !pRoot->leaf ){
1851 Pgno subpage;
1852 assert( pRoot->pgno==1 );
1853 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+6]);
1854 assert( subpage>0 );
drh3644f082004-05-10 18:45:09 +00001855 pCur->isValid = 1;
drh4b70f112004-05-02 21:12:19 +00001856 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00001857 }
drhc39e0002004-05-07 23:50:57 +00001858 pCur->isValid = pCur->pPage->nCell>0;
drh8856d6a2004-04-29 14:42:46 +00001859 return rc;
drh72f82862001-05-24 21:06:34 +00001860}
drh2af926b2001-05-15 00:39:25 +00001861
drh5e2f8b92001-05-28 00:41:15 +00001862/*
1863** Move the cursor down to the left-most leaf entry beneath the
1864** entry to which it is currently pointing.
1865*/
1866static int moveToLeftmost(BtCursor *pCur){
1867 Pgno pgno;
1868 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001869 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00001870
drhc39e0002004-05-07 23:50:57 +00001871 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001872 while( !(pPage = pCur->pPage)->leaf ){
drha34b6762004-05-07 13:30:42 +00001873 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1874 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001875 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001876 if( rc ) return rc;
1877 }
1878 return SQLITE_OK;
1879}
1880
drh2dcc9aa2002-12-04 13:40:25 +00001881/*
1882** Move the cursor down to the right-most leaf entry beneath the
1883** page to which it is currently pointing. Notice the difference
1884** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1885** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1886** finds the right-most entry beneath the *page*.
1887*/
1888static int moveToRightmost(BtCursor *pCur){
1889 Pgno pgno;
1890 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001891 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001892
drhc39e0002004-05-07 23:50:57 +00001893 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001894 while( !(pPage = pCur->pPage)->leaf ){
1895 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
1896 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001897 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001898 if( rc ) return rc;
1899 }
drh3aac2dd2004-04-26 14:10:20 +00001900 pCur->idx = pPage->nCell - 1;
drh2dcc9aa2002-12-04 13:40:25 +00001901 return SQLITE_OK;
1902}
1903
drh5e00f6c2001-09-13 13:46:56 +00001904/* Move the cursor to the first entry in the table. Return SQLITE_OK
1905** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001906** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001907*/
drh3aac2dd2004-04-26 14:10:20 +00001908int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001909 int rc;
drhc39e0002004-05-07 23:50:57 +00001910 if( pCur->status ){
1911 return pCur->status;
1912 }
drh5e00f6c2001-09-13 13:46:56 +00001913 rc = moveToRoot(pCur);
1914 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001915 if( pCur->isValid==0 ){
1916 assert( pCur->pPage->nCell==0 );
drh5e00f6c2001-09-13 13:46:56 +00001917 *pRes = 1;
1918 return SQLITE_OK;
1919 }
drhc39e0002004-05-07 23:50:57 +00001920 assert( pCur->pPage->nCell>0 );
drh5e00f6c2001-09-13 13:46:56 +00001921 *pRes = 0;
1922 rc = moveToLeftmost(pCur);
1923 return rc;
1924}
drh5e2f8b92001-05-28 00:41:15 +00001925
drh9562b552002-02-19 15:00:07 +00001926/* Move the cursor to the last entry in the table. Return SQLITE_OK
1927** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001928** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001929*/
drh3aac2dd2004-04-26 14:10:20 +00001930int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001931 int rc;
drhc39e0002004-05-07 23:50:57 +00001932 if( pCur->status ){
1933 return pCur->status;
1934 }
drh9562b552002-02-19 15:00:07 +00001935 rc = moveToRoot(pCur);
1936 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001937 if( pCur->isValid==0 ){
1938 assert( pCur->pPage->nCell==0 );
drh9562b552002-02-19 15:00:07 +00001939 *pRes = 1;
1940 return SQLITE_OK;
1941 }
drhc39e0002004-05-07 23:50:57 +00001942 assert( pCur->isValid );
drh9562b552002-02-19 15:00:07 +00001943 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001944 rc = moveToRightmost(pCur);
drh9562b552002-02-19 15:00:07 +00001945 return rc;
1946}
1947
drh3aac2dd2004-04-26 14:10:20 +00001948/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00001949** Return a success code.
1950**
drh3aac2dd2004-04-26 14:10:20 +00001951** For INTKEY tables, only the nKey parameter is used. pKey is
1952** ignored. For other tables, nKey is the number of bytes of data
1953** in nKey. The comparison function specified when the cursor was
1954** created is used to compare keys.
1955**
drh5e2f8b92001-05-28 00:41:15 +00001956** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001957** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001958** were present. The cursor might point to an entry that comes
1959** before or after the key.
1960**
drhbd03cae2001-06-02 02:40:57 +00001961** The result of comparing the key with the entry to which the
1962** cursor is left pointing is stored in pCur->iMatch. The same
1963** value is also written to *pRes if pRes!=NULL. The meaning of
1964** this value is as follows:
1965**
1966** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001967** is smaller than pKey or if the table is empty
1968** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001969**
1970** *pRes==0 The cursor is left pointing at an entry that
1971** exactly matches pKey.
1972**
1973** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001974** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001975*/
drh4a1c3802004-05-12 15:15:47 +00001976int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001977 int rc;
drhc39e0002004-05-07 23:50:57 +00001978
1979 if( pCur->status ){
1980 return pCur->status;
1981 }
drh5e2f8b92001-05-28 00:41:15 +00001982 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001983 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001984 assert( pCur->pPage );
1985 assert( pCur->pPage->isInit );
1986 if( pCur->isValid==0 ){
drhf328bc82004-05-10 23:29:49 +00001987 *pRes = -1;
drhc39e0002004-05-07 23:50:57 +00001988 assert( pCur->pPage->nCell==0 );
1989 return SQLITE_OK;
1990 }
drh72f82862001-05-24 21:06:34 +00001991 for(;;){
1992 int lwr, upr;
1993 Pgno chldPg;
1994 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001995 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001996 lwr = 0;
1997 upr = pPage->nCell-1;
drhda200cc2004-05-09 11:51:38 +00001998 pageIntegrity(pPage);
drh72f82862001-05-24 21:06:34 +00001999 while( lwr<=upr ){
drh0e1c19e2004-05-11 00:58:56 +00002000 const void *pCellKey;
drh4a1c3802004-05-12 15:15:47 +00002001 i64 nCellKey;
drh72f82862001-05-24 21:06:34 +00002002 pCur->idx = (lwr+upr)/2;
drhde647132004-05-07 17:57:49 +00002003 sqlite3BtreeKeySize(pCur, &nCellKey);
drh3aac2dd2004-04-26 14:10:20 +00002004 if( pPage->intKey ){
2005 if( nCellKey<nKey ){
2006 c = -1;
2007 }else if( nCellKey>nKey ){
2008 c = +1;
2009 }else{
2010 c = 0;
2011 }
drh0e1c19e2004-05-11 00:58:56 +00002012 }else if( (pCellKey = sqlite3BtreeKeyFetch(pCur, nCellKey))!=0 ){
drh3aac2dd2004-04-26 14:10:20 +00002013 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2014 }else{
drh0e1c19e2004-05-11 00:58:56 +00002015 u8 *pCellKey = sqliteMalloc( nCellKey );
drh3aac2dd2004-04-26 14:10:20 +00002016 if( pCellKey==0 ) return SQLITE_NOMEM;
2017 rc = sqlite3BtreeKey(pCur, 0, nCellKey, pCellKey);
2018 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
2019 sqliteFree(pCellKey);
2020 if( rc ) return rc;
2021 }
drh72f82862001-05-24 21:06:34 +00002022 if( c==0 ){
drh8b18dd42004-05-12 19:18:15 +00002023 if( pPage->leafData && !pPage->leaf ){
drhfc70e6f2004-05-12 21:11:27 +00002024 lwr = pCur->idx;
2025 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00002026 break;
2027 }else{
2028 pCur->iMatch = c;
2029 if( pRes ) *pRes = 0;
2030 return SQLITE_OK;
2031 }
drh72f82862001-05-24 21:06:34 +00002032 }
2033 if( c<0 ){
2034 lwr = pCur->idx+1;
2035 }else{
2036 upr = pCur->idx-1;
2037 }
2038 }
2039 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00002040 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00002041 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00002042 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00002043 }else if( lwr>=pPage->nCell ){
2044 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh72f82862001-05-24 21:06:34 +00002045 }else{
drh3aac2dd2004-04-26 14:10:20 +00002046 chldPg = get4byte(&pPage->aCell[lwr][2]);
drh72f82862001-05-24 21:06:34 +00002047 }
2048 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00002049 pCur->iMatch = c;
drhc39e0002004-05-07 23:50:57 +00002050 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh72f82862001-05-24 21:06:34 +00002051 if( pRes ) *pRes = c;
2052 return SQLITE_OK;
2053 }
drh428ae8c2003-01-04 16:48:09 +00002054 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00002055 rc = moveToChild(pCur, chldPg);
drhc39e0002004-05-07 23:50:57 +00002056 if( rc ){
2057 return rc;
2058 }
drh72f82862001-05-24 21:06:34 +00002059 }
drhbd03cae2001-06-02 02:40:57 +00002060 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00002061}
2062
2063/*
drhc39e0002004-05-07 23:50:57 +00002064** Return TRUE if the cursor is not pointing at an entry of the table.
2065**
2066** TRUE will be returned after a call to sqlite3BtreeNext() moves
2067** past the last entry in the table or sqlite3BtreePrev() moves past
2068** the first entry. TRUE is also returned if the table is empty.
2069*/
2070int sqlite3BtreeEof(BtCursor *pCur){
2071 return pCur->isValid==0;
2072}
2073
2074/*
drhbd03cae2001-06-02 02:40:57 +00002075** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00002076** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00002077** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00002078** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00002079*/
drh3aac2dd2004-04-26 14:10:20 +00002080int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00002081 int rc;
drh8178a752003-01-05 21:41:40 +00002082 MemPage *pPage = pCur->pPage;
drh8b18dd42004-05-12 19:18:15 +00002083
drh8c1238a2003-01-02 14:43:55 +00002084 assert( pRes!=0 );
drhc39e0002004-05-07 23:50:57 +00002085 if( pCur->isValid==0 ){
drh8c1238a2003-01-02 14:43:55 +00002086 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00002087 return SQLITE_OK;
drhecdc7532001-09-23 02:35:53 +00002088 }
drh8178a752003-01-05 21:41:40 +00002089 assert( pPage->isInit );
drh8178a752003-01-05 21:41:40 +00002090 assert( pCur->idx<pPage->nCell );
drh72f82862001-05-24 21:06:34 +00002091 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00002092 if( pCur->idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00002093 if( !pPage->leaf ){
2094 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+6]));
drh5e2f8b92001-05-28 00:41:15 +00002095 if( rc ) return rc;
2096 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00002097 *pRes = 0;
2098 return rc;
drh72f82862001-05-24 21:06:34 +00002099 }
drh5e2f8b92001-05-28 00:41:15 +00002100 do{
drh8856d6a2004-04-29 14:42:46 +00002101 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00002102 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00002103 pCur->isValid = 0;
drh5e2f8b92001-05-28 00:41:15 +00002104 return SQLITE_OK;
2105 }
drh8178a752003-01-05 21:41:40 +00002106 moveToParent(pCur);
2107 pPage = pCur->pPage;
2108 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00002109 *pRes = 0;
drh8b18dd42004-05-12 19:18:15 +00002110 if( pPage->leafData ){
2111 rc = sqlite3BtreeNext(pCur, pRes);
2112 }else{
2113 rc = SQLITE_OK;
2114 }
2115 return rc;
drh8178a752003-01-05 21:41:40 +00002116 }
2117 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00002118 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00002119 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00002120 }
drh5e2f8b92001-05-28 00:41:15 +00002121 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00002122 return rc;
drh72f82862001-05-24 21:06:34 +00002123}
2124
drh3b7511c2001-05-26 13:15:44 +00002125/*
drh2dcc9aa2002-12-04 13:40:25 +00002126** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00002127** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00002128** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00002129** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00002130*/
drh3aac2dd2004-04-26 14:10:20 +00002131int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00002132 int rc;
2133 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00002134 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00002135 if( pCur->isValid==0 ){
2136 *pRes = 1;
2137 return SQLITE_OK;
2138 }
drh8178a752003-01-05 21:41:40 +00002139 pPage = pCur->pPage;
drh8178a752003-01-05 21:41:40 +00002140 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00002141 assert( pCur->idx>=0 );
drha34b6762004-05-07 13:30:42 +00002142 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00002143 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00002144 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00002145 if( rc ) return rc;
2146 rc = moveToRightmost(pCur);
2147 }else{
2148 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00002149 if( isRootPage(pPage) ){
drhc39e0002004-05-07 23:50:57 +00002150 pCur->isValid = 0;
2151 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00002152 return SQLITE_OK;
2153 }
drh8178a752003-01-05 21:41:40 +00002154 moveToParent(pCur);
2155 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00002156 }
2157 pCur->idx--;
drh8b18dd42004-05-12 19:18:15 +00002158 if( pPage->leafData ){
2159 rc = sqlite3BtreePrevious(pCur, pRes);
2160 }else{
2161 rc = SQLITE_OK;
2162 }
drh2dcc9aa2002-12-04 13:40:25 +00002163 }
drh8178a752003-01-05 21:41:40 +00002164 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00002165 return rc;
2166}
2167
2168/*
drh3a4c1412004-05-09 20:40:11 +00002169** The TRACE macro will print high-level status information about the
2170** btree operation when the global variable sqlite3_btree_trace is
2171** enabled.
2172*/
2173#if SQLITE_TEST
2174# define TRACE(X) if( sqlite3_btree_trace ){ printf X; fflush(stdout); }
2175#else
2176# define TRACE(X)
2177#endif
2178int sqlite3_btree_trace=0; /* True to enable tracing */
2179
2180/*
drh3b7511c2001-05-26 13:15:44 +00002181** Allocate a new page from the database file.
2182**
drha34b6762004-05-07 13:30:42 +00002183** The new page is marked as dirty. (In other words, sqlite3pager_write()
drh3b7511c2001-05-26 13:15:44 +00002184** has already been called on the new page.) The new page has also
2185** been referenced and the calling routine is responsible for calling
drha34b6762004-05-07 13:30:42 +00002186** sqlite3pager_unref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00002187**
2188** SQLITE_OK is returned on success. Any other return value indicates
2189** an error. *ppPage and *pPgno are undefined in the event of an error.
drha34b6762004-05-07 13:30:42 +00002190** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00002191**
drh199e3cf2002-07-18 11:01:47 +00002192** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2193** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00002194** attempt to keep related pages close to each other in the database file,
2195** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00002196*/
drh199e3cf2002-07-18 11:01:47 +00002197static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drh3aac2dd2004-04-26 14:10:20 +00002198 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00002199 int rc;
drh3aac2dd2004-04-26 14:10:20 +00002200 int n; /* Number of pages on the freelist */
2201 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00002202
drh3aac2dd2004-04-26 14:10:20 +00002203 pPage1 = pBt->pPage1;
2204 n = get4byte(&pPage1->aData[36]);
2205 if( n>0 ){
drh91025292004-05-03 19:49:32 +00002206 /* There are pages on the freelist. Reuse one of those pages. */
drh3aac2dd2004-04-26 14:10:20 +00002207 MemPage *pTrunk;
drha34b6762004-05-07 13:30:42 +00002208 rc = sqlite3pager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00002209 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002210 put4byte(&pPage1->aData[36], n-1);
2211 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002212 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002213 rc = sqlite3pager_write(pTrunk->aData);
drh3b7511c2001-05-26 13:15:44 +00002214 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00002215 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002216 return rc;
2217 }
drh3aac2dd2004-04-26 14:10:20 +00002218 k = get4byte(&pTrunk->aData[4]);
2219 if( k==0 ){
2220 /* The trunk has no leaves. So extract the trunk page itself and
2221 ** use it as the newly allocated page */
drha34b6762004-05-07 13:30:42 +00002222 *pPgno = get4byte(&pPage1->aData[32]);
drh3aac2dd2004-04-26 14:10:20 +00002223 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
2224 *ppPage = pTrunk;
drh3a4c1412004-05-09 20:40:11 +00002225 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh30e58752002-03-02 20:41:57 +00002226 }else{
drh3aac2dd2004-04-26 14:10:20 +00002227 /* Extract a leaf from the trunk */
2228 int closest;
2229 unsigned char *aData = pTrunk->aData;
2230 if( nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00002231 int i, dist;
2232 closest = 0;
drh3aac2dd2004-04-26 14:10:20 +00002233 dist = get4byte(&aData[8]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002234 if( dist<0 ) dist = -dist;
drh3a4c1412004-05-09 20:40:11 +00002235 for(i=1; i<k; i++){
drh3aac2dd2004-04-26 14:10:20 +00002236 int d2 = get4byte(&aData[8+i*4]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002237 if( d2<0 ) d2 = -d2;
2238 if( d2<dist ) closest = i;
2239 }
2240 }else{
2241 closest = 0;
2242 }
drha34b6762004-05-07 13:30:42 +00002243 *pPgno = get4byte(&aData[8+closest*4]);
drh3a4c1412004-05-09 20:40:11 +00002244 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d: %d more free pages\n",
2245 *pPgno, closest+1, k, pTrunk->pgno, n-1));
drh9b171272004-05-08 02:03:22 +00002246 if( closest<k-1 ){
2247 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
2248 }
drh3a4c1412004-05-09 20:40:11 +00002249 put4byte(&aData[4], k-1);
drh3aac2dd2004-04-26 14:10:20 +00002250 rc = getPage(pBt, *pPgno, ppPage);
2251 releasePage(pTrunk);
drh30e58752002-03-02 20:41:57 +00002252 if( rc==SQLITE_OK ){
drh9b171272004-05-08 02:03:22 +00002253 sqlite3pager_dont_rollback((*ppPage)->aData);
drha34b6762004-05-07 13:30:42 +00002254 rc = sqlite3pager_write((*ppPage)->aData);
drh30e58752002-03-02 20:41:57 +00002255 }
2256 }
drh3b7511c2001-05-26 13:15:44 +00002257 }else{
drh3aac2dd2004-04-26 14:10:20 +00002258 /* There are no pages on the freelist, so create a new page at the
2259 ** end of the file */
drha34b6762004-05-07 13:30:42 +00002260 *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
drh3aac2dd2004-04-26 14:10:20 +00002261 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00002262 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002263 rc = sqlite3pager_write((*ppPage)->aData);
drh3a4c1412004-05-09 20:40:11 +00002264 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00002265 }
2266 return rc;
2267}
2268
2269/*
drh3aac2dd2004-04-26 14:10:20 +00002270** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00002271**
drha34b6762004-05-07 13:30:42 +00002272** sqlite3pager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00002273*/
drh3aac2dd2004-04-26 14:10:20 +00002274static int freePage(MemPage *pPage){
2275 Btree *pBt = pPage->pBt;
2276 MemPage *pPage1 = pBt->pPage1;
2277 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00002278
drh3aac2dd2004-04-26 14:10:20 +00002279 /* Prepare the page for freeing */
2280 assert( pPage->pgno>1 );
2281 pPage->isInit = 0;
2282 releasePage(pPage->pParent);
2283 pPage->pParent = 0;
2284
drha34b6762004-05-07 13:30:42 +00002285 /* Increment the free page count on pPage1 */
2286 rc = sqlite3pager_write(pPage1->aData);
drh3aac2dd2004-04-26 14:10:20 +00002287 if( rc ) return rc;
2288 n = get4byte(&pPage1->aData[36]);
2289 put4byte(&pPage1->aData[36], n+1);
2290
2291 if( n==0 ){
2292 /* This is the first free page */
drhda200cc2004-05-09 11:51:38 +00002293 rc = sqlite3pager_write(pPage->aData);
2294 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002295 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00002296 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002297 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002298 }else{
2299 /* Other free pages already exist. Retrive the first trunk page
2300 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00002301 MemPage *pTrunk;
2302 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002303 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002304 k = get4byte(&pTrunk->aData[4]);
drhb6f41482004-05-14 01:58:11 +00002305 if( k==pBt->usableSize/4 - 8 ){
drh3aac2dd2004-04-26 14:10:20 +00002306 /* The trunk is full. Turn the page being freed into a new
2307 ** trunk page with no leaves. */
drha34b6762004-05-07 13:30:42 +00002308 rc = sqlite3pager_write(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00002309 if( rc ) return rc;
2310 put4byte(pPage->aData, pTrunk->pgno);
2311 put4byte(&pPage->aData[4], 0);
2312 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002313 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
2314 pPage->pgno, pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002315 }else{
2316 /* Add the newly freed page as a leaf on the current trunk */
drha34b6762004-05-07 13:30:42 +00002317 rc = sqlite3pager_write(pTrunk->aData);
drh3aac2dd2004-04-26 14:10:20 +00002318 if( rc ) return rc;
2319 put4byte(&pTrunk->aData[4], k+1);
2320 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drha34b6762004-05-07 13:30:42 +00002321 sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002322 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002323 }
2324 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002325 }
drh3b7511c2001-05-26 13:15:44 +00002326 return rc;
2327}
2328
2329/*
drh3aac2dd2004-04-26 14:10:20 +00002330** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00002331*/
drh3aac2dd2004-04-26 14:10:20 +00002332static int clearCell(MemPage *pPage, unsigned char *pCell){
2333 Btree *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00002334 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00002335 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00002336 int rc;
drh3b7511c2001-05-26 13:15:44 +00002337
drh6f11bef2004-05-13 01:12:56 +00002338 parseCell(pPage, pCell, &info);
2339 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00002340 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00002341 }
drh6f11bef2004-05-13 01:12:56 +00002342 ovflPgno = get4byte(&pCell[info.iOverflow]);
drh3aac2dd2004-04-26 14:10:20 +00002343 while( ovflPgno!=0 ){
2344 MemPage *pOvfl;
2345 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00002346 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002347 ovflPgno = get4byte(pOvfl->aData);
drha34b6762004-05-07 13:30:42 +00002348 rc = freePage(pOvfl);
drhbd03cae2001-06-02 02:40:57 +00002349 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002350 sqlite3pager_unref(pOvfl->aData);
drh3b7511c2001-05-26 13:15:44 +00002351 }
drh5e2f8b92001-05-28 00:41:15 +00002352 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00002353}
2354
2355/*
drh91025292004-05-03 19:49:32 +00002356** Create the byte sequence used to represent a cell on page pPage
2357** and write that byte sequence into pCell[]. Overflow pages are
2358** allocated and filled in as necessary. The calling procedure
2359** is responsible for making sure sufficient space has been allocated
2360** for pCell[].
2361**
2362** Note that pCell does not necessary need to point to the pPage->aData
2363** area. pCell might point to some temporary storage. The cell will
2364** be constructed in this temporary area then copied into pPage->aData
2365** later.
drh3b7511c2001-05-26 13:15:44 +00002366*/
2367static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00002368 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00002369 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00002370 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00002371 const void *pData,int nData, /* The data */
2372 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00002373){
drh3b7511c2001-05-26 13:15:44 +00002374 int nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002375 const void *pSrc;
drha34b6762004-05-07 13:30:42 +00002376 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00002377 int spaceLeft;
2378 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00002379 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00002380 unsigned char *pPrior;
2381 unsigned char *pPayload;
2382 Btree *pBt = pPage->pBt;
2383 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00002384 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00002385 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00002386
drh91025292004-05-03 19:49:32 +00002387 /* Fill in the header. */
2388 nHeader = 2;
2389 if( !pPage->leaf ){
2390 nHeader += 4;
2391 }
drh8b18dd42004-05-12 19:18:15 +00002392 if( pPage->hasData ){
drh91025292004-05-03 19:49:32 +00002393 nHeader += putVarint(&pCell[nHeader], nData);
drh6f11bef2004-05-13 01:12:56 +00002394 }else{
drh91025292004-05-03 19:49:32 +00002395 nData = 0;
2396 }
drh6f11bef2004-05-13 01:12:56 +00002397 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
2398 parseCell(pPage, pCell, &info);
2399 assert( info.nHeader==nHeader );
2400 assert( info.nKey==nKey );
2401 assert( info.nData==nData );
2402
2403 /* Fill in the payload */
drh3aac2dd2004-04-26 14:10:20 +00002404 nPayload = nData;
2405 if( pPage->intKey ){
2406 pSrc = pData;
2407 nSrc = nData;
drh91025292004-05-03 19:49:32 +00002408 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00002409 }else{
2410 nPayload += nKey;
2411 pSrc = pKey;
2412 nSrc = nKey;
2413 }
drh6f11bef2004-05-13 01:12:56 +00002414 *pnSize = info.nSize;
2415 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00002416 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00002417 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00002418
drh3b7511c2001-05-26 13:15:44 +00002419 while( nPayload>0 ){
2420 if( spaceLeft==0 ){
drh3aac2dd2004-04-26 14:10:20 +00002421 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
drh3b7511c2001-05-26 13:15:44 +00002422 if( rc ){
drh9b171272004-05-08 02:03:22 +00002423 releasePage(pToRelease);
drh3aac2dd2004-04-26 14:10:20 +00002424 clearCell(pPage, pCell);
drh3b7511c2001-05-26 13:15:44 +00002425 return rc;
2426 }
drh3aac2dd2004-04-26 14:10:20 +00002427 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00002428 releasePage(pToRelease);
2429 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00002430 pPrior = pOvfl->aData;
2431 put4byte(pPrior, 0);
2432 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00002433 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00002434 }
2435 n = nPayload;
2436 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00002437 if( n>nSrc ) n = nSrc;
2438 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00002439 nPayload -= n;
drhde647132004-05-07 17:57:49 +00002440 pPayload += n;
drh9b171272004-05-08 02:03:22 +00002441 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00002442 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00002443 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00002444 if( nSrc==0 ){
2445 nSrc = nData;
2446 pSrc = pData;
2447 }
drhdd793422001-06-28 01:54:48 +00002448 }
drh9b171272004-05-08 02:03:22 +00002449 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00002450 return SQLITE_OK;
2451}
2452
2453/*
drhbd03cae2001-06-02 02:40:57 +00002454** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00002455** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00002456** pointer in the third argument.
2457*/
drh4b70f112004-05-02 21:12:19 +00002458static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00002459 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00002460 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00002461
drhdd793422001-06-28 01:54:48 +00002462 if( pgno==0 ) return;
drh4b70f112004-05-02 21:12:19 +00002463 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00002464 aData = sqlite3pager_lookup(pBt->pPager, pgno);
drhda200cc2004-05-09 11:51:38 +00002465 if( aData ){
drhb6f41482004-05-14 01:58:11 +00002466 pThis = (MemPage*)&aData[pBt->usableSize];
drhda200cc2004-05-09 11:51:38 +00002467 if( pThis->isInit ){
2468 if( pThis->pParent!=pNewParent ){
2469 if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
2470 pThis->pParent = pNewParent;
2471 if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
2472 }
2473 pThis->idxParent = idx;
drhdd793422001-06-28 01:54:48 +00002474 }
drha34b6762004-05-07 13:30:42 +00002475 sqlite3pager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00002476 }
2477}
2478
2479/*
drh4b70f112004-05-02 21:12:19 +00002480** Change the pParent pointer of all children of pPage to point back
2481** to pPage.
2482**
drhbd03cae2001-06-02 02:40:57 +00002483** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00002484** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00002485**
2486** This routine gets called after you memcpy() one page into
2487** another.
2488*/
drh4b70f112004-05-02 21:12:19 +00002489static void reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002490 int i;
drh4b70f112004-05-02 21:12:19 +00002491 Btree *pBt;
2492
drha34b6762004-05-07 13:30:42 +00002493 if( pPage->leaf ) return;
drh4b70f112004-05-02 21:12:19 +00002494 pBt = pPage->pBt;
drhbd03cae2001-06-02 02:40:57 +00002495 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002496 reparentPage(pBt, get4byte(&pPage->aCell[i][2]), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002497 }
drh4b70f112004-05-02 21:12:19 +00002498 reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+6]), pPage, i);
drh428ae8c2003-01-04 16:48:09 +00002499 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002500}
2501
2502/*
2503** Remove the i-th cell from pPage. This routine effects pPage only.
2504** The cell content is not freed or deallocated. It is assumed that
2505** the cell content has been copied someplace else. This routine just
2506** removes the reference to the cell from pPage.
2507**
2508** "sz" must be the number of bytes in the cell.
2509**
drhda200cc2004-05-09 11:51:38 +00002510** Try to maintain the integrity of the linked list of cells. But if
2511** the cell being inserted does not fit on the page, this will not be
2512** possible. If the linked list is not maintained, then just update
2513** pPage->aCell[] and set the pPage->needRelink flag so that we will
2514** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002515*/
drh4b70f112004-05-02 21:12:19 +00002516static void dropCell(MemPage *pPage, int idx, int sz){
drhde647132004-05-07 17:57:49 +00002517 int j, pc;
drhda200cc2004-05-09 11:51:38 +00002518 u8 *data;
drh8c42ca92001-06-22 19:15:00 +00002519 assert( idx>=0 && idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002520 assert( sz==cellSize(pPage, pPage->aCell[idx]) );
drha34b6762004-05-07 13:30:42 +00002521 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002522 assert( pPage->aCell[idx]>=pPage->aData );
drhb6f41482004-05-14 01:58:11 +00002523 assert( pPage->aCell[idx]<=&pPage->aData[pPage->pBt->usableSize-sz] );
drhda200cc2004-05-09 11:51:38 +00002524 data = pPage->aData;
2525 pc = Addr(pPage->aCell[idx]) - Addr(data);
drhb6f41482004-05-14 01:58:11 +00002526 assert( pc>pPage->hdrOffset && pc+sz<=pPage->pBt->usableSize );
drhde647132004-05-07 17:57:49 +00002527 freeSpace(pPage, pc, sz);
drh7c717f72001-06-24 20:39:41 +00002528 for(j=idx; j<pPage->nCell-1; j++){
drh4b70f112004-05-02 21:12:19 +00002529 pPage->aCell[j] = pPage->aCell[j+1];
drh14acc042001-06-10 19:56:58 +00002530 }
2531 pPage->nCell--;
drhda200cc2004-05-09 11:51:38 +00002532 if( !pPage->isOverfull && !pPage->needRelink ){
2533 u8 *pPrev;
2534 if( idx==0 ){
2535 pPrev = &data[pPage->hdrOffset+3];
2536 }else{
2537 pPrev = pPage->aCell[idx-1];
2538 }
2539 if( idx<pPage->nCell ){
2540 pc = Addr(pPage->aCell[idx]) - Addr(data);
2541 }else{
2542 pc = 0;
2543 }
2544 put2byte(pPrev, pc);
2545 pageIntegrity(pPage);
2546 }else{
2547 pPage->needRelink = 1;
2548 }
drh428ae8c2003-01-04 16:48:09 +00002549 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002550}
2551
2552/*
2553** Insert a new cell on pPage at cell index "i". pCell points to the
2554** content of the cell.
2555**
2556** If the cell content will fit on the page, then put it there. If it
drh24cd67e2004-05-10 16:18:47 +00002557** will not fit and pTemp is not NULL, then make a copy of the content
2558** into pTemp, set pPage->aCell[i] point to pTemp, and set pPage->isOverfull.
2559** If the content will not fit and pTemp is NULL, then make pPage->aCell[i]
2560** point to pCell and set pPage->isOverfull.
drh14acc042001-06-10 19:56:58 +00002561**
drhda200cc2004-05-09 11:51:38 +00002562** Try to maintain the integrity of the linked list of cells. But if
2563** the cell being inserted does not fit on the page, this will not be
2564** possible. If the linked list is not maintained, then just update
2565** pPage->aCell[] and set the pPage->needRelink flag so that we will
2566** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002567*/
drh24cd67e2004-05-10 16:18:47 +00002568static void insertCell(
2569 MemPage *pPage, /* Page into which we are copying */
2570 int i, /* Which cell on pPage to insert after */
2571 u8 *pCell, /* Text of the new cell to insert */
2572 int sz, /* Bytes of data in pCell */
2573 u8 *pTemp /* Temp storage space for pCell, if needed */
2574){
drh14acc042001-06-10 19:56:58 +00002575 int idx, j;
2576 assert( i>=0 && i<=pPage->nCell );
drha34b6762004-05-07 13:30:42 +00002577 assert( sz==cellSize(pPage, pCell) );
2578 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002579 idx = pPage->needRelink ? 0 : allocateSpace(pPage, sz);
drh4b70f112004-05-02 21:12:19 +00002580 resizeCellArray(pPage, pPage->nCell+1);
drh14acc042001-06-10 19:56:58 +00002581 for(j=pPage->nCell; j>i; j--){
drh4b70f112004-05-02 21:12:19 +00002582 pPage->aCell[j] = pPage->aCell[j-1];
drh14acc042001-06-10 19:56:58 +00002583 }
2584 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002585 if( idx<=0 ){
2586 pPage->isOverfull = 1;
drh24cd67e2004-05-10 16:18:47 +00002587 if( pTemp ){
2588 memcpy(pTemp, pCell, sz);
2589 }else{
2590 pTemp = pCell;
2591 }
2592 pPage->aCell[i] = pTemp;
drh14acc042001-06-10 19:56:58 +00002593 }else{
drhda200cc2004-05-09 11:51:38 +00002594 u8 *data = pPage->aData;
2595 memcpy(&data[idx], pCell, sz);
2596 pPage->aCell[i] = &data[idx];
2597 }
2598 if( !pPage->isOverfull && !pPage->needRelink ){
2599 u8 *pPrev;
2600 int pc;
2601 if( i==0 ){
2602 pPrev = &pPage->aData[pPage->hdrOffset+3];
2603 }else{
2604 pPrev = pPage->aCell[i-1];
2605 }
2606 pc = get2byte(pPrev);
2607 put2byte(pPrev, idx);
2608 put2byte(pPage->aCell[i], pc);
2609 pageIntegrity(pPage);
2610 }else{
2611 pPage->needRelink = 1;
drh14acc042001-06-10 19:56:58 +00002612 }
drh428ae8c2003-01-04 16:48:09 +00002613 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002614}
2615
2616/*
2617** Rebuild the linked list of cells on a page so that the cells
drh4b70f112004-05-02 21:12:19 +00002618** occur in the order specified by the pPage->aCell[] array.
drh8c42ca92001-06-22 19:15:00 +00002619** Invoke this routine once to repair damage after one or more
2620** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002621*/
drh4b70f112004-05-02 21:12:19 +00002622static void relinkCellList(MemPage *pPage){
2623 int i, idxFrom;
drha34b6762004-05-07 13:30:42 +00002624 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002625 if( !pPage->needRelink ) return;
drh4b70f112004-05-02 21:12:19 +00002626 idxFrom = pPage->hdrOffset+3;
drh14acc042001-06-10 19:56:58 +00002627 for(i=0; i<pPage->nCell; i++){
drhde647132004-05-07 17:57:49 +00002628 int idx = Addr(pPage->aCell[i]) - Addr(pPage->aData);
drhb6f41482004-05-14 01:58:11 +00002629 assert( idx>pPage->hdrOffset && idx<pPage->pBt->usableSize );
drh4b70f112004-05-02 21:12:19 +00002630 put2byte(&pPage->aData[idxFrom], idx);
2631 idxFrom = idx;
drh14acc042001-06-10 19:56:58 +00002632 }
drh4b70f112004-05-02 21:12:19 +00002633 put2byte(&pPage->aData[idxFrom], 0);
drhda200cc2004-05-09 11:51:38 +00002634 pPage->needRelink = 0;
drh14acc042001-06-10 19:56:58 +00002635}
2636
2637/*
drhc8629a12004-05-08 20:07:40 +00002638** GCC does not define the offsetof() macro so we'll have to do it
2639** ourselves.
2640*/
2641#ifndef offsetof
2642#define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
2643#endif
2644
2645/*
drh91025292004-05-03 19:49:32 +00002646** Move the content of the page at pFrom over to pTo. The pFrom->aCell[]
drh4b70f112004-05-02 21:12:19 +00002647** pointers that point into pFrom->aData[] must be adjusted to point
2648** into pTo->aData[] instead. But some pFrom->aCell[] entries might
2649** not point to pFrom->aData[]. Those are unchanged.
drh91025292004-05-03 19:49:32 +00002650**
2651** Over this operation completes, the meta data for pFrom is zeroed.
drh14acc042001-06-10 19:56:58 +00002652*/
drhda200cc2004-05-09 11:51:38 +00002653static void movePage(MemPage *pTo, MemPage *pFrom){
drh14acc042001-06-10 19:56:58 +00002654 uptr from, to;
2655 int i;
drhb6f41482004-05-14 01:58:11 +00002656 int usableSize;
drh4b70f112004-05-02 21:12:19 +00002657 int ofst;
2658
2659 assert( pTo->hdrOffset==0 );
drh3a4c1412004-05-09 20:40:11 +00002660 assert( pFrom->isInit );
drh4b70f112004-05-02 21:12:19 +00002661 ofst = pFrom->hdrOffset;
drhb6f41482004-05-14 01:58:11 +00002662 usableSize = pFrom->pBt->usableSize;
drh91025292004-05-03 19:49:32 +00002663 sqliteFree(pTo->aCell);
drhb6f41482004-05-14 01:58:11 +00002664 memcpy(pTo->aData, &pFrom->aData[ofst], usableSize - ofst);
drhc8629a12004-05-08 20:07:40 +00002665 memcpy(pTo, pFrom, offsetof(MemPage, aData));
2666 pFrom->isInit = 0;
2667 pFrom->aCell = 0;
drh4b70f112004-05-02 21:12:19 +00002668 assert( pTo->aData[5]<155 );
2669 pTo->aData[5] += ofst;
drh14acc042001-06-10 19:56:58 +00002670 pTo->isOverfull = pFrom->isOverfull;
drh4b70f112004-05-02 21:12:19 +00002671 to = Addr(pTo->aData);
drh91025292004-05-03 19:49:32 +00002672 from = Addr(&pFrom->aData[ofst]);
drh14acc042001-06-10 19:56:58 +00002673 for(i=0; i<pTo->nCell; i++){
drh91025292004-05-03 19:49:32 +00002674 uptr x = Addr(pTo->aCell[i]);
drhb6f41482004-05-14 01:58:11 +00002675 if( x>from && x<from+usableSize-ofst ){
drh4b70f112004-05-02 21:12:19 +00002676 *((uptr*)&pTo->aCell[i]) = x + to - from;
drh14acc042001-06-10 19:56:58 +00002677 }
2678 }
drhbd03cae2001-06-02 02:40:57 +00002679}
2680
2681/*
drhc3b70572003-01-04 19:44:07 +00002682** The following parameters determine how many adjacent pages get involved
2683** in a balancing operation. NN is the number of neighbors on either side
2684** of the page that participate in the balancing operation. NB is the
2685** total number of pages that participate, including the target page and
2686** NN neighbors on either side.
2687**
2688** The minimum value of NN is 1 (of course). Increasing NN above 1
2689** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2690** in exchange for a larger degradation in INSERT and UPDATE performance.
2691** The value of NN appears to give the best results overall.
2692*/
2693#define NN 1 /* Number of neighbors on either side of pPage */
2694#define NB (NN*2+1) /* Total pages involved in the balance */
2695
2696/*
drh8b2f49b2001-06-08 00:21:52 +00002697** This routine redistributes Cells on pPage and up to two siblings
2698** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002699** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002700** though both siblings might come from one side if pPage is the first
2701** or last child of its parent. If pPage has fewer than two siblings
2702** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002703** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002704**
2705** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002706** one in an effort to keep pages between 66% and 100% full. The root page
2707** is special and is allowed to be less than 66% full. If pPage is
2708** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002709** or decreased by one, as necessary, to keep the root page from being
2710** overfull or empty.
2711**
drh4b70f112004-05-02 21:12:19 +00002712** This routine alwyas calls relinkCellList() on its input page regardless of
drh14acc042001-06-10 19:56:58 +00002713** whether or not it does any real balancing. Client routines will typically
2714** invoke insertCell() or dropCell() before calling this routine, so we
2715** need to call relinkCellList() to clean up the mess that those other
2716** routines left behind.
2717**
drh8b2f49b2001-06-08 00:21:52 +00002718** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00002719** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002720** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00002721** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00002722**
drh8c42ca92001-06-22 19:15:00 +00002723** In the course of balancing the siblings of pPage, the parent of pPage
2724** might become overfull or underfull. If that happens, then this routine
2725** is called recursively on the parent.
2726**
drh5e00f6c2001-09-13 13:46:56 +00002727** If this routine fails for any reason, it might leave the database
2728** in a corrupted state. So if this routine fails, the database should
2729** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002730*/
drh4b70f112004-05-02 21:12:19 +00002731static int balance(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00002732 MemPage *pParent; /* The parent of pPage */
drh4b70f112004-05-02 21:12:19 +00002733 Btree *pBt; /* The whole database */
drha34b6762004-05-07 13:30:42 +00002734 int nCell; /* Number of cells in aCell[] */
drh8b2f49b2001-06-08 00:21:52 +00002735 int nOld; /* Number of pages in apOld[] */
2736 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002737 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002738 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00002739 int idx; /* Index of pPage in pParent->aCell[] */
2740 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00002741 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00002742 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00002743 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00002744 int usableSpace; /* Bytes in pPage beyond the header */
2745 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00002746 int subtotal; /* Subtotal of bytes in cells on one page */
drhb6f41482004-05-14 01:58:11 +00002747 int iSpace = 0; /* First unused byte of aSpace[] */
drhda200cc2004-05-09 11:51:38 +00002748 MemPage *extraUnref = 0; /* Unref this page if not zero */
drhc3b70572003-01-04 19:44:07 +00002749 MemPage *apOld[NB]; /* pPage and up to two siblings */
2750 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00002751 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drhc3b70572003-01-04 19:44:07 +00002752 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2753 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2754 int idxDiv[NB]; /* Indices of divider cells in pParent */
drh4b70f112004-05-02 21:12:19 +00002755 u8 *apDiv[NB]; /* Divider cells in pParent */
drha34b6762004-05-07 13:30:42 +00002756 int cntNew[NB+1]; /* Index in aCell[] of cell after i-th page */
drhc3b70572003-01-04 19:44:07 +00002757 int szNew[NB+1]; /* Combined size of cells place on i-th page */
drh4b70f112004-05-02 21:12:19 +00002758 u8 *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
drhc3b70572003-01-04 19:44:07 +00002759 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh4b70f112004-05-02 21:12:19 +00002760 u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)]; /* Space for apCopy[] */
drhb6f41482004-05-14 01:58:11 +00002761 u8 aSpace[MX_PAGE_SIZE*4]; /* Space to copies of divider cells */
drh8b2f49b2001-06-08 00:21:52 +00002762
drh14acc042001-06-10 19:56:58 +00002763 /*
2764 ** Return without doing any work if pPage is neither overfull nor
2765 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002766 */
drh3a4c1412004-05-09 20:40:11 +00002767 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00002768 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002769 pBt = pPage->pBt;
drhb6f41482004-05-14 01:58:11 +00002770 if( !pPage->isOverfull && pPage->nFree<pBt->usableSize*2/3 && pPage->nCell>=2){
drh4b70f112004-05-02 21:12:19 +00002771 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002772 return SQLITE_OK;
2773 }
2774
2775 /*
drh4b70f112004-05-02 21:12:19 +00002776 ** Find the parent of the page to be balanced. If there is no parent,
2777 ** it means this page is the root page and special rules apply.
drh8b2f49b2001-06-08 00:21:52 +00002778 */
drh14acc042001-06-10 19:56:58 +00002779 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002780 if( pParent==0 ){
2781 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002782 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002783 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002784 if( pPage->nCell==0 ){
drh8856d6a2004-04-29 14:42:46 +00002785 if( pPage->leaf ){
2786 /* The table is completely empty */
2787 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002788 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
drh8856d6a2004-04-29 14:42:46 +00002789 }else{
2790 /* The root page is empty but has one child. Transfer the
2791 ** information from that one child into the root page if it
drh3a4c1412004-05-09 20:40:11 +00002792 ** will fit. This reduces the depth of the tree by one.
drh8856d6a2004-04-29 14:42:46 +00002793 **
2794 ** If the root page is page 1, it has less space available than
drh4b70f112004-05-02 21:12:19 +00002795 ** its child (due to the 100 byte header that occurs at the beginning
2796 ** of the database fle), so it might not be able to hold all of the
2797 ** information currently contained in the child. If this is the
2798 ** case, then do not do the transfer. Leave page 1 empty except
2799 ** for the right-pointer to the child page. The child page becomes
2800 ** the virtual root of the tree.
drh8b2f49b2001-06-08 00:21:52 +00002801 */
drha34b6762004-05-07 13:30:42 +00002802 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+6]);
2803 assert( pgnoChild>0 && pgnoChild<=sqlite3pager_pagecount(pBt->pPager) );
drh8856d6a2004-04-29 14:42:46 +00002804 rc = getPage(pBt, pgnoChild, &pChild);
drh8b2f49b2001-06-08 00:21:52 +00002805 if( rc ) return rc;
drh8856d6a2004-04-29 14:42:46 +00002806 if( pPage->pgno==1 ){
drh4b70f112004-05-02 21:12:19 +00002807 rc = initPage(pChild, pPage);
drh8856d6a2004-04-29 14:42:46 +00002808 if( rc ) return rc;
2809 if( pChild->nFree>=100 ){
drh4b70f112004-05-02 21:12:19 +00002810 /* The child information will fit on the root page, so do the
2811 ** copy */
2812 zeroPage(pPage, pChild->aData[0]);
2813 resizeCellArray(pPage, pChild->nCell);
2814 for(i=0; i<pChild->nCell; i++){
2815 insertCell(pPage, i, pChild->aCell[i],
drh24cd67e2004-05-10 16:18:47 +00002816 cellSize(pChild, pChild->aCell[i]), 0);
drh4b70f112004-05-02 21:12:19 +00002817 }
2818 freePage(pChild);
drhda200cc2004-05-09 11:51:38 +00002819 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
drh4b70f112004-05-02 21:12:19 +00002820 }else{
2821 /* The child has more information that will fit on the root.
2822 ** The tree is already balanced. Do nothing. */
drhda200cc2004-05-09 11:51:38 +00002823 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
drh8856d6a2004-04-29 14:42:46 +00002824 }
2825 }else{
drhb6f41482004-05-14 01:58:11 +00002826 memcpy(pPage->aData, pChild->aData, pBt->usableSize);
drh8856d6a2004-04-29 14:42:46 +00002827 pPage->isInit = 0;
drh4b70f112004-05-02 21:12:19 +00002828 pPage->pParent = 0;
2829 rc = initPage(pPage, 0);
drh8856d6a2004-04-29 14:42:46 +00002830 assert( rc==SQLITE_OK );
drh4b70f112004-05-02 21:12:19 +00002831 freePage(pChild);
drh3a4c1412004-05-09 20:40:11 +00002832 TRACE(("BALANCE: transfer child %d into root %d\n",
2833 pChild->pgno, pPage->pgno));
drh5edc3122001-09-13 21:53:09 +00002834 }
drh4b70f112004-05-02 21:12:19 +00002835 reparentChildPages(pPage);
2836 releasePage(pChild);
drh8b2f49b2001-06-08 00:21:52 +00002837 }
2838 return SQLITE_OK;
2839 }
drh14acc042001-06-10 19:56:58 +00002840 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002841 /* It is OK for the root page to be less than half full.
2842 */
drha34b6762004-05-07 13:30:42 +00002843 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002844 TRACE(("BALANCE: root page %d is low - no changes\n", pPage->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002845 return SQLITE_OK;
2846 }
drh14acc042001-06-10 19:56:58 +00002847 /*
2848 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002849 ** When this happens, Create a new child page and copy the
2850 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002851 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002852 ** child. Then fall thru to the code below which will cause
2853 ** the overfull child page to be split.
2854 */
drh4b70f112004-05-02 21:12:19 +00002855 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
drh14acc042001-06-10 19:56:58 +00002856 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002857 assert( sqlite3pager_iswriteable(pChild->aData) );
drhda200cc2004-05-09 11:51:38 +00002858 movePage(pChild, pPage);
drhc8629a12004-05-08 20:07:40 +00002859 assert( pChild->aData[0]==pPage->aData[pPage->hdrOffset] );
drh14acc042001-06-10 19:56:58 +00002860 pChild->pParent = pPage;
drh457f5012004-05-09 01:35:05 +00002861 sqlite3pager_ref(pPage->aData);
drhda200cc2004-05-09 11:51:38 +00002862 pChild->idxParent = 0;
drh14acc042001-06-10 19:56:58 +00002863 pChild->isOverfull = 1;
drhc8629a12004-05-08 20:07:40 +00002864 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
drh4b70f112004-05-02 21:12:19 +00002865 put4byte(&pPage->aData[pPage->hdrOffset+6], pChild->pgno);
drh8b2f49b2001-06-08 00:21:52 +00002866 pParent = pPage;
2867 pPage = pChild;
drhda200cc2004-05-09 11:51:38 +00002868 extraUnref = pChild;
drh3a4c1412004-05-09 20:40:11 +00002869 TRACE(("BALANCE: copy root %d into %d and balance %d\n",
2870 pParent->pgno, pPage->pgno, pPage->pgno));
2871 }else{
2872 TRACE(("BALANCE: begin page %d child of %d\n",
2873 pPage->pgno, pParent->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002874 }
drha34b6762004-05-07 13:30:42 +00002875 rc = sqlite3pager_write(pParent->aData);
drh6019e162001-07-02 17:51:45 +00002876 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002877 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002878
drh8b2f49b2001-06-08 00:21:52 +00002879 /*
drh4b70f112004-05-02 21:12:19 +00002880 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00002881 ** to pPage. The "idx" variable is the index of that cell. If pPage
2882 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002883 */
drhbb49aba2003-01-04 18:53:27 +00002884 if( pParent->idxShift ){
drha34b6762004-05-07 13:30:42 +00002885 Pgno pgno;
drh4b70f112004-05-02 21:12:19 +00002886 pgno = pPage->pgno;
drha34b6762004-05-07 13:30:42 +00002887 assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00002888 for(idx=0; idx<pParent->nCell; idx++){
drha34b6762004-05-07 13:30:42 +00002889 if( get4byte(&pParent->aCell[idx][2])==pgno ){
drhbb49aba2003-01-04 18:53:27 +00002890 break;
2891 }
drh8b2f49b2001-06-08 00:21:52 +00002892 }
drh4b70f112004-05-02 21:12:19 +00002893 assert( idx<pParent->nCell
2894 || get4byte(&pParent->aData[pParent->hdrOffset+6])==pgno );
drhbb49aba2003-01-04 18:53:27 +00002895 }else{
2896 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002897 }
drh8b2f49b2001-06-08 00:21:52 +00002898
2899 /*
drh14acc042001-06-10 19:56:58 +00002900 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002901 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002902 */
drh14acc042001-06-10 19:56:58 +00002903 nOld = nNew = 0;
drha34b6762004-05-07 13:30:42 +00002904 sqlite3pager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00002905
2906 /*
drh4b70f112004-05-02 21:12:19 +00002907 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002908 ** the siblings. An attempt is made to find NN siblings on either
2909 ** side of pPage. More siblings are taken from one side, however, if
2910 ** pPage there are fewer than NN siblings on the other side. If pParent
2911 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002912 */
drhc3b70572003-01-04 19:44:07 +00002913 nxDiv = idx - NN;
2914 if( nxDiv + NB > pParent->nCell ){
2915 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002916 }
drhc3b70572003-01-04 19:44:07 +00002917 if( nxDiv<0 ){
2918 nxDiv = 0;
2919 }
drh8b2f49b2001-06-08 00:21:52 +00002920 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002921 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002922 if( k<pParent->nCell ){
2923 idxDiv[i] = k;
drh4b70f112004-05-02 21:12:19 +00002924 apDiv[i] = pParent->aCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002925 nDiv++;
drha34b6762004-05-07 13:30:42 +00002926 assert( !pParent->leaf );
2927 pgnoOld[i] = get4byte(&apDiv[i][2]);
drh14acc042001-06-10 19:56:58 +00002928 }else if( k==pParent->nCell ){
drh4b70f112004-05-02 21:12:19 +00002929 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+6]);
drh14acc042001-06-10 19:56:58 +00002930 }else{
2931 break;
drh8b2f49b2001-06-08 00:21:52 +00002932 }
drhde647132004-05-07 17:57:49 +00002933 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002934 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002935 apOld[i]->idxParent = k;
drh91025292004-05-03 19:49:32 +00002936 apCopy[i] = 0;
2937 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00002938 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002939 }
2940
2941 /*
drh14acc042001-06-10 19:56:58 +00002942 ** Make copies of the content of pPage and its siblings into aOld[].
2943 ** The rest of this function will use data from the copies rather
2944 ** that the original pages since the original pages will be in the
2945 ** process of being overwritten.
2946 */
2947 for(i=0; i<nOld; i++){
drhc8629a12004-05-08 20:07:40 +00002948 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
drhb6f41482004-05-14 01:58:11 +00002949 p->aData = &((u8*)p)[-pBt->usableSize];
drhda200cc2004-05-09 11:51:38 +00002950 p->aCell = 0;
2951 p->hdrOffset = 0;
2952 movePage(p, apOld[i]);
drh14acc042001-06-10 19:56:58 +00002953 }
2954
2955 /*
2956 ** Load pointers to all cells on sibling pages and the divider cells
2957 ** into the local apCell[] array. Make copies of the divider cells
drhb6f41482004-05-14 01:58:11 +00002958 ** into space obtained form aSpace[] and remove the the divider Cells
2959 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00002960 **
2961 ** If the siblings are on leaf pages, then the child pointers of the
2962 ** divider cells are stripped from the cells before they are copied
drhb6f41482004-05-14 01:58:11 +00002963 ** into aSpace[]. In this wall, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00002964 ** child pointers. If siblings are not leaves, then all cell in
2965 ** apCell[] include child pointers. Either way, all cells in apCell[]
2966 ** are alike.
drh8b2f49b2001-06-08 00:21:52 +00002967 */
2968 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00002969 leafCorrection = pPage->leaf*4;
drh8b18dd42004-05-12 19:18:15 +00002970 leafData = pPage->leafData && pPage->leaf;
drh8b2f49b2001-06-08 00:21:52 +00002971 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002972 MemPage *pOld = apCopy[i];
drh8b2f49b2001-06-08 00:21:52 +00002973 for(j=0; j<pOld->nCell; j++){
drh4b70f112004-05-02 21:12:19 +00002974 apCell[nCell] = pOld->aCell[j];
2975 szCell[nCell] = cellSize(pOld, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002976 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002977 }
2978 if( i<nOld-1 ){
drhb6f41482004-05-14 01:58:11 +00002979 int sz = cellSize(pParent, apDiv[i]);
drh8b18dd42004-05-12 19:18:15 +00002980 if( leafData ){
drh8b18dd42004-05-12 19:18:15 +00002981 dropCell(pParent, nxDiv, sz);
drh4b70f112004-05-02 21:12:19 +00002982 }else{
drhb6f41482004-05-14 01:58:11 +00002983 u8 *pTemp;
2984 szCell[nCell] = sz;
2985 pTemp = &aSpace[iSpace];
2986 iSpace += sz;
2987 assert( iSpace<=sizeof(aSpace) );
2988 memcpy(pTemp, apDiv[i], sz);
2989 apCell[nCell] = pTemp+leafCorrection;
2990 dropCell(pParent, nxDiv, sz);
drh8b18dd42004-05-12 19:18:15 +00002991 szCell[nCell] -= leafCorrection;
drhb6f41482004-05-14 01:58:11 +00002992 assert( get4byte(pTemp+2)==pgnoOld[i] );
drh8b18dd42004-05-12 19:18:15 +00002993 if( !pOld->leaf ){
2994 assert( leafCorrection==0 );
2995 /* The right pointer of the child page pOld becomes the left
2996 ** pointer of the divider cell */
2997 memcpy(&apCell[nCell][2], &pOld->aData[pOld->hdrOffset+6], 4);
2998 }else{
2999 assert( leafCorrection==4 );
3000 }
3001 nCell++;
drh4b70f112004-05-02 21:12:19 +00003002 }
drh8b2f49b2001-06-08 00:21:52 +00003003 }
3004 }
3005
3006 /*
drh6019e162001-07-02 17:51:45 +00003007 ** Figure out the number of pages needed to hold all nCell cells.
3008 ** Store this number in "k". Also compute szNew[] which is the total
3009 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00003010 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00003011 ** cntNew[k] should equal nCell.
3012 **
3013 ** This little patch of code is critical for keeping the tree
3014 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00003015 */
drhb6f41482004-05-14 01:58:11 +00003016 usableSpace = pBt->usableSize - 10 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00003017 for(subtotal=k=i=0; i<nCell; i++){
3018 subtotal += szCell[i];
drh4b70f112004-05-02 21:12:19 +00003019 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00003020 szNew[k] = subtotal - szCell[i];
3021 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00003022 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00003023 subtotal = 0;
3024 k++;
3025 }
3026 }
3027 szNew[k] = subtotal;
3028 cntNew[k] = nCell;
3029 k++;
3030 for(i=k-1; i>0; i--){
drh4b70f112004-05-02 21:12:19 +00003031 while( szNew[i]<usableSpace/2 ){
drh6019e162001-07-02 17:51:45 +00003032 cntNew[i-1]--;
3033 assert( cntNew[i-1]>0 );
3034 szNew[i] += szCell[cntNew[i-1]];
3035 szNew[i-1] -= szCell[cntNew[i-1]-1];
3036 }
3037 }
3038 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00003039
3040 /*
drh6b308672002-07-08 02:16:37 +00003041 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00003042 */
drh4b70f112004-05-02 21:12:19 +00003043 assert( pPage->pgno>1 );
3044 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00003045 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00003046 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00003047 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00003048 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00003049 pgnoNew[i] = pgnoOld[i];
3050 apOld[i] = 0;
drhda200cc2004-05-09 11:51:38 +00003051 sqlite3pager_write(pNew->aData);
drh6b308672002-07-08 02:16:37 +00003052 }else{
drhda200cc2004-05-09 11:51:38 +00003053 rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00003054 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00003055 apNew[i] = pNew;
drh6b308672002-07-08 02:16:37 +00003056 }
drh14acc042001-06-10 19:56:58 +00003057 nNew++;
drhda200cc2004-05-09 11:51:38 +00003058 zeroPage(pNew, pageFlags);
drh8b2f49b2001-06-08 00:21:52 +00003059 }
3060
drh6b308672002-07-08 02:16:37 +00003061 /* Free any old pages that were not reused as new pages.
3062 */
3063 while( i<nOld ){
drh4b70f112004-05-02 21:12:19 +00003064 rc = freePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00003065 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00003066 releasePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00003067 apOld[i] = 0;
3068 i++;
3069 }
3070
drh8b2f49b2001-06-08 00:21:52 +00003071 /*
drhf9ffac92002-03-02 19:00:31 +00003072 ** Put the new pages in accending order. This helps to
3073 ** keep entries in the disk file in order so that a scan
3074 ** of the table is a linear scan through the file. That
3075 ** in turn helps the operating system to deliver pages
3076 ** from the disk more rapidly.
3077 **
3078 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00003079 ** n is never more than NB (a small constant), that should
3080 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00003081 **
drhc3b70572003-01-04 19:44:07 +00003082 ** When NB==3, this one optimization makes the database
3083 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00003084 */
3085 for(i=0; i<k-1; i++){
3086 int minV = pgnoNew[i];
3087 int minI = i;
3088 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00003089 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00003090 minI = j;
3091 minV = pgnoNew[j];
3092 }
3093 }
3094 if( minI>i ){
3095 int t;
3096 MemPage *pT;
3097 t = pgnoNew[i];
3098 pT = apNew[i];
3099 pgnoNew[i] = pgnoNew[minI];
3100 apNew[i] = apNew[minI];
3101 pgnoNew[minI] = t;
3102 apNew[minI] = pT;
3103 }
3104 }
drh24cd67e2004-05-10 16:18:47 +00003105 TRACE(("BALANCE: old: %d %d %d new: %d %d %d %d\n",
3106 pgnoOld[0],
3107 nOld>=2 ? pgnoOld[1] : 0,
3108 nOld>=3 ? pgnoOld[2] : 0,
3109 pgnoNew[0],
3110 nNew>=2 ? pgnoNew[1] : 0,
3111 nNew>=3 ? pgnoNew[2] : 0,
3112 nNew>=4 ? pgnoNew[3] : 0));
3113
drhf9ffac92002-03-02 19:00:31 +00003114
3115 /*
drh14acc042001-06-10 19:56:58 +00003116 ** Evenly distribute the data in apCell[] across the new pages.
3117 ** Insert divider cells into pParent as necessary.
3118 */
3119 j = 0;
3120 for(i=0; i<nNew; i++){
3121 MemPage *pNew = apNew[i];
drh4b70f112004-05-02 21:12:19 +00003122 assert( pNew->pgno==pgnoNew[i] );
3123 resizeCellArray(pNew, cntNew[i] - j);
drh6019e162001-07-02 17:51:45 +00003124 while( j<cntNew[i] ){
3125 assert( pNew->nFree>=szCell[j] );
drh24cd67e2004-05-10 16:18:47 +00003126 insertCell(pNew, pNew->nCell, apCell[j], szCell[j], 0);
drh14acc042001-06-10 19:56:58 +00003127 j++;
3128 }
drh6019e162001-07-02 17:51:45 +00003129 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00003130 assert( !pNew->isOverfull );
drh4b70f112004-05-02 21:12:19 +00003131 relinkCellList(pNew);
drh14acc042001-06-10 19:56:58 +00003132 if( i<nNew-1 && j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00003133 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00003134 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00003135 int sz;
3136 pCell = apCell[j];
3137 sz = szCell[j] + leafCorrection;
drh4b70f112004-05-02 21:12:19 +00003138 if( !pNew->leaf ){
drh24cd67e2004-05-10 16:18:47 +00003139 memcpy(&pNew->aData[6], pCell+2, 4);
3140 pTemp = 0;
drh8b18dd42004-05-12 19:18:15 +00003141 }else if( leafData ){
drh6f11bef2004-05-13 01:12:56 +00003142 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00003143 j--;
drh6f11bef2004-05-13 01:12:56 +00003144 parseCell(pNew, apCell[j], &info);
drhb6f41482004-05-14 01:58:11 +00003145 pCell = &aSpace[iSpace];
drh6f11bef2004-05-13 01:12:56 +00003146 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
drhb6f41482004-05-14 01:58:11 +00003147 iSpace += sz;
3148 assert( iSpace<=sizeof(aSpace) );
drh8b18dd42004-05-12 19:18:15 +00003149 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00003150 }else{
3151 pCell -= 4;
drhb6f41482004-05-14 01:58:11 +00003152 pTemp = &aSpace[iSpace];
3153 iSpace += sz;
3154 assert( iSpace<=sizeof(aSpace) );
drh4b70f112004-05-02 21:12:19 +00003155 }
drh8b18dd42004-05-12 19:18:15 +00003156 insertCell(pParent, nxDiv, pCell, sz, pTemp);
drh4b70f112004-05-02 21:12:19 +00003157 put4byte(&pParent->aCell[nxDiv][2], pNew->pgno);
drh14acc042001-06-10 19:56:58 +00003158 j++;
3159 nxDiv++;
3160 }
3161 }
drh6019e162001-07-02 17:51:45 +00003162 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00003163 if( (pageFlags & PTF_LEAF)==0 ){
3164 memcpy(&apNew[nNew-1]->aData[6], &apCopy[nOld-1]->aData[6], 4);
drh14acc042001-06-10 19:56:58 +00003165 }
drh4b70f112004-05-02 21:12:19 +00003166 if( nxDiv==pParent->nCell ){
3167 /* Right-most sibling is the right-most child of pParent */
3168 put4byte(&pParent->aData[pParent->hdrOffset+6], pgnoNew[nNew-1]);
3169 }else{
3170 /* Right-most sibling is the left child of the first entry in pParent
3171 ** past the right-most divider entry */
drha34b6762004-05-07 13:30:42 +00003172 put4byte(&pParent->aCell[nxDiv][2], pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00003173 }
3174
3175 /*
3176 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00003177 */
3178 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00003179 reparentChildPages(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003180 }
drh4b70f112004-05-02 21:12:19 +00003181 reparentChildPages(pParent);
drh8b2f49b2001-06-08 00:21:52 +00003182
3183 /*
drh3a4c1412004-05-09 20:40:11 +00003184 ** Balance the parent page. Note that the current page (pPage) might
3185 ** have been added to the freelist is it might no longer be initialized.
3186 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00003187 */
drhda200cc2004-05-09 11:51:38 +00003188 assert( pParent->isInit );
drh3a4c1412004-05-09 20:40:11 +00003189 /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
3190 /* pageIntegrity(pPage); // No! pPage might have been added to freelist */
drh4b70f112004-05-02 21:12:19 +00003191 rc = balance(pParent);
drhda200cc2004-05-09 11:51:38 +00003192
drh8b2f49b2001-06-08 00:21:52 +00003193 /*
drh14acc042001-06-10 19:56:58 +00003194 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00003195 */
drh14acc042001-06-10 19:56:58 +00003196balance_cleanup:
drh8b2f49b2001-06-08 00:21:52 +00003197 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00003198 releasePage(apOld[i]);
3199 if( apCopy[i] ){
drh91025292004-05-03 19:49:32 +00003200 sqliteFree(apCopy[i]->aCell);
3201 }
drh8b2f49b2001-06-08 00:21:52 +00003202 }
drh14acc042001-06-10 19:56:58 +00003203 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00003204 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003205 }
drh91025292004-05-03 19:49:32 +00003206 releasePage(pParent);
drhda200cc2004-05-09 11:51:38 +00003207 releasePage(extraUnref);
drh3a4c1412004-05-09 20:40:11 +00003208 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
3209 pPage->pgno, nOld, nNew, nCell));
drh8b2f49b2001-06-08 00:21:52 +00003210 return rc;
3211}
3212
3213/*
drhf74b8d92002-09-01 23:20:45 +00003214** This routine checks all cursors that point to the same table
3215** as pCur points to. If any of those cursors were opened with
3216** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
3217** cursors point to the same table were opened with wrFlag==1
3218** then this routine returns SQLITE_OK.
3219**
3220** In addition to checking for read-locks (where a read-lock
3221** means a cursor opened with wrFlag==0) this routine also moves
3222** all cursors other than pCur so that they are pointing to the
3223** first Cell on root page. This is necessary because an insert
3224** or delete might change the number of cells on a page or delete
3225** a page entirely and we do not want to leave any cursors
3226** pointing to non-existant pages or cells.
3227*/
3228static int checkReadLocks(BtCursor *pCur){
3229 BtCursor *p;
3230 assert( pCur->wrFlag );
3231 for(p=pCur->pShared; p!=pCur; p=p->pShared){
3232 assert( p );
3233 assert( p->pgnoRoot==pCur->pgnoRoot );
drha34b6762004-05-07 13:30:42 +00003234 assert( p->pPage->pgno==sqlite3pager_pagenumber(p->pPage->aData) );
drhf74b8d92002-09-01 23:20:45 +00003235 if( p->wrFlag==0 ) return SQLITE_LOCKED;
drh91025292004-05-03 19:49:32 +00003236 if( p->pPage->pgno!=p->pgnoRoot ){
drhf74b8d92002-09-01 23:20:45 +00003237 moveToRoot(p);
3238 }
3239 }
3240 return SQLITE_OK;
3241}
3242
3243/*
drh3b7511c2001-05-26 13:15:44 +00003244** Insert a new record into the BTree. The key is given by (pKey,nKey)
3245** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00003246** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00003247** is left pointing at a random location.
3248**
3249** For an INTKEY table, only the nKey value of the key is used. pKey is
3250** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00003251*/
drh3aac2dd2004-04-26 14:10:20 +00003252int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00003253 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00003254 const void *pKey, i64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00003255 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00003256){
drh3b7511c2001-05-26 13:15:44 +00003257 int rc;
3258 int loc;
drh14acc042001-06-10 19:56:58 +00003259 int szNew;
drh3b7511c2001-05-26 13:15:44 +00003260 MemPage *pPage;
3261 Btree *pBt = pCur->pBt;
drha34b6762004-05-07 13:30:42 +00003262 unsigned char *oldCell;
3263 unsigned char newCell[MX_CELL_SIZE];
drh3b7511c2001-05-26 13:15:44 +00003264
drhc39e0002004-05-07 23:50:57 +00003265 if( pCur->status ){
3266 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003267 }
danielk1977e7c8d582004-05-13 13:38:52 +00003268 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003269 /* Must start a transaction before doing an insert */
3270 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003271 }
drhf74b8d92002-09-01 23:20:45 +00003272 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00003273 if( !pCur->wrFlag ){
3274 return SQLITE_PERM; /* Cursor not open for writing */
3275 }
drhf74b8d92002-09-01 23:20:45 +00003276 if( checkReadLocks(pCur) ){
3277 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3278 }
drh3aac2dd2004-04-26 14:10:20 +00003279 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00003280 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00003281 pPage = pCur->pPage;
drh4a1c3802004-05-12 15:15:47 +00003282 assert( pPage->intKey || nKey>=0 );
drh8b18dd42004-05-12 19:18:15 +00003283 assert( pPage->leaf || !pPage->leafData );
drh3a4c1412004-05-09 20:40:11 +00003284 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
3285 pCur->pgnoRoot, nKey, nData, pPage->pgno,
3286 loc==0 ? "overwrite" : "new entry"));
drh7aa128d2002-06-21 13:09:16 +00003287 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003288 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003289 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003290 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
drh3b7511c2001-05-26 13:15:44 +00003291 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003292 assert( szNew==cellSize(pPage, newCell) );
drh3a4c1412004-05-09 20:40:11 +00003293 assert( szNew<=sizeof(newCell) );
drhf328bc82004-05-10 23:29:49 +00003294 if( loc==0 && pCur->isValid ){
drha34b6762004-05-07 13:30:42 +00003295 int szOld;
3296 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00003297 oldCell = pPage->aCell[pCur->idx];
3298 if( !pPage->leaf ){
3299 memcpy(&newCell[2], &oldCell[2], 4);
3300 }
3301 szOld = cellSize(pPage, oldCell);
3302 rc = clearCell(pPage, oldCell);
drh5e2f8b92001-05-28 00:41:15 +00003303 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003304 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00003305 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00003306 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00003307 pCur->idx++;
3308 }else{
drh4b70f112004-05-02 21:12:19 +00003309 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00003310 }
drh24cd67e2004-05-10 16:18:47 +00003311 insertCell(pPage, pCur->idx, newCell, szNew, 0);
drh4b70f112004-05-02 21:12:19 +00003312 rc = balance(pPage);
drh23e11ca2004-05-04 17:27:28 +00003313 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
drh3fc190c2001-09-14 03:24:23 +00003314 /* fflush(stdout); */
drh4b70f112004-05-02 21:12:19 +00003315 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003316 return rc;
3317}
3318
3319/*
drh4b70f112004-05-02 21:12:19 +00003320** Delete the entry that the cursor is pointing to. The cursor
3321** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00003322*/
drh3aac2dd2004-04-26 14:10:20 +00003323int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00003324 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00003325 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00003326 int rc;
drh8c42ca92001-06-22 19:15:00 +00003327 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00003328 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00003329
drh7aa128d2002-06-21 13:09:16 +00003330 assert( pPage->isInit );
drhc39e0002004-05-07 23:50:57 +00003331 if( pCur->status ){
3332 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003333 }
drhf74b8d92002-09-01 23:20:45 +00003334 if( !pBt->inTrans ){
3335 /* Must start a transaction before doing a delete */
3336 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003337 }
drhf74b8d92002-09-01 23:20:45 +00003338 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00003339 if( pCur->idx >= pPage->nCell ){
3340 return SQLITE_ERROR; /* The cursor is not pointing to anything */
3341 }
drhecdc7532001-09-23 02:35:53 +00003342 if( !pCur->wrFlag ){
3343 return SQLITE_PERM; /* Did not open this cursor for writing */
3344 }
drhf74b8d92002-09-01 23:20:45 +00003345 if( checkReadLocks(pCur) ){
3346 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3347 }
drha34b6762004-05-07 13:30:42 +00003348 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003349 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003350 pCell = pPage->aCell[pCur->idx];
3351 if( !pPage->leaf ){
3352 pgnoChild = get4byte(&pCell[2]);
3353 }
3354 clearCell(pPage, pCell);
3355 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00003356 /*
drh5e00f6c2001-09-13 13:46:56 +00003357 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00003358 ** do something we will leave a hole on an internal page.
3359 ** We have to fill the hole by moving in a cell from a leaf. The
3360 ** next Cell after the one to be deleted is guaranteed to exist and
3361 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00003362 */
drh14acc042001-06-10 19:56:58 +00003363 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00003364 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00003365 int szNext;
drh8c1238a2003-01-02 14:43:55 +00003366 int notUsed;
drh24cd67e2004-05-10 16:18:47 +00003367 unsigned char tempCell[MX_CELL_SIZE];
drh8b18dd42004-05-12 19:18:15 +00003368 assert( !pPage->leafData );
drh14acc042001-06-10 19:56:58 +00003369 getTempCursor(pCur, &leafCur);
drh3aac2dd2004-04-26 14:10:20 +00003370 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00003371 if( rc!=SQLITE_OK ){
drh8a6ac0a2004-02-14 17:35:07 +00003372 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3373 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003374 }
drha34b6762004-05-07 13:30:42 +00003375 rc = sqlite3pager_write(leafCur.pPage->aData);
drh6019e162001-07-02 17:51:45 +00003376 if( rc ) return rc;
drh3a4c1412004-05-09 20:40:11 +00003377 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
3378 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003379 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3380 pNext = leafCur.pPage->aCell[leafCur.idx];
3381 szNext = cellSize(leafCur.pPage, pNext);
drh24cd67e2004-05-10 16:18:47 +00003382 assert( sizeof(tempCell)>=szNext+4 );
3383 insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell);
3384 put4byte(pPage->aCell[pCur->idx]+2, pgnoChild);
drh4b70f112004-05-02 21:12:19 +00003385 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003386 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003387 dropCell(leafCur.pPage, leafCur.idx, szNext);
3388 rc = balance(leafCur.pPage);
drh8c42ca92001-06-22 19:15:00 +00003389 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00003390 }else{
drh3a4c1412004-05-09 20:40:11 +00003391 TRACE(("DELETE: table=%d delete from leaf %d\n",
3392 pCur->pgnoRoot, pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003393 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3394 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003395 }
drh4b70f112004-05-02 21:12:19 +00003396 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003397 return rc;
drh3b7511c2001-05-26 13:15:44 +00003398}
drh8b2f49b2001-06-08 00:21:52 +00003399
3400/*
drhc6b52df2002-01-04 03:09:29 +00003401** Create a new BTree table. Write into *piTable the page
3402** number for the root page of the new table.
3403**
3404** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00003405** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00003406** are restricted to having a 4-byte integer key and arbitrary data and
3407** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00003408** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00003409*/
drh3aac2dd2004-04-26 14:10:20 +00003410int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
drh8b2f49b2001-06-08 00:21:52 +00003411 MemPage *pRoot;
3412 Pgno pgnoRoot;
3413 int rc;
3414 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003415 /* Must start a transaction first */
3416 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003417 }
drh5df72a52002-06-06 23:16:05 +00003418 if( pBt->readOnly ){
3419 return SQLITE_READONLY;
3420 }
drhda200cc2004-05-09 11:51:38 +00003421 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1);
drh8b2f49b2001-06-08 00:21:52 +00003422 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003423 assert( sqlite3pager_iswriteable(pRoot->aData) );
drhde647132004-05-07 17:57:49 +00003424 zeroPage(pRoot, flags | PTF_LEAF);
drha34b6762004-05-07 13:30:42 +00003425 sqlite3pager_unref(pRoot->aData);
drh8b2f49b2001-06-08 00:21:52 +00003426 *piTable = (int)pgnoRoot;
3427 return SQLITE_OK;
3428}
3429
3430/*
3431** Erase the given database page and all its children. Return
3432** the page to the freelist.
3433*/
drh4b70f112004-05-02 21:12:19 +00003434static int clearDatabasePage(
3435 Btree *pBt, /* The BTree that contains the table */
3436 Pgno pgno, /* Page number to clear */
3437 MemPage *pParent, /* Parent page. NULL for the root */
3438 int freePageFlag /* Deallocate page if true */
3439){
drh8b2f49b2001-06-08 00:21:52 +00003440 MemPage *pPage;
3441 int rc;
drh4b70f112004-05-02 21:12:19 +00003442 unsigned char *pCell;
3443 int i;
drh8b2f49b2001-06-08 00:21:52 +00003444
drhde647132004-05-07 17:57:49 +00003445 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
drh8b2f49b2001-06-08 00:21:52 +00003446 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003447 rc = sqlite3pager_write(pPage->aData);
drh6019e162001-07-02 17:51:45 +00003448 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003449 for(i=0; i<pPage->nCell; i++){
3450 pCell = pPage->aCell[i];
3451 if( !pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003452 rc = clearDatabasePage(pBt, get4byte(&pCell[2]), pPage->pParent, 1);
drh8b2f49b2001-06-08 00:21:52 +00003453 if( rc ) return rc;
3454 }
drh4b70f112004-05-02 21:12:19 +00003455 rc = clearCell(pPage, pCell);
drh8b2f49b2001-06-08 00:21:52 +00003456 if( rc ) return rc;
3457 }
drha34b6762004-05-07 13:30:42 +00003458 if( !pPage->leaf ){
3459 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[6]), pPage->pParent, 1);
drh2aa679f2001-06-25 02:11:07 +00003460 if( rc ) return rc;
3461 }
3462 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00003463 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003464 }else{
drh3a4c1412004-05-09 20:40:11 +00003465 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00003466 }
drh4b70f112004-05-02 21:12:19 +00003467 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003468 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003469}
3470
3471/*
3472** Delete all information from a single table in the database.
3473*/
drh3aac2dd2004-04-26 14:10:20 +00003474int sqlite3BtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003475 int rc;
drhf74b8d92002-09-01 23:20:45 +00003476 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003477 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003478 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003479 }
drhf74b8d92002-09-01 23:20:45 +00003480 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3481 if( pCur->pgnoRoot==(Pgno)iTable ){
3482 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3483 moveToRoot(pCur);
3484 }
drhecdc7532001-09-23 02:35:53 +00003485 }
drha34b6762004-05-07 13:30:42 +00003486 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
drh8b2f49b2001-06-08 00:21:52 +00003487 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003488 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00003489 }
drh8c42ca92001-06-22 19:15:00 +00003490 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003491}
3492
3493/*
3494** Erase all information in a table and add the root of the table to
3495** the freelist. Except, the root of the principle table (the one on
3496** page 2) is never added to the freelist.
3497*/
drh3aac2dd2004-04-26 14:10:20 +00003498int sqlite3BtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003499 int rc;
3500 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00003501 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003502 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003503 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003504 }
drhf74b8d92002-09-01 23:20:45 +00003505 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3506 if( pCur->pgnoRoot==(Pgno)iTable ){
3507 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
3508 }
drh5df72a52002-06-06 23:16:05 +00003509 }
drha34b6762004-05-07 13:30:42 +00003510 rc = getPage(pBt, (Pgno)iTable, &pPage);
drh2aa679f2001-06-25 02:11:07 +00003511 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003512 rc = sqlite3BtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00003513 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003514 if( iTable>1 ){
drha34b6762004-05-07 13:30:42 +00003515 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003516 }else{
drha34b6762004-05-07 13:30:42 +00003517 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
drh8b2f49b2001-06-08 00:21:52 +00003518 }
drh4b70f112004-05-02 21:12:19 +00003519 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00003520 return rc;
3521}
3522
drh001bbcb2003-03-19 03:14:00 +00003523
drh8b2f49b2001-06-08 00:21:52 +00003524/*
drh23e11ca2004-05-04 17:27:28 +00003525** Read the meta-information out of a database file. Meta[0]
3526** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00003527** through meta[15] are available for use by higher layers. Meta[0]
3528** is read-only, the others are read/write.
3529**
3530** The schema layer numbers meta values differently. At the schema
3531** layer (and the SetCookie and ReadCookie opcodes) the number of
3532** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00003533*/
drh3aac2dd2004-04-26 14:10:20 +00003534int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00003535 int rc;
drh4b70f112004-05-02 21:12:19 +00003536 unsigned char *pP1;
drh8b2f49b2001-06-08 00:21:52 +00003537
drh23e11ca2004-05-04 17:27:28 +00003538 assert( idx>=0 && idx<=15 );
drha34b6762004-05-07 13:30:42 +00003539 rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00003540 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003541 *pMeta = get4byte(&pP1[36 + idx*4]);
drha34b6762004-05-07 13:30:42 +00003542 sqlite3pager_unref(pP1);
drh8b2f49b2001-06-08 00:21:52 +00003543 return SQLITE_OK;
3544}
3545
3546/*
drh23e11ca2004-05-04 17:27:28 +00003547** Write meta-information back into the database. Meta[0] is
3548** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00003549*/
drh3aac2dd2004-04-26 14:10:20 +00003550int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
drh4b70f112004-05-02 21:12:19 +00003551 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00003552 int rc;
drh23e11ca2004-05-04 17:27:28 +00003553 assert( idx>=1 && idx<=15 );
drh8b2f49b2001-06-08 00:21:52 +00003554 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003555 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003556 }
drhde647132004-05-07 17:57:49 +00003557 assert( pBt->pPage1!=0 );
3558 pP1 = pBt->pPage1->aData;
drha34b6762004-05-07 13:30:42 +00003559 rc = sqlite3pager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00003560 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003561 put4byte(&pP1[36 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00003562 return SQLITE_OK;
3563}
drh8c42ca92001-06-22 19:15:00 +00003564
drhf328bc82004-05-10 23:29:49 +00003565/*
3566** Return the flag byte at the beginning of the page that the cursor
3567** is currently pointing to.
3568*/
3569int sqlite3BtreeFlags(BtCursor *pCur){
3570 MemPage *pPage = pCur->pPage;
3571 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
3572}
3573
drh5eddca62001-06-30 21:53:53 +00003574/******************************************************************************
3575** The complete implementation of the BTree subsystem is above this line.
3576** All the code the follows is for testing and troubleshooting the BTree
3577** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003578******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003579
drh8c42ca92001-06-22 19:15:00 +00003580/*
3581** Print a disassembly of the given page on standard output. This routine
3582** is used for debugging and testing only.
3583*/
drhaaab5722002-02-19 13:39:21 +00003584#ifdef SQLITE_TEST
drh23e11ca2004-05-04 17:27:28 +00003585int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003586 int rc;
3587 MemPage *pPage;
drhc8629a12004-05-08 20:07:40 +00003588 int i, j, c;
drh8c42ca92001-06-22 19:15:00 +00003589 int nFree;
3590 u16 idx;
drhab9f7f12004-05-08 10:56:11 +00003591 int hdr;
3592 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00003593 char range[20];
3594 unsigned char payload[20];
drhab9f7f12004-05-08 10:56:11 +00003595
drh4b70f112004-05-02 21:12:19 +00003596 rc = getPage(pBt, (Pgno)pgno, &pPage);
drh8c42ca92001-06-22 19:15:00 +00003597 if( rc ){
3598 return rc;
3599 }
drhab9f7f12004-05-08 10:56:11 +00003600 hdr = pPage->hdrOffset;
3601 data = pPage->aData;
drhc8629a12004-05-08 20:07:40 +00003602 c = data[hdr];
drh8b18dd42004-05-12 19:18:15 +00003603 pPage->intKey = (c & (PTF_INTKEY|PTF_LEAFDATA))!=0;
drhc8629a12004-05-08 20:07:40 +00003604 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh8b18dd42004-05-12 19:18:15 +00003605 pPage->leafData = (c & PTF_LEAFDATA)!=0;
drhc8629a12004-05-08 20:07:40 +00003606 pPage->leaf = (c & PTF_LEAF)!=0;
drh8b18dd42004-05-12 19:18:15 +00003607 pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
drhda200cc2004-05-09 11:51:38 +00003608 printf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno,
3609 data[hdr], data[hdr+5],
3610 (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
drh8c42ca92001-06-22 19:15:00 +00003611 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003612 assert( hdr == (pgno==1 ? 100 : 0) );
3613 idx = get2byte(&data[hdr+3]);
drhb6f41482004-05-14 01:58:11 +00003614 while( idx>0 && idx<=pBt->usableSize ){
drh6f11bef2004-05-13 01:12:56 +00003615 CellInfo info;
drh4b70f112004-05-02 21:12:19 +00003616 Pgno child;
drhab9f7f12004-05-08 10:56:11 +00003617 unsigned char *pCell = &data[idx];
drh6f11bef2004-05-13 01:12:56 +00003618 int sz;
3619
3620 pCell = &data[idx];
3621 parseCell(pPage, pCell, &info);
3622 sz = info.nSize;
drh8c42ca92001-06-22 19:15:00 +00003623 sprintf(range,"%d..%d", idx, idx+sz-1);
drh4b70f112004-05-02 21:12:19 +00003624 if( pPage->leaf ){
3625 child = 0;
3626 }else{
3627 child = get4byte(&pCell[2]);
3628 }
drh6f11bef2004-05-13 01:12:56 +00003629 sz = info.nData;
3630 if( !pPage->intKey ) sz += info.nKey;
drh8c42ca92001-06-22 19:15:00 +00003631 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
drh6f11bef2004-05-13 01:12:56 +00003632 memcpy(payload, &pCell[info.nHeader], sz);
drh8c42ca92001-06-22 19:15:00 +00003633 for(j=0; j<sz; j++){
3634 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3635 }
3636 payload[sz] = 0;
3637 printf(
drh6f11bef2004-05-13 01:12:56 +00003638 "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4d payload=%s\n",
3639 i, range, child, info.nKey, info.nData, payload
drh8c42ca92001-06-22 19:15:00 +00003640 );
drh4b70f112004-05-02 21:12:19 +00003641 if( pPage->isInit && pPage->aCell[i]!=pCell ){
3642 printf("**** aCell[%d] does not match on prior entry ****\n", i);
drh2aa679f2001-06-25 02:11:07 +00003643 }
drh7c717f72001-06-24 20:39:41 +00003644 i++;
drh4b70f112004-05-02 21:12:19 +00003645 idx = get2byte(pCell);
drh8c42ca92001-06-22 19:15:00 +00003646 }
3647 if( idx!=0 ){
3648 printf("ERROR: next cell index out of range: %d\n", idx);
3649 }
drh4b70f112004-05-02 21:12:19 +00003650 if( !pPage->leaf ){
drh3644f082004-05-10 18:45:09 +00003651 printf("right_child: %d\n", get4byte(&data[hdr+6]));
drh4b70f112004-05-02 21:12:19 +00003652 }
drh8c42ca92001-06-22 19:15:00 +00003653 nFree = 0;
3654 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003655 idx = get2byte(&data[hdr+1]);
drhb6f41482004-05-14 01:58:11 +00003656 while( idx>0 && idx<pPage->pBt->usableSize ){
drhab9f7f12004-05-08 10:56:11 +00003657 int sz = get2byte(&data[idx+2]);
drh4b70f112004-05-02 21:12:19 +00003658 sprintf(range,"%d..%d", idx, idx+sz-1);
3659 nFree += sz;
drh8c42ca92001-06-22 19:15:00 +00003660 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00003661 i, range, sz, nFree);
drhab9f7f12004-05-08 10:56:11 +00003662 idx = get2byte(&data[idx]);
drh2aa679f2001-06-25 02:11:07 +00003663 i++;
drh8c42ca92001-06-22 19:15:00 +00003664 }
3665 if( idx!=0 ){
3666 printf("ERROR: next freeblock index out of range: %d\n", idx);
3667 }
drha34b6762004-05-07 13:30:42 +00003668 if( recursive && !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003669 idx = get2byte(&data[hdr+3]);
drhb6f41482004-05-14 01:58:11 +00003670 while( idx>0 && idx<pBt->usableSize ){
drhab9f7f12004-05-08 10:56:11 +00003671 unsigned char *pCell = &data[idx];
drha34b6762004-05-07 13:30:42 +00003672 sqlite3BtreePageDump(pBt, get4byte(&pCell[2]), 1);
3673 idx = get2byte(pCell);
drh6019e162001-07-02 17:51:45 +00003674 }
drhab9f7f12004-05-08 10:56:11 +00003675 sqlite3BtreePageDump(pBt, get4byte(&data[hdr+6]), 1);
drh6019e162001-07-02 17:51:45 +00003676 }
drhab9f7f12004-05-08 10:56:11 +00003677 sqlite3pager_unref(data);
drh3644f082004-05-10 18:45:09 +00003678 fflush(stdout);
drh8c42ca92001-06-22 19:15:00 +00003679 return SQLITE_OK;
3680}
drhaaab5722002-02-19 13:39:21 +00003681#endif
drh8c42ca92001-06-22 19:15:00 +00003682
drhaaab5722002-02-19 13:39:21 +00003683#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003684/*
drh2aa679f2001-06-25 02:11:07 +00003685** Fill aResult[] with information about the entry and page that the
3686** cursor is pointing to.
3687**
3688** aResult[0] = The page number
3689** aResult[1] = The entry number
3690** aResult[2] = Total number of entries on this page
3691** aResult[3] = Size of this entry
3692** aResult[4] = Number of free bytes on this page
3693** aResult[5] = Number of free blocks on the page
3694** aResult[6] = Page number of the left child of this entry
3695** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003696**
3697** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003698*/
drhda200cc2004-05-09 11:51:38 +00003699int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003700 int cnt, idx;
3701 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00003702
3703 pageIntegrity(pPage);
drh4b70f112004-05-02 21:12:19 +00003704 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003705 aResult[0] = sqlite3pager_pagenumber(pPage->aData);
drh91025292004-05-03 19:49:32 +00003706 assert( aResult[0]==pPage->pgno );
drh8c42ca92001-06-22 19:15:00 +00003707 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003708 aResult[2] = pPage->nCell;
3709 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh4b70f112004-05-02 21:12:19 +00003710 aResult[3] = cellSize(pPage, pPage->aCell[pCur->idx]);
3711 aResult[6] = pPage->leaf ? 0 : get4byte(&pPage->aCell[pCur->idx][2]);
drh2aa679f2001-06-25 02:11:07 +00003712 }else{
3713 aResult[3] = 0;
3714 aResult[6] = 0;
3715 }
3716 aResult[4] = pPage->nFree;
3717 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00003718 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhb6f41482004-05-14 01:58:11 +00003719 while( idx>0 && idx<pPage->pBt->usableSize ){
drh2aa679f2001-06-25 02:11:07 +00003720 cnt++;
drh4b70f112004-05-02 21:12:19 +00003721 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003722 }
3723 aResult[5] = cnt;
drh4b70f112004-05-02 21:12:19 +00003724 aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh8c42ca92001-06-22 19:15:00 +00003725 return SQLITE_OK;
3726}
drhaaab5722002-02-19 13:39:21 +00003727#endif
drhdd793422001-06-28 01:54:48 +00003728
drhdd793422001-06-28 01:54:48 +00003729/*
drh5eddca62001-06-30 21:53:53 +00003730** Return the pager associated with a BTree. This routine is used for
3731** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003732*/
drh3aac2dd2004-04-26 14:10:20 +00003733Pager *sqlite3BtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003734 return pBt->pPager;
3735}
drh5eddca62001-06-30 21:53:53 +00003736
3737/*
3738** This structure is passed around through all the sanity checking routines
3739** in order to keep track of some global state information.
3740*/
drhaaab5722002-02-19 13:39:21 +00003741typedef struct IntegrityCk IntegrityCk;
3742struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003743 Btree *pBt; /* The tree being checked out */
3744 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3745 int nPage; /* Number of pages in the database */
3746 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00003747 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003748};
3749
3750/*
3751** Append a message to the error message string.
3752*/
drhaaab5722002-02-19 13:39:21 +00003753static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003754 if( pCheck->zErrMsg ){
3755 char *zOld = pCheck->zErrMsg;
3756 pCheck->zErrMsg = 0;
danielk19774adee202004-05-08 08:23:19 +00003757 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003758 sqliteFree(zOld);
3759 }else{
danielk19774adee202004-05-08 08:23:19 +00003760 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003761 }
3762}
3763
3764/*
3765** Add 1 to the reference count for page iPage. If this is the second
3766** reference to the page, add an error message to pCheck->zErrMsg.
3767** Return 1 if there are 2 ore more references to the page and 0 if
3768** if this is the first reference to the page.
3769**
3770** Also check that the page number is in bounds.
3771*/
drhaaab5722002-02-19 13:39:21 +00003772static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003773 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003774 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003775 char zBuf[100];
3776 sprintf(zBuf, "invalid page number %d", iPage);
3777 checkAppendMsg(pCheck, zContext, zBuf);
3778 return 1;
3779 }
3780 if( pCheck->anRef[iPage]==1 ){
3781 char zBuf[100];
3782 sprintf(zBuf, "2nd reference to page %d", iPage);
3783 checkAppendMsg(pCheck, zContext, zBuf);
3784 return 1;
3785 }
3786 return (pCheck->anRef[iPage]++)>1;
3787}
3788
3789/*
3790** Check the integrity of the freelist or of an overflow page list.
3791** Verify that the number of pages on the list is N.
3792*/
drh30e58752002-03-02 20:41:57 +00003793static void checkList(
3794 IntegrityCk *pCheck, /* Integrity checking context */
3795 int isFreeList, /* True for a freelist. False for overflow page list */
3796 int iPage, /* Page number for first page in the list */
3797 int N, /* Expected number of pages in the list */
3798 char *zContext /* Context for error messages */
3799){
3800 int i;
drh3a4c1412004-05-09 20:40:11 +00003801 int expected = N;
3802 int iFirst = iPage;
drh5eddca62001-06-30 21:53:53 +00003803 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003804 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00003805 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00003806 if( iPage<1 ){
drh3a4c1412004-05-09 20:40:11 +00003807 sprintf(zMsg, "%d of %d pages missing from overflow list starting at %d",
3808 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00003809 checkAppendMsg(pCheck, zContext, zMsg);
3810 break;
3811 }
3812 if( checkRef(pCheck, iPage, zContext) ) break;
drha34b6762004-05-07 13:30:42 +00003813 if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
drh5eddca62001-06-30 21:53:53 +00003814 sprintf(zMsg, "failed to get page %d", iPage);
3815 checkAppendMsg(pCheck, zContext, zMsg);
3816 break;
3817 }
drh30e58752002-03-02 20:41:57 +00003818 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00003819 int n = get4byte(&pOvfl[4]);
drh0d316a42002-08-11 20:10:47 +00003820 for(i=0; i<n; i++){
drh4b70f112004-05-02 21:12:19 +00003821 checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
drh30e58752002-03-02 20:41:57 +00003822 }
drh0d316a42002-08-11 20:10:47 +00003823 N -= n;
drh30e58752002-03-02 20:41:57 +00003824 }
drh4b70f112004-05-02 21:12:19 +00003825 iPage = get4byte(pOvfl);
drha34b6762004-05-07 13:30:42 +00003826 sqlite3pager_unref(pOvfl);
drh5eddca62001-06-30 21:53:53 +00003827 }
3828}
3829
3830/*
3831** Do various sanity checks on a single page of a tree. Return
3832** the tree depth. Root pages return 0. Parents of root pages
3833** return 1, and so forth.
3834**
3835** These checks are done:
3836**
3837** 1. Make sure that cells and freeblocks do not overlap
3838** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00003839** NO 2. Make sure cell keys are in order.
3840** NO 3. Make sure no key is less than or equal to zLowerBound.
3841** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00003842** 5. Check the integrity of overflow pages.
3843** 6. Recursively call checkTreePage on all children.
3844** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003845** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003846** the root of the tree.
3847*/
3848static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003849 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003850 int iPage, /* Page number of the page to check */
3851 MemPage *pParent, /* Parent page */
3852 char *zParentContext, /* Parent context */
3853 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003854 int nLower, /* Number of characters in zLowerBound */
3855 char *zUpperBound, /* All keys should be less than this, if not NULL */
3856 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003857){
3858 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00003859 int i, rc, depth, d2, pgno, cnt;
3860 int hdr;
3861 u8 *data;
drh5eddca62001-06-30 21:53:53 +00003862 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003863 Btree *pBt;
drhb6f41482004-05-14 01:58:11 +00003864 int maxLocal, usableSize;
drh5eddca62001-06-30 21:53:53 +00003865 char zMsg[100];
3866 char zContext[100];
drhda200cc2004-05-09 11:51:38 +00003867 char hit[MX_PAGE_SIZE];
drh5eddca62001-06-30 21:53:53 +00003868
3869 /* Check that the page exists
3870 */
drh0d316a42002-08-11 20:10:47 +00003871 cur.pBt = pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00003872 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00003873 if( iPage==0 ) return 0;
3874 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh4b70f112004-05-02 21:12:19 +00003875 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003876 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3877 checkAppendMsg(pCheck, zContext, zMsg);
3878 return 0;
3879 }
drh6f11bef2004-05-13 01:12:56 +00003880 maxLocal = pPage->leafData ? pBt->maxLeaf : pBt->maxLocal;
drh4b70f112004-05-02 21:12:19 +00003881 if( (rc = initPage(pPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003882 sprintf(zMsg, "initPage() returns error code %d", rc);
3883 checkAppendMsg(pCheck, zContext, zMsg);
drh91025292004-05-03 19:49:32 +00003884 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003885 return 0;
3886 }
3887
3888 /* Check out all the cells.
3889 */
3890 depth = 0;
drh5eddca62001-06-30 21:53:53 +00003891 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003892 for(i=0; i<pPage->nCell; i++){
drh6f11bef2004-05-13 01:12:56 +00003893 u8 *pCell;
3894 int sz;
3895 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00003896
3897 /* Check payload overflow pages
3898 */
drh3a4c1412004-05-09 20:40:11 +00003899 sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
drh6f11bef2004-05-13 01:12:56 +00003900 pCell = pPage->aCell[i];
3901 parseCell(pPage, pCell, &info);
3902 sz = info.nData;
3903 if( !pPage->intKey ) sz += info.nKey;
3904 if( sz>info.nLocal ){
drhb6f41482004-05-14 01:58:11 +00003905 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +00003906 checkList(pCheck, 0, get4byte(&pCell[info.iOverflow]),nPage,zContext);
drh5eddca62001-06-30 21:53:53 +00003907 }
3908
3909 /* Check sanity of left child page.
3910 */
drhda200cc2004-05-09 11:51:38 +00003911 if( !pPage->leaf ){
3912 pgno = get4byte(&pCell[2]);
3913 d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
3914 if( i>0 && d2!=depth ){
3915 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3916 }
3917 depth = d2;
drh5eddca62001-06-30 21:53:53 +00003918 }
drh5eddca62001-06-30 21:53:53 +00003919 }
drhda200cc2004-05-09 11:51:38 +00003920 if( !pPage->leaf ){
3921 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
3922 sprintf(zContext, "On page %d at right child: ", iPage);
3923 checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
3924 }
drh5eddca62001-06-30 21:53:53 +00003925
3926 /* Check for complete coverage of the page
3927 */
drhb6f41482004-05-14 01:58:11 +00003928 memset(hit, 0, usableSize);
drhda200cc2004-05-09 11:51:38 +00003929 memset(hit, 1, pPage->hdrOffset+10-4*(pPage->leaf));
3930 data = pPage->aData;
3931 hdr = pPage->hdrOffset;
drhb6f41482004-05-14 01:58:11 +00003932 for(cnt=0, i=get2byte(&data[hdr+3]); i>0 && i<usableSize && cnt<10000; cnt++){
drhda200cc2004-05-09 11:51:38 +00003933 int size = cellSize(pPage, &data[i]);
drh5eddca62001-06-30 21:53:53 +00003934 int j;
drhda200cc2004-05-09 11:51:38 +00003935 for(j=i+size-1; j>=i; j--) hit[j]++;
3936 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003937 }
drhb6f41482004-05-14 01:58:11 +00003938 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; cnt++){
drhda200cc2004-05-09 11:51:38 +00003939 int size = get2byte(&data[i+2]);
drh5eddca62001-06-30 21:53:53 +00003940 int j;
drhda200cc2004-05-09 11:51:38 +00003941 for(j=i+size-1; j>=i; j--) hit[j]++;
3942 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003943 }
drhb6f41482004-05-14 01:58:11 +00003944 for(i=cnt=0; i<usableSize; i++){
drh5eddca62001-06-30 21:53:53 +00003945 if( hit[i]==0 ){
drhda200cc2004-05-09 11:51:38 +00003946 cnt++;
drh5eddca62001-06-30 21:53:53 +00003947 }else if( hit[i]>1 ){
3948 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3949 checkAppendMsg(pCheck, zMsg, 0);
3950 break;
3951 }
3952 }
drhda200cc2004-05-09 11:51:38 +00003953 if( cnt!=data[hdr+5] ){
3954 sprintf(zMsg, "Fragmented space is %d byte reported as %d on page %d",
3955 cnt, data[hdr+5], iPage);
3956 checkAppendMsg(pCheck, zMsg, 0);
3957 }
drh6019e162001-07-02 17:51:45 +00003958
drh4b70f112004-05-02 21:12:19 +00003959 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00003960 return depth+1;
drh5eddca62001-06-30 21:53:53 +00003961}
3962
3963/*
3964** This routine does a complete check of the given BTree file. aRoot[] is
3965** an array of pages numbers were each page number is the root page of
3966** a table. nRoot is the number of entries in aRoot.
3967**
3968** If everything checks out, this routine returns NULL. If something is
3969** amiss, an error message is written into memory obtained from malloc()
3970** and a pointer to that error message is returned. The calling function
3971** is responsible for freeing the error message when it is done.
3972*/
drh3aac2dd2004-04-26 14:10:20 +00003973char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003974 int i;
3975 int nRef;
drhaaab5722002-02-19 13:39:21 +00003976 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003977
drha34b6762004-05-07 13:30:42 +00003978 nRef = *sqlite3pager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003979 if( lockBtree(pBt)!=SQLITE_OK ){
3980 return sqliteStrDup("Unable to acquire a read lock on the database");
3981 }
drh5eddca62001-06-30 21:53:53 +00003982 sCheck.pBt = pBt;
3983 sCheck.pPager = pBt->pPager;
drha34b6762004-05-07 13:30:42 +00003984 sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003985 if( sCheck.nPage==0 ){
3986 unlockBtreeIfUnused(pBt);
3987 return 0;
3988 }
drh8c1238a2003-01-02 14:43:55 +00003989 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drhda200cc2004-05-09 11:51:38 +00003990 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh5eddca62001-06-30 21:53:53 +00003991 sCheck.zErrMsg = 0;
3992
3993 /* Check the integrity of the freelist
3994 */
drha34b6762004-05-07 13:30:42 +00003995 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
3996 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003997
3998 /* Check all the tables.
3999 */
4000 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00004001 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00004002 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00004003 }
4004
4005 /* Make sure every page in the file is referenced
4006 */
4007 for(i=1; i<=sCheck.nPage; i++){
4008 if( sCheck.anRef[i]==0 ){
4009 char zBuf[100];
4010 sprintf(zBuf, "Page %d is never used", i);
4011 checkAppendMsg(&sCheck, zBuf, 0);
4012 }
4013 }
4014
4015 /* Make sure this analysis did not leave any unref() pages
4016 */
drh5e00f6c2001-09-13 13:46:56 +00004017 unlockBtreeIfUnused(pBt);
drha34b6762004-05-07 13:30:42 +00004018 if( nRef != *sqlite3pager_stats(pBt->pPager) ){
drh5eddca62001-06-30 21:53:53 +00004019 char zBuf[100];
4020 sprintf(zBuf,
4021 "Outstanding page count goes from %d to %d during this analysis",
drha34b6762004-05-07 13:30:42 +00004022 nRef, *sqlite3pager_stats(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00004023 );
4024 checkAppendMsg(&sCheck, zBuf, 0);
4025 }
4026
4027 /* Clean up and report errors.
4028 */
4029 sqliteFree(sCheck.anRef);
4030 return sCheck.zErrMsg;
4031}
paulb95a8862003-04-01 21:16:41 +00004032
drh73509ee2003-04-06 20:44:45 +00004033/*
4034** Return the full pathname of the underlying database file.
4035*/
drh3aac2dd2004-04-26 14:10:20 +00004036const char *sqlite3BtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00004037 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00004038 return sqlite3pager_filename(pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00004039}
4040
4041/*
drhf7c57532003-04-25 13:22:51 +00004042** Copy the complete content of pBtFrom into pBtTo. A transaction
4043** must be active for both files.
4044**
4045** The size of file pBtFrom may be reduced by this operation.
4046** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00004047*/
drh3aac2dd2004-04-26 14:10:20 +00004048int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
drhf7c57532003-04-25 13:22:51 +00004049 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00004050 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00004051
4052 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
drhf7c57532003-04-25 13:22:51 +00004053 if( pBtTo->pCursor ) return SQLITE_BUSY;
drhb6f41482004-05-14 01:58:11 +00004054 memcpy(pBtTo->pPage1, pBtFrom->pPage1, pBtFrom->usableSize);
drha34b6762004-05-07 13:30:42 +00004055 rc = sqlite3pager_overwrite(pBtTo->pPager, 1, pBtFrom->pPage1);
4056 nToPage = sqlite3pager_pagecount(pBtTo->pPager);
4057 nPage = sqlite3pager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00004058 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00004059 void *pPage;
drha34b6762004-05-07 13:30:42 +00004060 rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
drhf7c57532003-04-25 13:22:51 +00004061 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00004062 rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
drh2e6d11b2003-04-25 15:37:57 +00004063 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00004064 sqlite3pager_unref(pPage);
drhf7c57532003-04-25 13:22:51 +00004065 }
drh2e6d11b2003-04-25 15:37:57 +00004066 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
4067 void *pPage;
drha34b6762004-05-07 13:30:42 +00004068 rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
drh2e6d11b2003-04-25 15:37:57 +00004069 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00004070 rc = sqlite3pager_write(pPage);
4071 sqlite3pager_unref(pPage);
4072 sqlite3pager_dont_write(pBtTo->pPager, i);
drh2e6d11b2003-04-25 15:37:57 +00004073 }
4074 if( !rc && nPage<nToPage ){
drha34b6762004-05-07 13:30:42 +00004075 rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
drh2e6d11b2003-04-25 15:37:57 +00004076 }
drhf7c57532003-04-25 13:22:51 +00004077 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00004078 sqlite3BtreeRollback(pBtTo);
drhf7c57532003-04-25 13:22:51 +00004079 }
4080 return rc;
drh73509ee2003-04-06 20:44:45 +00004081}