blob: c1b7c0a3ee8bec23c2953c373d48bdc2701479e0 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drhc8629a12004-05-08 20:07:40 +000012** $Id: btree.c,v 1.116 2004/05/08 20:07:40 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
56** page contain a special header that describes the file. The format
57** of that header is as follows:
58**
59** OFFSET SIZE DESCRIPTION
drhde647132004-05-07 17:57:49 +000060** 0 16 Header string: "SQLite format 3\000"
drh9e572e62004-04-23 23:43:10 +000061** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
64** 20 2 Bytes of unused space at the end of each page
65** 22 2 Maximum allowed local payload per entry
66** 24 8 File change counter
67** 32 4 First freelist page
68** 36 4 Number of freelist pages in the file
69** 40 60 15 4-byte meta values passed to higher layers
70**
71** All of the integer values are big-endian (most significant byte first).
72** The file change counter is incremented every time the database is changed.
73** This allows other processes to know when the file has changed and thus
74** when they need to flush their cache.
75**
76** Each btree page begins with a header described below. Note that the
77** header for page one begins at byte 100. For all other btree pages, the
78** header begins on byte zero.
79**
80** OFFSET SIZE DESCRIPTION
81** 0 1 Flags. 01: leaf, 02: zerodata, 04: intkey, F8: type
82** 1 2 byte offset to the first freeblock
83** 3 2 byte offset to the first cell
84** 5 1 number of fragmented free bytes
85** 6 4 Right child (the Ptr(N+1) value). Omitted if leaf
86**
87** The flags define the format of this btree page. The leaf flag means that
88** this page has no children. The zerodata flag means that this page carries
89** only keys and no data. The intkey flag means that the key is a single
90** variable length integer at the beginning of the payload.
91**
92** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
93** byte are used. The integer consists of all bytes that have bit 8 set and
94** the first byte with bit 8 clear. Unlike fixed-length values, variable-
95** length integers are little-endian. Examples:
96**
97** 0x00 becomes 0x00000000
98** 0x1b becomes 0x0000001b
99** 0x9b 0x4a becomes 0x00000dca
100** 0x80 0x1b becomes 0x0000001b
101** 0xf8 0xac 0xb1 0x91 0x01 becomes 0x12345678
102** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
103**
104** Variable length integers are used for rowids and to hold the number of
105** bytes of key and data in a btree cell.
106**
107** Unused space within a btree page is collected into a linked list of
108** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
109** to the first freeblock is given in the header. Freeblocks occur in
110** increasing order. Because a freeblock is 4 bytes in size, the minimum
111** size allocation on a btree page is 4 bytes. Because a freeblock must be
112** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
113** exist on the freeblock chain. The total number of such fragmented bytes
114** is recorded in the page header at offset 5.
115**
116** SIZE DESCRIPTION
117** 2 Byte offset of the next freeblock
118** 2 Bytes in this freeblock
119**
120** Cells are of variable length. The first cell begins on the byte defined
121** in the page header. Cells do not necessarily occur in order - they can
122** skip around on the page.
123**
124** SIZE DESCRIPTION
125** 2 Byte offset of the next cell. 0 if this is the last cell
drh3aac2dd2004-04-26 14:10:20 +0000126** 4 Page number of the left child. Omitted if leaf flag is set.
127** var Number of bytes of data. Omitted if the zerodata flag is set.
128** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000129** * Payload
130** 4 First page of the overflow chain. Omitted if no overflow
131**
132** Overflow pages form a linked list. Each page except the last is completely
133** filled with data (pagesize - 4 bytes). The last page can have as little
134** as 1 byte of data.
135**
136** SIZE DESCRIPTION
137** 4 Page number of next overflow page
138** * Data
139**
140** Freelist pages come in two subtypes: trunk pages and leaf pages. The
141** file header points to first in a linked list of trunk page. Each trunk
142** page points to multiple leaf pages. The content of a leaf page is
143** unspecified. A trunk page looks like this:
144**
145** SIZE DESCRIPTION
146** 4 Page number of next trunk page
147** 4 Number of leaf pointers on this page
148** * zero or more pages numbers of leaves
drha059ad02001-04-17 20:09:11 +0000149*/
150#include "sqliteInt.h"
151#include "pager.h"
152#include "btree.h"
153#include <assert.h>
154
drh4b70f112004-05-02 21:12:19 +0000155
156/* Maximum page size. The upper bound on this value is 65536 (a limit
157** imposed by the 2-byte offset at the beginning of each cell.) The
158** maximum page size determines the amount of stack space allocated
159** by many of the routines in this module. On embedded architectures
160** or any machine where memory and especially stack memory is limited,
161** one may wish to chose a smaller value for the maximum page size.
162*/
163#ifndef MX_PAGE_SIZE
164# define MX_PAGE_SIZE 1024
165#endif
166
167/* Individual entries or "cells" are limited in size so that at least
168** this many cells will fit on one page. Changing this value will result
169** in an incompatible database.
170*/
171#define MN_CELLS_PER_PAGE 4
172
173/* The following value is the maximum cell size assuming a maximum page
174** size give above.
175*/
176#define MX_CELL_SIZE ((MX_PAGE_SIZE-10)/MN_CELLS_PER_PAGE)
177
178/* The maximum number of cells on a single page of the database. This
179** assumes a minimum cell size of 3 bytes. Such small cells will be
180** exceedingly rare, but they are possible.
181*/
182#define MX_CELL ((MX_PAGE_SIZE-10)/3)
183
paulb95a8862003-04-01 21:16:41 +0000184/* Forward declarations */
drh3aac2dd2004-04-26 14:10:20 +0000185typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000186
drh8c42ca92001-06-22 19:15:00 +0000187/*
drhbd03cae2001-06-02 02:40:57 +0000188** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000189** SQLite database in order to identify the file as a real database.
drhde647132004-05-07 17:57:49 +0000190** 123456789 123456 */
191static const char zMagicHeader[] = "SQLite format 3";
drh08ed44e2001-04-29 23:32:55 +0000192
193/*
drh4b70f112004-05-02 21:12:19 +0000194** Page type flags. An ORed combination of these flags appear as the
195** first byte of every BTree page.
drh8c42ca92001-06-22 19:15:00 +0000196*/
drhde647132004-05-07 17:57:49 +0000197#define PTF_INTKEY 0x01
drh9e572e62004-04-23 23:43:10 +0000198#define PTF_ZERODATA 0x02
drhde647132004-05-07 17:57:49 +0000199#define PTF_LEAF 0x04
drh4b70f112004-05-02 21:12:19 +0000200/* Idea for the future: PTF_LEAFDATA */
drh8c42ca92001-06-22 19:15:00 +0000201
202/*
drh9e572e62004-04-23 23:43:10 +0000203** As each page of the file is loaded into memory, an instance of the following
204** structure is appended and initialized to zero. This structure stores
205** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000206**
drh72f82862001-05-24 21:06:34 +0000207** The pParent field points back to the parent page. This allows us to
208** walk up the BTree from any leaf to the root. Care must be taken to
209** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000210** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000211*/
212struct MemPage {
drhde647132004-05-07 17:57:49 +0000213 u32 notUsed;
drh3aac2dd2004-04-26 14:10:20 +0000214 u8 isInit; /* True if previously initialized */
drh9e572e62004-04-23 23:43:10 +0000215 u8 idxShift; /* True if Cell indices have changed */
drh3aac2dd2004-04-26 14:10:20 +0000216 u8 isOverfull; /* Some aCell[] do not fit on page */
drh9e572e62004-04-23 23:43:10 +0000217 u8 intKey; /* True if intkey flag is set */
218 u8 leaf; /* True if leaf flag is set */
219 u8 zeroData; /* True if zero data flag is set */
220 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
drh3aac2dd2004-04-26 14:10:20 +0000221 int idxParent; /* Index in pParent->aCell[] of this node */
drh9e572e62004-04-23 23:43:10 +0000222 int nFree; /* Number of free bytes on the page */
drh306dc212001-05-21 13:45:10 +0000223 int nCell; /* Number of entries on this page */
drh4b70f112004-05-02 21:12:19 +0000224 int nCellAlloc; /* Number of slots allocated in aCell[] */
drh9e572e62004-04-23 23:43:10 +0000225 unsigned char **aCell; /* Pointer to start of each cell */
drhc8629a12004-05-08 20:07:40 +0000226 struct Btree *pBt; /* Pointer back to BTree structure */
227
228 unsigned char *aData; /* Pointer back to the start of the page */
229 Pgno pgno; /* Page number for this page */
230 MemPage *pParent; /* The parent of this page. NULL for root */
drh8c42ca92001-06-22 19:15:00 +0000231};
drh7e3b0a02001-04-28 16:52:40 +0000232
233/*
drh3b7511c2001-05-26 13:15:44 +0000234** The in-memory image of a disk page has the auxiliary information appended
235** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
236** that extra information.
237*/
drh3aac2dd2004-04-26 14:10:20 +0000238#define EXTRA_SIZE sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000239
240/*
drha059ad02001-04-17 20:09:11 +0000241** Everything we need to know about an open database
242*/
243struct Btree {
244 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000245 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000246 MemPage *pPage1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000247 u8 inTrans; /* True if a transaction is in progress */
drh3aac2dd2004-04-26 14:10:20 +0000248 u8 inStmt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000249 u8 readOnly; /* True if the underlying file is readonly */
drh9e572e62004-04-23 23:43:10 +0000250 int pageSize; /* Number of usable bytes on each page */
251 int maxLocal; /* Maximum local payload */
drha059ad02001-04-17 20:09:11 +0000252};
253typedef Btree Bt;
254
drh365d68f2001-05-11 11:02:46 +0000255/*
256** A cursor is a pointer to a particular entry in the BTree.
257** The entry is identified by its MemPage and the index in
drha34b6762004-05-07 13:30:42 +0000258** MemPage.aCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000259*/
drh72f82862001-05-24 21:06:34 +0000260struct BtCursor {
drh5e2f8b92001-05-28 00:41:15 +0000261 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000262 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000263 BtCursor *pShared; /* Loop of cursors with the same root page */
drh3aac2dd2004-04-26 14:10:20 +0000264 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
265 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000266 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000267 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000268 int idx; /* Index of the entry in pPage->aCell[] */
drhecdc7532001-09-23 02:35:53 +0000269 u8 wrFlag; /* True if writable */
drh23e11ca2004-05-04 17:27:28 +0000270 u8 iMatch; /* compare result from last sqlite3BtreeMoveto() */
drhc39e0002004-05-07 23:50:57 +0000271 u8 isValid; /* TRUE if points to a valid entry */
272 u8 status; /* Set to SQLITE_ABORT if cursors is invalidated */
drh365d68f2001-05-11 11:02:46 +0000273};
drh7e3b0a02001-04-28 16:52:40 +0000274
drha059ad02001-04-17 20:09:11 +0000275/*
drh3aac2dd2004-04-26 14:10:20 +0000276** Read or write a two-, four-, and eight-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000277*/
drh9e572e62004-04-23 23:43:10 +0000278static u32 get2byte(unsigned char *p){
279 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000280}
drh9e572e62004-04-23 23:43:10 +0000281static u32 get4byte(unsigned char *p){
282 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
283}
drh3aac2dd2004-04-26 14:10:20 +0000284static u64 get8byte(unsigned char *p){
drh9e572e62004-04-23 23:43:10 +0000285 u64 v = get4byte(p);
286 return (v<<32) | get4byte(&p[4]);
287}
288static void put2byte(unsigned char *p, u32 v){
289 p[0] = v>>8;
290 p[1] = v;
291}
292static void put4byte(unsigned char *p, u32 v){
293 p[0] = v>>24;
294 p[1] = v>>16;
295 p[2] = v>>8;
296 p[3] = v;
297}
298static void put8byte(unsigned char *p, u64 v){
299 put4byte(&p[4], v>>32);
300 put4byte(p, v);
301}
302
303/*
304** Read a variable-length integer. Store the result in *pResult.
305** Return the number of bytes in the integer.
306*/
307static unsigned int getVarint(unsigned char *p, u64 *pResult){
308 u64 x = p[0] & 0x7f;
309 int n = 0;
310 while( (p[n++]&0x80)!=0 ){
311 x |= (p[n]&0x7f)<<(n*7);
312 }
313 *pResult = x;
314 return n;
315}
316
317/*
318** Write a variable length integer with value v into p[]. Return
319** the number of bytes written.
320*/
321static unsigned int putVarint(unsigned char *p, u64 v){
322 int i = 0;
323 do{
drhde647132004-05-07 17:57:49 +0000324 p[i++] = (v & 0x7f) | 0x80;
drh9e572e62004-04-23 23:43:10 +0000325 v >>= 7;
326 }while( v!=0 );
drhde647132004-05-07 17:57:49 +0000327 p[i-1] &= 0x7f;
drh9e572e62004-04-23 23:43:10 +0000328 return i;
drh0d316a42002-08-11 20:10:47 +0000329}
330
331/*
drh3aac2dd2004-04-26 14:10:20 +0000332** Parse a cell header and fill in the CellInfo structure.
333*/
334static void parseCellHeader(
335 MemPage *pPage, /* Page containing the cell */
336 unsigned char *pCell, /* The cell */
337 u64 *pnData, /* Number of bytes of data in payload */
338 u64 *pnKey, /* Number of bytes of key, or key value for intKey */
339 int *pnHeader /* Size of header in bytes. Offset to payload */
340){
341 int n;
342 if( pPage->leaf ){
343 n = 2;
344 }else{
345 n = 6;
346 }
347 if( pPage->zeroData ){
348 *pnData = 0;
349 }else{
350 n += getVarint(&pCell[n], pnData);
351 }
drhde647132004-05-07 17:57:49 +0000352 n += getVarint(&pCell[n], pnKey);
drh3aac2dd2004-04-26 14:10:20 +0000353 *pnHeader = n;
354}
355
356/*
drh3b7511c2001-05-26 13:15:44 +0000357** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000358** database page. The number returned includes the Cell header,
359** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000360** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000361** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000362*/
drh9e572e62004-04-23 23:43:10 +0000363static int cellSize(MemPage *pPage, unsigned char *pCell){
drh3aac2dd2004-04-26 14:10:20 +0000364 int n;
drh9e572e62004-04-23 23:43:10 +0000365 u64 nData, nKey;
drh3aac2dd2004-04-26 14:10:20 +0000366 int nPayload, maxPayload;
367
368 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
369 nPayload = (int)nData;
370 if( !pPage->intKey ){
371 nPayload += (int)nKey;
drh3b7511c2001-05-26 13:15:44 +0000372 }
drh3aac2dd2004-04-26 14:10:20 +0000373 maxPayload = pPage->pBt->maxLocal;
drh9e572e62004-04-23 23:43:10 +0000374 if( nPayload>maxPayload ){
375 nPayload = maxPayload + 4;
376 }
377 return n + nPayload;
drh3b7511c2001-05-26 13:15:44 +0000378}
379
380/*
drh72f82862001-05-24 21:06:34 +0000381** Defragment the page given. All Cells are moved to the
382** beginning of the page and all free space is collected
383** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000384*/
drh9e572e62004-04-23 23:43:10 +0000385static void defragmentPage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000386 int pc, i, n, addr;
387 int start, hdr, size;
drh9e572e62004-04-23 23:43:10 +0000388 int leftover;
389 unsigned char *oldPage;
drh23e11ca2004-05-04 17:27:28 +0000390 unsigned char newPage[MX_PAGE_SIZE];
drh2af926b2001-05-15 00:39:25 +0000391
drha34b6762004-05-07 13:30:42 +0000392 assert( sqlite3pager_iswriteable(pPage->aData) );
drh9e572e62004-04-23 23:43:10 +0000393 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000394 assert( pPage->pBt->pageSize <= MX_PAGE_SIZE );
drh9e572e62004-04-23 23:43:10 +0000395 oldPage = pPage->aData;
396 hdr = pPage->hdrOffset;
drh3aac2dd2004-04-26 14:10:20 +0000397 addr = 3+hdr;
drh9e572e62004-04-23 23:43:10 +0000398 n = 6+hdr;
399 if( !pPage->leaf ){
400 n += 4;
drh2af926b2001-05-15 00:39:25 +0000401 }
drhc39e0002004-05-07 23:50:57 +0000402 memcpy(&newPage[hdr], &oldPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000403 start = n;
drh3aac2dd2004-04-26 14:10:20 +0000404 pc = get2byte(&oldPage[addr]);
drh9e572e62004-04-23 23:43:10 +0000405 i = 0;
406 while( pc>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000407 assert( n<pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000408 size = cellSize(pPage, &oldPage[pc]);
409 memcpy(&newPage[n], &oldPage[pc], size);
drh3aac2dd2004-04-26 14:10:20 +0000410 put2byte(&newPage[addr],n);
drhc39e0002004-05-07 23:50:57 +0000411 pPage->aCell[i++] = &oldPage[n];
drh9e572e62004-04-23 23:43:10 +0000412 n += size;
drh3aac2dd2004-04-26 14:10:20 +0000413 addr = pc;
drh9e572e62004-04-23 23:43:10 +0000414 pc = get2byte(&oldPage[pc]);
drh2aa679f2001-06-25 02:11:07 +0000415 }
drhc39e0002004-05-07 23:50:57 +0000416 assert( i==pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +0000417 leftover = pPage->pBt->pageSize - n;
drh9e572e62004-04-23 23:43:10 +0000418 assert( leftover>=0 );
419 assert( pPage->nFree==leftover );
420 if( leftover<4 ){
421 oldPage[hdr+5] = leftover;
422 leftover = 0;
drh3aac2dd2004-04-26 14:10:20 +0000423 n = pPage->pBt->pageSize;
drh9e572e62004-04-23 23:43:10 +0000424 }
drhc39e0002004-05-07 23:50:57 +0000425 memcpy(&oldPage[hdr], &newPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000426 if( leftover==0 ){
drhc39e0002004-05-07 23:50:57 +0000427 put2byte(&oldPage[hdr+1], 0);
drh9e572e62004-04-23 23:43:10 +0000428 }else if( leftover>=4 ){
drhc39e0002004-05-07 23:50:57 +0000429 put2byte(&oldPage[hdr+1], n);
drh9e572e62004-04-23 23:43:10 +0000430 put2byte(&oldPage[n], 0);
431 put2byte(&oldPage[n+2], leftover);
432 memset(&oldPage[n+4], 0, leftover-4);
433 }
drhc39e0002004-05-07 23:50:57 +0000434 oldPage[hdr+5] = 0;
drh365d68f2001-05-11 11:02:46 +0000435}
436
drha059ad02001-04-17 20:09:11 +0000437/*
drh9e572e62004-04-23 23:43:10 +0000438** Allocate nByte bytes of space on a page. If nByte is less than
439** 4 it is rounded up to 4.
drhbd03cae2001-06-02 02:40:57 +0000440**
drh9e572e62004-04-23 23:43:10 +0000441** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000442** the new allocation. Or return 0 if there is not enough free
443** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000444**
drh72f82862001-05-24 21:06:34 +0000445** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000446** nBytes of contiguous free space, then this routine automatically
447** calls defragementPage() to consolidate all free space before
448** allocating the new chunk.
drh9e572e62004-04-23 23:43:10 +0000449**
450** Algorithm: Carve a piece off of the first freeblock that is
451** nByte in size or that larger.
drh7e3b0a02001-04-28 16:52:40 +0000452*/
drh9e572e62004-04-23 23:43:10 +0000453static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000454 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000455 int size;
456 unsigned char *data;
drh44ce7e22003-06-17 02:57:17 +0000457#ifndef NDEBUG
458 int cnt = 0;
459#endif
drh72f82862001-05-24 21:06:34 +0000460
drh9e572e62004-04-23 23:43:10 +0000461 data = pPage->aData;
drha34b6762004-05-07 13:30:42 +0000462 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000463 assert( pPage->pBt );
464 if( nByte<4 ) nByte = 4;
drh14acc042001-06-10 19:56:58 +0000465 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
drh9e572e62004-04-23 23:43:10 +0000466 hdr = pPage->hdrOffset;
drh91025292004-05-03 19:49:32 +0000467 if( data[hdr+5]>=60 ){
drh9e572e62004-04-23 23:43:10 +0000468 defragmentPage(pPage);
drh2af926b2001-05-15 00:39:25 +0000469 }
drh3aac2dd2004-04-26 14:10:20 +0000470 addr = hdr+1;
471 pc = get2byte(&data[addr]);
472 assert( addr<pc );
drha34b6762004-05-07 13:30:42 +0000473 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000474 while( (size = get2byte(&data[pc+2]))<nByte ){
475 addr = pc;
476 pc = get2byte(&data[addr]);
drha34b6762004-05-07 13:30:42 +0000477 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000478 assert( pc>=addr+size+4 || pc==0 );
drh9e572e62004-04-23 23:43:10 +0000479 if( pc==0 ){
480 assert( (cnt++)==0 );
481 defragmentPage(pPage);
482 assert( data[hdr+5]==0 );
drh3aac2dd2004-04-26 14:10:20 +0000483 addr = pPage->hdrOffset+1;
484 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000485 }
486 }
487 assert( pc>0 && size>=nByte );
drha34b6762004-05-07 13:30:42 +0000488 assert( pc+size<=pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000489 if( size>nByte+4 ){
drhde647132004-05-07 17:57:49 +0000490 int newStart = pc+nByte;
491 put2byte(&data[addr], newStart);
492 put2byte(&data[newStart], get2byte(&data[pc]));
493 put2byte(&data[newStart+2], size-nByte);
drh2af926b2001-05-15 00:39:25 +0000494 }else{
drh3aac2dd2004-04-26 14:10:20 +0000495 put2byte(&data[addr], get2byte(&data[pc]));
drh9e572e62004-04-23 23:43:10 +0000496 data[hdr+5] += size-nByte;
drh2af926b2001-05-15 00:39:25 +0000497 }
498 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000499 assert( pPage->nFree>=0 );
500 return pc;
drh7e3b0a02001-04-28 16:52:40 +0000501}
502
503/*
drh9e572e62004-04-23 23:43:10 +0000504** Return a section of the pPage->aData to the freelist.
505** The first byte of the new free block is pPage->aDisk[start]
506** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000507**
508** Most of the effort here is involved in coalesing adjacent
509** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000510*/
drh9e572e62004-04-23 23:43:10 +0000511static void freeSpace(MemPage *pPage, int start, int size){
512 int end = start + size; /* End of the segment being freed */
drha34b6762004-05-07 13:30:42 +0000513 int addr, pbegin;
drh9e572e62004-04-23 23:43:10 +0000514#ifndef NDEBUG
515 int tsize = 0; /* Total size of all freeblocks */
516#endif
517 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000518
drh9e572e62004-04-23 23:43:10 +0000519 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000520 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000521 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
522 assert( end<=pPage->pBt->pageSize );
523 if( size<4 ) size = 4;
524
525 /* Add the space back into the linked list of freeblocks */
drh3aac2dd2004-04-26 14:10:20 +0000526 addr = pPage->hdrOffset + 1;
527 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drh9e572e62004-04-23 23:43:10 +0000528 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000529 assert( pbegin>addr );
530 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000531 }
drh9e572e62004-04-23 23:43:10 +0000532 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000533 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000534 put2byte(&data[addr], start);
535 put2byte(&data[start], pbegin);
536 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000537 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000538
539 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000540 addr = pPage->hdrOffset + 1;
541 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000542 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000543 assert( pbegin>addr );
drh9e572e62004-04-23 23:43:10 +0000544 assert( pbegin<pPage->pBt->pageSize-4 );
545 pnext = get2byte(&data[pbegin]);
546 psize = get2byte(&data[pbegin+2]);
547 if( pbegin + psize + 3 >= pnext && pnext>0 ){
548 int frag = pnext - (pbegin+psize);
549 assert( frag<=data[pPage->hdrOffset+5] );
550 data[pPage->hdrOffset+5] -= frag;
551 put2byte(&data[pbegin], get2byte(&data[pnext]));
552 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
553 }else{
554 assert( (tsize += psize)>0 );
drh3aac2dd2004-04-26 14:10:20 +0000555 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000556 }
557 }
558 assert( tsize+data[pPage->hdrOffset+5]==pPage->nFree );
drh7e3b0a02001-04-28 16:52:40 +0000559}
560
drh9e572e62004-04-23 23:43:10 +0000561/*
drh4b70f112004-05-02 21:12:19 +0000562** Resize the aCell[] array of the given page so that it is able to
563** hold at least nNewSz entries.
564**
565** Return SQLITE_OK or SQLITE_NOMEM.
566*/
567static int resizeCellArray(MemPage *pPage, int nNewSz){
drha34b6762004-05-07 13:30:42 +0000568 if( pPage->nCellAlloc<nNewSz ){
drh4b70f112004-05-02 21:12:19 +0000569 pPage->aCell = sqliteRealloc(pPage->aCell, nNewSz*sizeof(pPage->aCell[0]) );
570 if( sqlite_malloc_failed ) return SQLITE_NOMEM;
drha34b6762004-05-07 13:30:42 +0000571 pPage->nCellAlloc = nNewSz;
drh4b70f112004-05-02 21:12:19 +0000572 }
573 return SQLITE_OK;
574}
575
576/*
drh7e3b0a02001-04-28 16:52:40 +0000577** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000578**
drhbd03cae2001-06-02 02:40:57 +0000579** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +0000580** is the parent of the page being initialized. The root of a
581** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000582**
drh72f82862001-05-24 21:06:34 +0000583** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000584** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000585** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
586** guarantee that the page is well-formed. It only shows that
587** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000588*/
drh9e572e62004-04-23 23:43:10 +0000589static int initPage(
drh3aac2dd2004-04-26 14:10:20 +0000590 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +0000591 MemPage *pParent /* The parent. Might be NULL */
592){
drh3aac2dd2004-04-26 14:10:20 +0000593 int c, pc, i, hdr;
drha34b6762004-05-07 13:30:42 +0000594 unsigned char *data;
595 int pageSize;
drh9e572e62004-04-23 23:43:10 +0000596 int sumCell = 0; /* Total size of all cells */
drh2af926b2001-05-15 00:39:25 +0000597
drh3aac2dd2004-04-26 14:10:20 +0000598 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000599 assert( pParent==0 || pParent->pBt==pPage->pBt );
drha34b6762004-05-07 13:30:42 +0000600 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
drhde647132004-05-07 17:57:49 +0000601 assert( pPage->aData == &((unsigned char*)pPage)[-pPage->pBt->pageSize] );
drhc8629a12004-05-08 20:07:40 +0000602 assert( pPage->pParent==0 || pPage->pParent==pParent );
603 if( pPage->pParent==0 && pParent!=0 ){
604 pPage->pParent = pParent;
drha34b6762004-05-07 13:30:42 +0000605 sqlite3pager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +0000606 }
drhc8629a12004-05-08 20:07:40 +0000607 if( pPage->isInit ) return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +0000608 pPage->nCell = pPage->nCellAlloc = 0;
drhde647132004-05-07 17:57:49 +0000609 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
610 hdr = pPage->hdrOffset;
drha34b6762004-05-07 13:30:42 +0000611 data = pPage->aData;
612 c = data[hdr];
drh9e572e62004-04-23 23:43:10 +0000613 pPage->intKey = (c & PTF_INTKEY)!=0;
614 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh4b70f112004-05-02 21:12:19 +0000615 pPage->leaf = (c & PTF_LEAF)!=0;
drhc8629a12004-05-08 20:07:40 +0000616 pPage->isOverfull = 0;
617 pPage->idxShift = 0;
drha34b6762004-05-07 13:30:42 +0000618 pageSize = pPage->pBt->pageSize;
drh2af926b2001-05-15 00:39:25 +0000619
drh9e572e62004-04-23 23:43:10 +0000620 /* Initialize the cell count and cell pointers */
621 pc = get2byte(&data[hdr+3]);
622 while( pc>0 ){
drha34b6762004-05-07 13:30:42 +0000623 if( pc>=pageSize ) return SQLITE_CORRUPT;
624 if( pPage->nCell>pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000625 pPage->nCell++;
626 pc = get2byte(&data[pc]);
627 }
drh4b70f112004-05-02 21:12:19 +0000628 if( resizeCellArray(pPage, pPage->nCell) ){
drh9e572e62004-04-23 23:43:10 +0000629 return SQLITE_NOMEM;
630 }
631 pc = get2byte(&data[hdr+3]);
632 for(i=0; pc>0; i++){
633 pPage->aCell[i] = &data[pc];
drh9e572e62004-04-23 23:43:10 +0000634 sumCell += cellSize(pPage, &data[pc]);
drhde647132004-05-07 17:57:49 +0000635 pc = get2byte(&data[pc]);
drh9e572e62004-04-23 23:43:10 +0000636 }
637
638 /* Compute the total free space on the page */
639 pPage->nFree = data[hdr+5];
640 pc = get2byte(&data[hdr+1]);
641 while( pc>0 ){
642 int next, size;
drha34b6762004-05-07 13:30:42 +0000643 if( pc>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000644 next = get2byte(&data[pc]);
645 size = get2byte(&data[pc+2]);
drh3aac2dd2004-04-26 14:10:20 +0000646 if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000647 pPage->nFree += size;
648 pc = next;
649 }
drha34b6762004-05-07 13:30:42 +0000650 if( pPage->nFree>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000651
652 /* Sanity check: Cells and freespace and header must sum to the size
653 ** a page. */
drha34b6762004-05-07 13:30:42 +0000654 if( sumCell+pPage->nFree+hdr+10-pPage->leaf*4 != pageSize ){
655 return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000656 }
657
drhde647132004-05-07 17:57:49 +0000658 pPage->isInit = 1;
drh9e572e62004-04-23 23:43:10 +0000659 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000660}
661
662/*
drh8b2f49b2001-06-08 00:21:52 +0000663** Set up a raw page so that it looks like a database page holding
664** no entries.
drhbd03cae2001-06-02 02:40:57 +0000665*/
drh9e572e62004-04-23 23:43:10 +0000666static void zeroPage(MemPage *pPage, int flags){
667 unsigned char *data = pPage->aData;
668 Btree *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +0000669 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +0000670 int first;
671
drhc8629a12004-05-08 20:07:40 +0000672 assert( sqlite3pager_pagenumber(data)==pPage->pgno );
673 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000674 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000675 memset(&data[hdr], 0, pBt->pageSize - hdr);
676 data[hdr] = flags;
drhde647132004-05-07 17:57:49 +0000677 first = hdr + 6 + 4*((flags&PTF_LEAF)==0);
drh9e572e62004-04-23 23:43:10 +0000678 put2byte(&data[hdr+1], first);
679 put2byte(&data[first+2], pBt->pageSize - first);
680 sqliteFree(pPage->aCell);
681 pPage->aCell = 0;
drh8c42ca92001-06-22 19:15:00 +0000682 pPage->nCell = 0;
drhde647132004-05-07 17:57:49 +0000683 pPage->nCellAlloc = 0;
drh9e572e62004-04-23 23:43:10 +0000684 pPage->nFree = pBt->pageSize - first;
685 pPage->intKey = (flags & PTF_INTKEY)!=0;
686 pPage->leaf = (flags & PTF_LEAF)!=0;
687 pPage->zeroData = (flags & PTF_ZERODATA)!=0;
688 pPage->hdrOffset = hdr;
drhc8629a12004-05-08 20:07:40 +0000689 pPage->isOverfull = 0;
690 pPage->idxShift = 0;
691 pPage->isInit = 1;
drhbd03cae2001-06-02 02:40:57 +0000692}
693
694/*
drh3aac2dd2004-04-26 14:10:20 +0000695** Get a page from the pager. Initialize the MemPage.pBt and
696** MemPage.aData elements if needed.
697*/
698static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
699 int rc;
700 unsigned char *aData;
701 MemPage *pPage;
drha34b6762004-05-07 13:30:42 +0000702 rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
drh3aac2dd2004-04-26 14:10:20 +0000703 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +0000704 pPage = (MemPage*)&aData[pBt->pageSize];
drh3aac2dd2004-04-26 14:10:20 +0000705 pPage->aData = aData;
706 pPage->pBt = pBt;
707 pPage->pgno = pgno;
drhde647132004-05-07 17:57:49 +0000708 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
drh3aac2dd2004-04-26 14:10:20 +0000709 *ppPage = pPage;
710 return SQLITE_OK;
711}
712
713/*
drhde647132004-05-07 17:57:49 +0000714** Get a page from the pager and initialize it. This routine
715** is just a convenience wrapper around separate calls to
716** getPage() and initPage().
717*/
718static int getAndInitPage(
719 Btree *pBt, /* The database file */
720 Pgno pgno, /* Number of the page to get */
721 MemPage **ppPage, /* Write the page pointer here */
722 MemPage *pParent /* Parent of the page */
723){
724 int rc;
725 rc = getPage(pBt, pgno, ppPage);
726 if( rc==SQLITE_OK ){
727 rc = initPage(*ppPage, pParent);
728 }
729 return rc;
730}
731
732/*
drh3aac2dd2004-04-26 14:10:20 +0000733** Release a MemPage. This should be called once for each prior
734** call to getPage.
735*/
drh4b70f112004-05-02 21:12:19 +0000736static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000737 if( pPage ){
738 assert( pPage->aData );
739 assert( pPage->pBt );
740 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000741 sqlite3pager_unref(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +0000742 }
743}
744
745/*
drh72f82862001-05-24 21:06:34 +0000746** This routine is called when the reference count for a page
747** reaches zero. We need to unref the pParent pointer when that
748** happens.
749*/
750static void pageDestructor(void *pData){
drh9e572e62004-04-23 23:43:10 +0000751 MemPage *pPage = (MemPage*)&((char*)pData)[SQLITE_PAGE_SIZE];
drh72f82862001-05-24 21:06:34 +0000752 if( pPage->pParent ){
753 MemPage *pParent = pPage->pParent;
754 pPage->pParent = 0;
drha34b6762004-05-07 13:30:42 +0000755 releasePage(pParent);
drh72f82862001-05-24 21:06:34 +0000756 }
drh9e572e62004-04-23 23:43:10 +0000757 sqliteFree(pPage->aCell);
758 pPage->aCell = 0;
drh3aac2dd2004-04-26 14:10:20 +0000759 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +0000760}
761
762/*
drh306dc212001-05-21 13:45:10 +0000763** Open a new database.
764**
765** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000766** for accessing the database. We do not open the database file
767** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000768**
769** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000770** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +0000771** database file will be deleted when sqlite3BtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000772*/
drh23e11ca2004-05-04 17:27:28 +0000773int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +0000774 const char *zFilename, /* Name of the file containing the BTree database */
775 Btree **ppBtree, /* Pointer to new Btree object written here */
776 int nCache, /* Number of cache pages */
777 int flags /* Options */
drh6019e162001-07-02 17:51:45 +0000778){
drha059ad02001-04-17 20:09:11 +0000779 Btree *pBt;
drha34b6762004-05-07 13:30:42 +0000780 int rc;
drha059ad02001-04-17 20:09:11 +0000781
drhd62d3d02003-01-24 12:14:20 +0000782 /*
783 ** The following asserts make sure that structures used by the btree are
784 ** the right size. This is to guard against size changes that result
785 ** when compiling on a different architecture.
786 */
drh9e572e62004-04-23 23:43:10 +0000787 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +0000788 assert( sizeof(u32)==4 );
789 assert( sizeof(u16)==2 );
790 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +0000791 assert( sizeof(ptr)==sizeof(char*) );
792 assert( sizeof(uptr)==sizeof(ptr) );
793
drha059ad02001-04-17 20:09:11 +0000794 pBt = sqliteMalloc( sizeof(*pBt) );
795 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000796 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000797 return SQLITE_NOMEM;
798 }
drh6019e162001-07-02 17:51:45 +0000799 if( nCache<10 ) nCache = 10;
drha34b6762004-05-07 13:30:42 +0000800 rc = sqlite3pager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
drh3aac2dd2004-04-26 14:10:20 +0000801 (flags & BTREE_OMIT_JOURNAL)==0);
drha059ad02001-04-17 20:09:11 +0000802 if( rc!=SQLITE_OK ){
drha34b6762004-05-07 13:30:42 +0000803 if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000804 sqliteFree(pBt);
805 *ppBtree = 0;
806 return rc;
807 }
drha34b6762004-05-07 13:30:42 +0000808 sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000809 pBt->pCursor = 0;
drha34b6762004-05-07 13:30:42 +0000810 pBt->pPage1 = 0;
811 pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
drh4b70f112004-05-02 21:12:19 +0000812 pBt->pageSize = SQLITE_PAGE_SIZE; /* FIX ME - read from header */
813 pBt->maxLocal = (pBt->pageSize-10)/4-12;
drha059ad02001-04-17 20:09:11 +0000814 *ppBtree = pBt;
815 return SQLITE_OK;
816}
817
818/*
819** Close an open database and invalidate all cursors.
820*/
drh3aac2dd2004-04-26 14:10:20 +0000821int sqlite3BtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000822 while( pBt->pCursor ){
drh3aac2dd2004-04-26 14:10:20 +0000823 sqlite3BtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000824 }
drha34b6762004-05-07 13:30:42 +0000825 sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000826 sqliteFree(pBt);
827 return SQLITE_OK;
828}
829
830/*
drhda47d772002-12-02 04:25:19 +0000831** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000832**
833** The maximum number of cache pages is set to the absolute
834** value of mxPage. If mxPage is negative, the pager will
835** operate asynchronously - it will not stop to do fsync()s
836** to insure data is written to the disk surface before
837** continuing. Transactions still work if synchronous is off,
838** and the database cannot be corrupted if this program
839** crashes. But if the operating system crashes or there is
840** an abrupt power failure when synchronous is off, the database
841** could be left in an inconsistent and unrecoverable state.
842** Synchronous is on by default so database corruption is not
843** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000844*/
drh23e11ca2004-05-04 17:27:28 +0000845int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
drha34b6762004-05-07 13:30:42 +0000846 sqlite3pager_set_cachesize(pBt->pPager, mxPage);
drhf57b14a2001-09-14 18:54:08 +0000847 return SQLITE_OK;
848}
849
850/*
drh973b6e32003-02-12 14:09:42 +0000851** Change the way data is synced to disk in order to increase or decrease
852** how well the database resists damage due to OS crashes and power
853** failures. Level 1 is the same as asynchronous (no syncs() occur and
854** there is a high probability of damage) Level 2 is the default. There
855** is a very low but non-zero probability of damage. Level 3 reduces the
856** probability of damage to near zero but with a write performance reduction.
857*/
drh3aac2dd2004-04-26 14:10:20 +0000858int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
drha34b6762004-05-07 13:30:42 +0000859 sqlite3pager_set_safety_level(pBt->pPager, level);
drh973b6e32003-02-12 14:09:42 +0000860 return SQLITE_OK;
861}
862
863/*
drha34b6762004-05-07 13:30:42 +0000864** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +0000865** also acquire a readlock on that file.
866**
867** SQLITE_OK is returned on success. If the file is not a
868** well-formed database file, then SQLITE_CORRUPT is returned.
869** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
870** is returned if we run out of memory. SQLITE_PROTOCOL is returned
871** if there is a locking protocol violation.
872*/
873static int lockBtree(Btree *pBt){
874 int rc;
drh3aac2dd2004-04-26 14:10:20 +0000875 MemPage *pPage1;
drha34b6762004-05-07 13:30:42 +0000876 if( pBt->pPage1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000877 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +0000878 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +0000879
drh306dc212001-05-21 13:45:10 +0000880
881 /* Do some checking to help insure the file we opened really is
882 ** a valid database file.
883 */
drha34b6762004-05-07 13:30:42 +0000884 if( sqlite3pager_pagecount(pBt->pPager)>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000885 if( memcmp(pPage1->aData, zMagicHeader, 16)!=0 ){
drhc602f9a2004-02-12 19:01:04 +0000886 rc = SQLITE_NOTADB;
drh72f82862001-05-24 21:06:34 +0000887 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +0000888 }
drh9e572e62004-04-23 23:43:10 +0000889 /*** TBD: Other header checks such as page size ****/
drh306dc212001-05-21 13:45:10 +0000890 }
drh3aac2dd2004-04-26 14:10:20 +0000891 pBt->pPage1 = pPage1;
drh306dc212001-05-21 13:45:10 +0000892 return rc;
893
drh72f82862001-05-24 21:06:34 +0000894page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +0000895 releasePage(pPage1);
896 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +0000897 return rc;
drh306dc212001-05-21 13:45:10 +0000898}
899
900/*
drhb8ca3072001-12-05 00:21:20 +0000901** If there are no outstanding cursors and we are not in the middle
902** of a transaction but there is a read lock on the database, then
903** this routine unrefs the first page of the database file which
904** has the effect of releasing the read lock.
905**
906** If there are any outstanding cursors, this routine is a no-op.
907**
908** If there is a transaction in progress, this routine is a no-op.
909*/
910static void unlockBtreeIfUnused(Btree *pBt){
drh3aac2dd2004-04-26 14:10:20 +0000911 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->pPage1!=0 ){
912 releasePage(pBt->pPage1);
913 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +0000914 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +0000915 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +0000916 }
917}
918
919/*
drh9e572e62004-04-23 23:43:10 +0000920** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +0000921** file.
drh8b2f49b2001-06-08 00:21:52 +0000922*/
923static int newDatabase(Btree *pBt){
drh9e572e62004-04-23 23:43:10 +0000924 MemPage *pP1;
925 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +0000926 int rc;
drhde647132004-05-07 17:57:49 +0000927 if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000928 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +0000929 assert( pP1!=0 );
930 data = pP1->aData;
drha34b6762004-05-07 13:30:42 +0000931 rc = sqlite3pager_write(data);
drh8b2f49b2001-06-08 00:21:52 +0000932 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +0000933 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
934 assert( sizeof(zMagicHeader)==16 );
935 put2byte(&data[16], SQLITE_PAGE_SIZE);
936 data[18] = 1;
937 data[19] = 1;
938 put2byte(&data[22], (SQLITE_PAGE_SIZE-10)/4-12);
939 zeroPage(pP1, PTF_INTKEY|PTF_LEAF);
drh8b2f49b2001-06-08 00:21:52 +0000940 return SQLITE_OK;
941}
942
943/*
drh72f82862001-05-24 21:06:34 +0000944** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +0000945**
946** A transaction must be started before attempting any changes
947** to the database. None of the following routines will work
948** unless a transaction is started first:
949**
drh23e11ca2004-05-04 17:27:28 +0000950** sqlite3BtreeCreateTable()
951** sqlite3BtreeCreateIndex()
952** sqlite3BtreeClearTable()
953** sqlite3BtreeDropTable()
954** sqlite3BtreeInsert()
955** sqlite3BtreeDelete()
956** sqlite3BtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +0000957*/
drh3aac2dd2004-04-26 14:10:20 +0000958int sqlite3BtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000959 int rc;
960 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +0000961 if( pBt->readOnly ) return SQLITE_READONLY;
drh3aac2dd2004-04-26 14:10:20 +0000962 if( pBt->pPage1==0 ){
drh7e3b0a02001-04-28 16:52:40 +0000963 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +0000964 if( rc!=SQLITE_OK ){
965 return rc;
966 }
drha059ad02001-04-17 20:09:11 +0000967 }
drha34b6762004-05-07 13:30:42 +0000968 rc = sqlite3pager_begin(pBt->pPage1->aData);
drhf74b8d92002-09-01 23:20:45 +0000969 if( rc==SQLITE_OK ){
970 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +0000971 }
drhb8ca3072001-12-05 00:21:20 +0000972 if( rc==SQLITE_OK ){
973 pBt->inTrans = 1;
drh3aac2dd2004-04-26 14:10:20 +0000974 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +0000975 }else{
976 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +0000977 }
drhb8ca3072001-12-05 00:21:20 +0000978 return rc;
drha059ad02001-04-17 20:09:11 +0000979}
980
981/*
drh2aa679f2001-06-25 02:11:07 +0000982** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +0000983**
984** This will release the write lock on the database file. If there
985** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +0000986*/
drh3aac2dd2004-04-26 14:10:20 +0000987int sqlite3BtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000988 int rc;
drha34b6762004-05-07 13:30:42 +0000989 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +0000990 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +0000991 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +0000992 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +0000993 return rc;
994}
995
996/*
drhc39e0002004-05-07 23:50:57 +0000997** Invalidate all cursors
998*/
999static void invalidateCursors(Btree *pBt){
1000 BtCursor *pCur;
1001 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1002 MemPage *pPage = pCur->pPage;
1003 if( pPage && !pPage->isInit ){
1004 releasePage(pPage);
1005 pCur->pPage = 0;
1006 pCur->isValid = 0;
1007 pCur->status = SQLITE_ABORT;
1008 }
1009 }
1010}
1011
drhc8629a12004-05-08 20:07:40 +00001012#ifdef SQLITE_TEST
1013/*
1014** Print debugging information about all cursors to standard output.
1015*/
1016void sqlite3BtreeCursorList(Btree *pBt){
1017 BtCursor *pCur;
1018 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1019 MemPage *pPage = pCur->pPage;
1020 char *zMode = pCur->wrFlag ? "rw" : "ro";
1021 printf("CURSOR %08x rooted at %4d(%s) currently at %d.%d%s\n",
1022 (int)pCur, pCur->pgnoRoot, zMode,
1023 pPage ? pPage->pgno : 0, pCur->idx,
1024 pCur->isValid ? "" : " eof"
1025 );
1026 }
1027}
1028#endif
1029
drhc39e0002004-05-07 23:50:57 +00001030/*
drhecdc7532001-09-23 02:35:53 +00001031** Rollback the transaction in progress. All cursors will be
1032** invalided by this operation. Any attempt to use a cursor
1033** that was open at the beginning of this operation will result
1034** in an error.
drh5e00f6c2001-09-13 13:46:56 +00001035**
1036** This will release the write lock on the database file. If there
1037** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001038*/
drh3aac2dd2004-04-26 14:10:20 +00001039int sqlite3BtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001040 int rc;
drh7c717f72001-06-24 20:39:41 +00001041 if( pBt->inTrans==0 ) return SQLITE_OK;
1042 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001043 pBt->inStmt = 0;
drha34b6762004-05-07 13:30:42 +00001044 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_rollback(pBt->pPager);
drhc39e0002004-05-07 23:50:57 +00001045 invalidateCursors(pBt);
drh5e00f6c2001-09-13 13:46:56 +00001046 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001047 return rc;
1048}
1049
1050/*
drh663fc632002-02-02 18:49:19 +00001051** Set the checkpoint for the current transaction. The checkpoint serves
1052** as a sub-transaction that can be rolled back independently of the
1053** main transaction. You must start a transaction before starting a
1054** checkpoint. The checkpoint is ended automatically if the transaction
1055** commits or rolls back.
1056**
1057** Only one checkpoint may be active at a time. It is an error to try
1058** to start a new checkpoint if another checkpoint is already active.
1059*/
drh3aac2dd2004-04-26 14:10:20 +00001060int sqlite3BtreeBeginStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001061 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001062 if( !pBt->inTrans || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00001063 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00001064 }
drha34b6762004-05-07 13:30:42 +00001065 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00001066 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00001067 return rc;
1068}
1069
1070
1071/*
1072** Commit a checkpoint to transaction currently in progress. If no
1073** checkpoint is active, this is a no-op.
1074*/
drh3aac2dd2004-04-26 14:10:20 +00001075int sqlite3BtreeCommitStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001076 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001077 if( pBt->inStmt && !pBt->readOnly ){
drha34b6762004-05-07 13:30:42 +00001078 rc = sqlite3pager_stmt_commit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00001079 }else{
1080 rc = SQLITE_OK;
1081 }
drh3aac2dd2004-04-26 14:10:20 +00001082 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001083 return rc;
1084}
1085
1086/*
1087** Rollback the checkpoint to the current transaction. If there
1088** is no active checkpoint or transaction, this routine is a no-op.
1089**
1090** All cursors will be invalided by this operation. Any attempt
1091** to use a cursor that was open at the beginning of this operation
1092** will result in an error.
1093*/
drh3aac2dd2004-04-26 14:10:20 +00001094int sqlite3BtreeRollbackStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001095 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001096 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drha34b6762004-05-07 13:30:42 +00001097 rc = sqlite3pager_stmt_rollback(pBt->pPager);
drhc39e0002004-05-07 23:50:57 +00001098 invalidateCursors(pBt);
drh3aac2dd2004-04-26 14:10:20 +00001099 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001100 return rc;
1101}
1102
1103/*
drh3aac2dd2004-04-26 14:10:20 +00001104** Default key comparison function to be used if no comparison function
1105** is specified on the sqlite3BtreeCursor() call.
1106*/
1107static int dfltCompare(
1108 void *NotUsed, /* User data is not used */
1109 int n1, const void *p1, /* First key to compare */
1110 int n2, const void *p2 /* Second key to compare */
1111){
1112 int c;
1113 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1114 if( c==0 ){
1115 c = n1 - n2;
1116 }
1117 return c;
1118}
1119
1120/*
drh8b2f49b2001-06-08 00:21:52 +00001121** Create a new cursor for the BTree whose root is on the page
1122** iTable. The act of acquiring a cursor gets a read lock on
1123** the database file.
drh1bee3d72001-10-15 00:44:35 +00001124**
1125** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001126** If wrFlag==1, then the cursor can be used for reading or for
1127** writing if other conditions for writing are also met. These
1128** are the conditions that must be met in order for writing to
1129** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001130**
drhf74b8d92002-09-01 23:20:45 +00001131** 1: The cursor must have been opened with wrFlag==1
1132**
1133** 2: No other cursors may be open with wrFlag==0 on the same table
1134**
1135** 3: The database must be writable (not on read-only media)
1136**
1137** 4: There must be an active transaction.
1138**
1139** Condition 2 warrants further discussion. If any cursor is opened
1140** on a table with wrFlag==0, that prevents all other cursors from
1141** writing to that table. This is a kind of "read-lock". When a cursor
1142** is opened with wrFlag==0 it is guaranteed that the table will not
1143** change as long as the cursor is open. This allows the cursor to
1144** do a sequential scan of the table without having to worry about
1145** entries being inserted or deleted during the scan. Cursors should
1146** be opened with wrFlag==0 only if this read-lock property is needed.
1147** That is to say, cursors should be opened with wrFlag==0 only if they
drh23e11ca2004-05-04 17:27:28 +00001148** intend to use the sqlite3BtreeNext() system call. All other cursors
drhf74b8d92002-09-01 23:20:45 +00001149** should be opened with wrFlag==1 even if they never really intend
1150** to write.
1151**
drh6446c4d2001-12-15 14:22:18 +00001152** No checking is done to make sure that page iTable really is the
1153** root page of a b-tree. If it is not, then the cursor acquired
1154** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00001155**
1156** The comparison function must be logically the same for every cursor
1157** on a particular table. Changing the comparison function will result
1158** in incorrect operations. If the comparison function is NULL, a
1159** default comparison function is used. The comparison function is
1160** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00001161*/
drh3aac2dd2004-04-26 14:10:20 +00001162int sqlite3BtreeCursor(
1163 Btree *pBt, /* The btree */
1164 int iTable, /* Root page of table to open */
1165 int wrFlag, /* 1 to write. 0 read-only */
1166 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1167 void *pArg, /* First arg to xCompare() */
1168 BtCursor **ppCur /* Write new cursor here */
1169){
drha059ad02001-04-17 20:09:11 +00001170 int rc;
drhf74b8d92002-09-01 23:20:45 +00001171 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001172
drha0c9a112004-03-10 13:42:37 +00001173 if( pBt->readOnly && wrFlag ){
1174 *ppCur = 0;
1175 return SQLITE_READONLY;
1176 }
drh4b70f112004-05-02 21:12:19 +00001177 if( pBt->pPage1==0 ){
drha059ad02001-04-17 20:09:11 +00001178 rc = lockBtree(pBt);
1179 if( rc!=SQLITE_OK ){
1180 *ppCur = 0;
1181 return rc;
1182 }
1183 }
1184 pCur = sqliteMalloc( sizeof(*pCur) );
1185 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001186 rc = SQLITE_NOMEM;
1187 goto create_cursor_exception;
1188 }
drh8b2f49b2001-06-08 00:21:52 +00001189 pCur->pgnoRoot = (Pgno)iTable;
drhde647132004-05-07 17:57:49 +00001190 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00001191 if( rc!=SQLITE_OK ){
1192 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001193 }
drh3aac2dd2004-04-26 14:10:20 +00001194 pCur->xCompare = xCmp ? xCmp : dfltCompare;
1195 pCur->pArg = pArg;
drh14acc042001-06-10 19:56:58 +00001196 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001197 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001198 pCur->idx = 0;
drha059ad02001-04-17 20:09:11 +00001199 pCur->pNext = pBt->pCursor;
1200 if( pCur->pNext ){
1201 pCur->pNext->pPrev = pCur;
1202 }
drh14acc042001-06-10 19:56:58 +00001203 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001204 pRing = pBt->pCursor;
1205 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1206 if( pRing ){
1207 pCur->pShared = pRing->pShared;
1208 pRing->pShared = pCur;
1209 }else{
1210 pCur->pShared = pCur;
1211 }
drha059ad02001-04-17 20:09:11 +00001212 pBt->pCursor = pCur;
drhc39e0002004-05-07 23:50:57 +00001213 pCur->isValid = 0;
1214 pCur->status = SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001215 *ppCur = pCur;
1216 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001217
1218create_cursor_exception:
1219 *ppCur = 0;
1220 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00001221 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001222 sqliteFree(pCur);
1223 }
drh5e00f6c2001-09-13 13:46:56 +00001224 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001225 return rc;
drha059ad02001-04-17 20:09:11 +00001226}
1227
1228/*
drh5e00f6c2001-09-13 13:46:56 +00001229** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001230** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001231*/
drh3aac2dd2004-04-26 14:10:20 +00001232int sqlite3BtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001233 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001234 if( pCur->pPrev ){
1235 pCur->pPrev->pNext = pCur->pNext;
1236 }else{
1237 pBt->pCursor = pCur->pNext;
1238 }
1239 if( pCur->pNext ){
1240 pCur->pNext->pPrev = pCur->pPrev;
1241 }
drh3aac2dd2004-04-26 14:10:20 +00001242 releasePage(pCur->pPage);
drhf74b8d92002-09-01 23:20:45 +00001243 if( pCur->pShared!=pCur ){
1244 BtCursor *pRing = pCur->pShared;
1245 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1246 pRing->pShared = pCur->pShared;
1247 }
drh5e00f6c2001-09-13 13:46:56 +00001248 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001249 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001250 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001251}
1252
drh7e3b0a02001-04-28 16:52:40 +00001253/*
drh5e2f8b92001-05-28 00:41:15 +00001254** Make a temporary cursor by filling in the fields of pTempCur.
1255** The temporary cursor is not on the cursor list for the Btree.
1256*/
drh14acc042001-06-10 19:56:58 +00001257static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001258 memcpy(pTempCur, pCur, sizeof(*pCur));
1259 pTempCur->pNext = 0;
1260 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001261 if( pTempCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001262 sqlite3pager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001263 }
drh5e2f8b92001-05-28 00:41:15 +00001264}
1265
1266/*
drhbd03cae2001-06-02 02:40:57 +00001267** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001268** function above.
1269*/
drh14acc042001-06-10 19:56:58 +00001270static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001271 if( pCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001272 sqlite3pager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001273 }
drh5e2f8b92001-05-28 00:41:15 +00001274}
1275
1276/*
drh3aac2dd2004-04-26 14:10:20 +00001277** Set *pSize to the size of the buffer needed to hold the value of
1278** the key for the current entry. If the cursor is not pointing
1279** to a valid entry, *pSize is set to 0.
1280**
drh4b70f112004-05-02 21:12:19 +00001281** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00001282** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00001283*/
drh3aac2dd2004-04-26 14:10:20 +00001284int sqlite3BtreeKeySize(BtCursor *pCur, u64 *pSize){
drh2af926b2001-05-15 00:39:25 +00001285 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001286 unsigned char *cell;
drh2af926b2001-05-15 00:39:25 +00001287
drhc39e0002004-05-07 23:50:57 +00001288 if( !pCur->isValid ){
drh72f82862001-05-24 21:06:34 +00001289 *pSize = 0;
1290 }else{
drhc39e0002004-05-07 23:50:57 +00001291 pPage = pCur->pPage;
1292 assert( pPage!=0 );
1293 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1294 cell = pPage->aCell[pCur->idx];
drh3aac2dd2004-04-26 14:10:20 +00001295 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001296 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001297 cell += 4; /* Skip the child pointer */
1298 }
1299 if( !pPage->zeroData ){
drha34b6762004-05-07 13:30:42 +00001300 while( (0x80&*(cell++))!=0 ){} /* Skip the data size number */
drh3aac2dd2004-04-26 14:10:20 +00001301 }
drha34b6762004-05-07 13:30:42 +00001302 getVarint(cell, pSize);
drh72f82862001-05-24 21:06:34 +00001303 }
1304 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001305}
drh2af926b2001-05-15 00:39:25 +00001306
drh72f82862001-05-24 21:06:34 +00001307/*
1308** Read payload information from the entry that the pCur cursor is
1309** pointing to. Begin reading the payload at "offset" and read
1310** a total of "amt" bytes. Put the result in zBuf.
1311**
1312** This routine does not make a distinction between key and data.
1313** It just reads bytes from the payload area.
1314*/
drh3aac2dd2004-04-26 14:10:20 +00001315static int getPayload(
1316 BtCursor *pCur, /* Cursor pointing to entry to read from */
1317 int offset, /* Begin reading this far into payload */
1318 int amt, /* Read this many bytes */
1319 unsigned char *pBuf, /* Write the bytes into this buffer */
1320 int skipKey /* offset begins at data if this is true */
1321){
1322 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001323 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001324 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001325 MemPage *pPage;
1326 Btree *pBt;
1327 u64 nData, nKey;
1328 int maxLocal, ovflSize;
1329
drh72f82862001-05-24 21:06:34 +00001330 assert( pCur!=0 && pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001331 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001332 pBt = pCur->pBt;
1333 pPage = pCur->pPage;
1334 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1335 aPayload = pPage->aCell[pCur->idx];
1336 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001337 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001338 aPayload += 4; /* Skip the child pointer */
1339 }
1340 if( pPage->zeroData ){
1341 nData = 0;
1342 }else{
1343 aPayload += getVarint(aPayload, &nData);
1344 }
drha34b6762004-05-07 13:30:42 +00001345 aPayload += getVarint(aPayload, &nKey);
drh3aac2dd2004-04-26 14:10:20 +00001346 if( pPage->intKey ){
1347 nKey = 0;
1348 }
1349 assert( offset>=0 );
1350 if( skipKey ){
1351 offset += nKey;
1352 }
1353 if( offset+amt > nKey+nData ){
drha34b6762004-05-07 13:30:42 +00001354 return SQLITE_ERROR;
drh3aac2dd2004-04-26 14:10:20 +00001355 }
drha34b6762004-05-07 13:30:42 +00001356 maxLocal = pBt->maxLocal;
drh3aac2dd2004-04-26 14:10:20 +00001357 if( offset<maxLocal ){
drh2af926b2001-05-15 00:39:25 +00001358 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001359 if( a+offset>maxLocal ){
1360 a = maxLocal - offset;
drh2af926b2001-05-15 00:39:25 +00001361 }
drha34b6762004-05-07 13:30:42 +00001362 memcpy(pBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001363 if( a==amt ){
1364 return SQLITE_OK;
1365 }
drh2aa679f2001-06-25 02:11:07 +00001366 offset = 0;
drha34b6762004-05-07 13:30:42 +00001367 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00001368 amt -= a;
drhdd793422001-06-28 01:54:48 +00001369 }else{
drh3aac2dd2004-04-26 14:10:20 +00001370 offset -= maxLocal;
drhbd03cae2001-06-02 02:40:57 +00001371 }
1372 if( amt>0 ){
drha34b6762004-05-07 13:30:42 +00001373 nextPage = get4byte(&aPayload[maxLocal]);
drh2af926b2001-05-15 00:39:25 +00001374 }
drh3aac2dd2004-04-26 14:10:20 +00001375 ovflSize = pBt->pageSize - 4;
drh2af926b2001-05-15 00:39:25 +00001376 while( amt>0 && nextPage ){
drha34b6762004-05-07 13:30:42 +00001377 rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
drh2af926b2001-05-15 00:39:25 +00001378 if( rc!=0 ){
1379 return rc;
1380 }
drha34b6762004-05-07 13:30:42 +00001381 nextPage = get4byte(aPayload);
drh3aac2dd2004-04-26 14:10:20 +00001382 if( offset<ovflSize ){
drh2af926b2001-05-15 00:39:25 +00001383 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001384 if( a + offset > ovflSize ){
1385 a = ovflSize - offset;
drh2af926b2001-05-15 00:39:25 +00001386 }
drh9b171272004-05-08 02:03:22 +00001387 memcpy(pBuf, &aPayload[offset+4], a);
drh2aa679f2001-06-25 02:11:07 +00001388 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001389 amt -= a;
drha34b6762004-05-07 13:30:42 +00001390 pBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001391 }else{
drh3aac2dd2004-04-26 14:10:20 +00001392 offset -= ovflSize;
drh2af926b2001-05-15 00:39:25 +00001393 }
drha34b6762004-05-07 13:30:42 +00001394 sqlite3pager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00001395 }
drha7fcb052001-12-14 15:09:55 +00001396 if( amt>0 ){
1397 return SQLITE_CORRUPT;
1398 }
1399 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001400}
1401
drh72f82862001-05-24 21:06:34 +00001402/*
drh3aac2dd2004-04-26 14:10:20 +00001403** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001404** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001405** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00001406**
drh3aac2dd2004-04-26 14:10:20 +00001407** Return SQLITE_OK on success or an error code if anything goes
1408** wrong. An error is returned if "offset+amt" is larger than
1409** the available payload.
drh72f82862001-05-24 21:06:34 +00001410*/
drha34b6762004-05-07 13:30:42 +00001411int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh8c1238a2003-01-02 14:43:55 +00001412 assert( amt>=0 );
1413 assert( offset>=0 );
drhc39e0002004-05-07 23:50:57 +00001414 if( pCur->isValid==0 ){
1415 return pCur->status;
drh3aac2dd2004-04-26 14:10:20 +00001416 }
drhc39e0002004-05-07 23:50:57 +00001417 assert( pCur->pPage!=0 );
1418 assert( pCur->pPage->intKey==0 );
1419 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001420 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1421}
1422
1423/*
1424** Return a pointer to the key of record that cursor pCur
1425** is point to if the entire key is in contiguous memory.
1426** If the key is split up among multiple tables, return 0.
1427** If pCur is not pointing to a valid entry return 0.
1428**
1429** The pointer returned is ephemeral. The key may move
1430** or be destroyed on the next call to any Btree routine.
1431**
1432** This routine is used to do quick key comparisons in the
1433** common case where the entire key fits in the payload area
drh4b70f112004-05-02 21:12:19 +00001434** of a cell and does not overflow onto secondary pages. If
1435** this routine returns 0 for a valid cursor, the caller will
1436** need to allocate a buffer big enough to hold the whole key
1437** then use sqlite3BtreeKey() to copy the key value into the
1438** buffer. That is substantially slower. But fortunately,
1439** most keys are small enough to fit in the payload area so
1440** the slower method is rarely needed.
drh3aac2dd2004-04-26 14:10:20 +00001441*/
1442void *sqlite3BtreeKeyFetch(BtCursor *pCur){
1443 unsigned char *aPayload;
1444 MemPage *pPage;
1445 Btree *pBt;
1446 u64 nData, nKey;
1447
drhc39e0002004-05-07 23:50:57 +00001448 assert( pCur!=0 );
1449 if( !pCur->isValid ){
1450 return 0;
1451 }
1452 assert( pCur->pPage!=0 );
1453 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001454 pBt = pCur->pBt;
1455 pPage = pCur->pPage;
1456 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drhc39e0002004-05-07 23:50:57 +00001457 assert( pPage->intKey==0 );
drh3aac2dd2004-04-26 14:10:20 +00001458 aPayload = pPage->aCell[pCur->idx];
1459 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001460 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001461 aPayload += 4; /* Skip the child pointer */
1462 }
1463 if( !pPage->zeroData ){
1464 aPayload += getVarint(aPayload, &nData);
1465 }
drha34b6762004-05-07 13:30:42 +00001466 aPayload += getVarint(aPayload, &nKey);
drhc39e0002004-05-07 23:50:57 +00001467 if( nKey>pBt->maxLocal ){
drh5e00f6c2001-09-13 13:46:56 +00001468 return 0;
drh72f82862001-05-24 21:06:34 +00001469 }
drh3aac2dd2004-04-26 14:10:20 +00001470 return aPayload;
drh72f82862001-05-24 21:06:34 +00001471}
1472
drh3aac2dd2004-04-26 14:10:20 +00001473
drh72f82862001-05-24 21:06:34 +00001474/*
drhbd03cae2001-06-02 02:40:57 +00001475** Set *pSize to the number of bytes of data in the entry the
1476** cursor currently points to. Always return SQLITE_OK.
1477** Failure is not possible. If the cursor is not currently
1478** pointing to an entry (which can happen, for example, if
1479** the database is empty) then *pSize is set to 0.
drh72f82862001-05-24 21:06:34 +00001480*/
drh3aac2dd2004-04-26 14:10:20 +00001481int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drh72f82862001-05-24 21:06:34 +00001482 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001483 unsigned char *cell;
1484 u64 size;
drh72f82862001-05-24 21:06:34 +00001485
drhc39e0002004-05-07 23:50:57 +00001486 if( !pCur->isValid ){
1487 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1488 }
drh72f82862001-05-24 21:06:34 +00001489 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001490 assert( pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001491 assert( pPage->isInit );
1492 if( pPage->zeroData ){
drh72f82862001-05-24 21:06:34 +00001493 *pSize = 0;
1494 }else{
drhc39e0002004-05-07 23:50:57 +00001495 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001496 cell = pPage->aCell[pCur->idx];
1497 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001498 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001499 cell += 4; /* Skip the child pointer */
1500 }
drha34b6762004-05-07 13:30:42 +00001501 getVarint(cell, &size);
drh3aac2dd2004-04-26 14:10:20 +00001502 assert( (size & 0x00000000ffffffff)==size );
drha34b6762004-05-07 13:30:42 +00001503 *pSize = (u32)size;
drh72f82862001-05-24 21:06:34 +00001504 }
1505 return SQLITE_OK;
1506}
1507
1508/*
drh3aac2dd2004-04-26 14:10:20 +00001509** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001510** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001511** begins at "offset".
1512**
1513** Return SQLITE_OK on success or an error code if anything goes
1514** wrong. An error is returned if "offset+amt" is larger than
1515** the available payload.
drh72f82862001-05-24 21:06:34 +00001516*/
drh3aac2dd2004-04-26 14:10:20 +00001517int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhc39e0002004-05-07 23:50:57 +00001518 if( !pCur->isValid ){
1519 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1520 }
drh8c1238a2003-01-02 14:43:55 +00001521 assert( amt>=0 );
1522 assert( offset>=0 );
1523 assert( pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001524 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001525 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00001526}
1527
drh72f82862001-05-24 21:06:34 +00001528/*
drh8178a752003-01-05 21:41:40 +00001529** Move the cursor down to a new child page. The newPgno argument is the
1530** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001531*/
drh3aac2dd2004-04-26 14:10:20 +00001532static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00001533 int rc;
1534 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00001535 MemPage *pOldPage;
drh0d316a42002-08-11 20:10:47 +00001536 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001537
drhc39e0002004-05-07 23:50:57 +00001538 assert( pCur->isValid );
drhde647132004-05-07 17:57:49 +00001539 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001540 if( rc ) return rc;
drh428ae8c2003-01-04 16:48:09 +00001541 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00001542 pOldPage = pCur->pPage;
1543 pOldPage->idxShift = 0;
1544 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00001545 pCur->pPage = pNewPage;
1546 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001547 if( pNewPage->nCell<1 ){
1548 return SQLITE_CORRUPT;
1549 }
drh72f82862001-05-24 21:06:34 +00001550 return SQLITE_OK;
1551}
1552
1553/*
drh8856d6a2004-04-29 14:42:46 +00001554** Return true if the page is the virtual root of its table.
1555**
1556** The virtual root page is the root page for most tables. But
1557** for the table rooted on page 1, sometime the real root page
1558** is empty except for the right-pointer. In such cases the
1559** virtual root page is the page that the right-pointer of page
1560** 1 is pointing to.
1561*/
1562static int isRootPage(MemPage *pPage){
1563 MemPage *pParent = pPage->pParent;
drhc8629a12004-05-08 20:07:40 +00001564 if( pParent==0 ) return 1;
1565 if( pParent->pgno>1 ) return 0;
1566 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
drh8856d6a2004-04-29 14:42:46 +00001567 return 0;
1568}
1569
1570/*
drh5e2f8b92001-05-28 00:41:15 +00001571** Move the cursor up to the parent page.
1572**
1573** pCur->idx is set to the cell index that contains the pointer
1574** to the page we are coming from. If we are coming from the
1575** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001576** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001577*/
drh8178a752003-01-05 21:41:40 +00001578static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001579 Pgno oldPgno;
1580 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001581 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001582 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001583
drhc39e0002004-05-07 23:50:57 +00001584 assert( pCur->isValid );
drh8178a752003-01-05 21:41:40 +00001585 pPage = pCur->pPage;
1586 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00001587 assert( !isRootPage(pPage) );
drh8178a752003-01-05 21:41:40 +00001588 pParent = pPage->pParent;
1589 assert( pParent!=0 );
1590 idxParent = pPage->idxParent;
drha34b6762004-05-07 13:30:42 +00001591 sqlite3pager_ref(pParent->aData);
drh3aac2dd2004-04-26 14:10:20 +00001592 oldPgno = pPage->pgno;
1593 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00001594 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001595 assert( pParent->idxShift==0 );
1596 if( pParent->idxShift==0 ){
1597 pCur->idx = idxParent;
1598#ifndef NDEBUG
1599 /* Verify that pCur->idx is the correct index to point back to the child
1600 ** page we just came from
1601 */
drh428ae8c2003-01-04 16:48:09 +00001602 if( pCur->idx<pParent->nCell ){
drha34b6762004-05-07 13:30:42 +00001603 assert( get4byte(&pParent->aCell[idxParent][2])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001604 }else{
drha34b6762004-05-07 13:30:42 +00001605 assert( get4byte(&pParent->aData[pParent->hdrOffset+6])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001606 }
1607#endif
1608 }else{
1609 /* The MemPage.idxShift flag indicates that cell indices might have
1610 ** changed since idxParent was set and hence idxParent might be out
1611 ** of date. So recompute the parent cell index by scanning all cells
1612 ** and locating the one that points to the child we just came from.
1613 */
1614 int i;
1615 pCur->idx = pParent->nCell;
drh428ae8c2003-01-04 16:48:09 +00001616 for(i=0; i<pParent->nCell; i++){
drh3aac2dd2004-04-26 14:10:20 +00001617 if( get4byte(&pParent->aCell[i][2])==oldPgno ){
drh428ae8c2003-01-04 16:48:09 +00001618 pCur->idx = i;
1619 break;
1620 }
drh72f82862001-05-24 21:06:34 +00001621 }
1622 }
1623}
1624
1625/*
1626** Move the cursor to the root page
1627*/
drh5e2f8b92001-05-28 00:41:15 +00001628static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00001629 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00001630 int rc;
drh0d316a42002-08-11 20:10:47 +00001631 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001632
drhde647132004-05-07 17:57:49 +00001633 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
drhc39e0002004-05-07 23:50:57 +00001634 if( rc ){
1635 pCur->isValid = 0;
1636 return rc;
1637 }
drh3aac2dd2004-04-26 14:10:20 +00001638 releasePage(pCur->pPage);
1639 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00001640 pCur->idx = 0;
drh8856d6a2004-04-29 14:42:46 +00001641 if( pRoot->nCell==0 && !pRoot->leaf ){
1642 Pgno subpage;
1643 assert( pRoot->pgno==1 );
1644 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+6]);
1645 assert( subpage>0 );
drh4b70f112004-05-02 21:12:19 +00001646 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00001647 }
drhc39e0002004-05-07 23:50:57 +00001648 pCur->isValid = pCur->pPage->nCell>0;
drh8856d6a2004-04-29 14:42:46 +00001649 return rc;
drh72f82862001-05-24 21:06:34 +00001650}
drh2af926b2001-05-15 00:39:25 +00001651
drh5e2f8b92001-05-28 00:41:15 +00001652/*
1653** Move the cursor down to the left-most leaf entry beneath the
1654** entry to which it is currently pointing.
1655*/
1656static int moveToLeftmost(BtCursor *pCur){
1657 Pgno pgno;
1658 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001659 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00001660
drhc39e0002004-05-07 23:50:57 +00001661 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001662 while( !(pPage = pCur->pPage)->leaf ){
drha34b6762004-05-07 13:30:42 +00001663 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1664 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001665 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001666 if( rc ) return rc;
1667 }
1668 return SQLITE_OK;
1669}
1670
drh2dcc9aa2002-12-04 13:40:25 +00001671/*
1672** Move the cursor down to the right-most leaf entry beneath the
1673** page to which it is currently pointing. Notice the difference
1674** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1675** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1676** finds the right-most entry beneath the *page*.
1677*/
1678static int moveToRightmost(BtCursor *pCur){
1679 Pgno pgno;
1680 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001681 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001682
drhc39e0002004-05-07 23:50:57 +00001683 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001684 while( !(pPage = pCur->pPage)->leaf ){
1685 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
1686 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001687 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001688 if( rc ) return rc;
1689 }
drh3aac2dd2004-04-26 14:10:20 +00001690 pCur->idx = pPage->nCell - 1;
drh2dcc9aa2002-12-04 13:40:25 +00001691 return SQLITE_OK;
1692}
1693
drh5e00f6c2001-09-13 13:46:56 +00001694/* Move the cursor to the first entry in the table. Return SQLITE_OK
1695** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001696** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001697*/
drh3aac2dd2004-04-26 14:10:20 +00001698int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001699 int rc;
drhc39e0002004-05-07 23:50:57 +00001700 if( pCur->status ){
1701 return pCur->status;
1702 }
drh5e00f6c2001-09-13 13:46:56 +00001703 rc = moveToRoot(pCur);
1704 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001705 if( pCur->isValid==0 ){
1706 assert( pCur->pPage->nCell==0 );
drh5e00f6c2001-09-13 13:46:56 +00001707 *pRes = 1;
1708 return SQLITE_OK;
1709 }
drhc39e0002004-05-07 23:50:57 +00001710 assert( pCur->pPage->nCell>0 );
drh5e00f6c2001-09-13 13:46:56 +00001711 *pRes = 0;
1712 rc = moveToLeftmost(pCur);
1713 return rc;
1714}
drh5e2f8b92001-05-28 00:41:15 +00001715
drh9562b552002-02-19 15:00:07 +00001716/* Move the cursor to the last entry in the table. Return SQLITE_OK
1717** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001718** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001719*/
drh3aac2dd2004-04-26 14:10:20 +00001720int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001721 int rc;
drhc39e0002004-05-07 23:50:57 +00001722 if( pCur->status ){
1723 return pCur->status;
1724 }
drh9562b552002-02-19 15:00:07 +00001725 rc = moveToRoot(pCur);
1726 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001727 if( pCur->isValid==0 ){
1728 assert( pCur->pPage->nCell==0 );
drh9562b552002-02-19 15:00:07 +00001729 *pRes = 1;
1730 return SQLITE_OK;
1731 }
drhc39e0002004-05-07 23:50:57 +00001732 assert( pCur->isValid );
drh9562b552002-02-19 15:00:07 +00001733 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001734 rc = moveToRightmost(pCur);
drh9562b552002-02-19 15:00:07 +00001735 return rc;
1736}
1737
drh3aac2dd2004-04-26 14:10:20 +00001738/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00001739** Return a success code.
1740**
drh3aac2dd2004-04-26 14:10:20 +00001741** For INTKEY tables, only the nKey parameter is used. pKey is
1742** ignored. For other tables, nKey is the number of bytes of data
1743** in nKey. The comparison function specified when the cursor was
1744** created is used to compare keys.
1745**
drh5e2f8b92001-05-28 00:41:15 +00001746** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001747** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001748** were present. The cursor might point to an entry that comes
1749** before or after the key.
1750**
drhbd03cae2001-06-02 02:40:57 +00001751** The result of comparing the key with the entry to which the
1752** cursor is left pointing is stored in pCur->iMatch. The same
1753** value is also written to *pRes if pRes!=NULL. The meaning of
1754** this value is as follows:
1755**
1756** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001757** is smaller than pKey or if the table is empty
1758** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001759**
1760** *pRes==0 The cursor is left pointing at an entry that
1761** exactly matches pKey.
1762**
1763** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001764** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001765*/
drh3aac2dd2004-04-26 14:10:20 +00001766int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, u64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001767 int rc;
drhc39e0002004-05-07 23:50:57 +00001768
1769 if( pCur->status ){
1770 return pCur->status;
1771 }
drh5e2f8b92001-05-28 00:41:15 +00001772 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001773 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001774 assert( pCur->pPage );
1775 assert( pCur->pPage->isInit );
1776 if( pCur->isValid==0 ){
1777 assert( pCur->pPage->nCell==0 );
1778 return SQLITE_OK;
1779 }
drh72f82862001-05-24 21:06:34 +00001780 for(;;){
1781 int lwr, upr;
1782 Pgno chldPg;
1783 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001784 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001785 lwr = 0;
1786 upr = pPage->nCell-1;
1787 while( lwr<=upr ){
drh3aac2dd2004-04-26 14:10:20 +00001788 void *pCellKey;
1789 u64 nCellKey;
drh72f82862001-05-24 21:06:34 +00001790 pCur->idx = (lwr+upr)/2;
drhde647132004-05-07 17:57:49 +00001791 sqlite3BtreeKeySize(pCur, &nCellKey);
drh3aac2dd2004-04-26 14:10:20 +00001792 if( pPage->intKey ){
1793 if( nCellKey<nKey ){
1794 c = -1;
1795 }else if( nCellKey>nKey ){
1796 c = +1;
1797 }else{
1798 c = 0;
1799 }
1800 }else if( (pCellKey = sqlite3BtreeKeyFetch(pCur))!=0 ){
1801 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1802 }else{
1803 pCellKey = sqliteMalloc( nCellKey );
1804 if( pCellKey==0 ) return SQLITE_NOMEM;
1805 rc = sqlite3BtreeKey(pCur, 0, nCellKey, pCellKey);
1806 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1807 sqliteFree(pCellKey);
1808 if( rc ) return rc;
1809 }
drh72f82862001-05-24 21:06:34 +00001810 if( c==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001811 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001812 if( pRes ) *pRes = 0;
1813 return SQLITE_OK;
1814 }
1815 if( c<0 ){
1816 lwr = pCur->idx+1;
1817 }else{
1818 upr = pCur->idx-1;
1819 }
1820 }
1821 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00001822 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00001823 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00001824 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00001825 }else if( lwr>=pPage->nCell ){
1826 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh72f82862001-05-24 21:06:34 +00001827 }else{
drh3aac2dd2004-04-26 14:10:20 +00001828 chldPg = get4byte(&pPage->aCell[lwr][2]);
drh72f82862001-05-24 21:06:34 +00001829 }
1830 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001831 pCur->iMatch = c;
drhc39e0002004-05-07 23:50:57 +00001832 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001833 if( pRes ) *pRes = c;
1834 return SQLITE_OK;
1835 }
drh428ae8c2003-01-04 16:48:09 +00001836 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00001837 rc = moveToChild(pCur, chldPg);
drhc39e0002004-05-07 23:50:57 +00001838 if( rc ){
1839 return rc;
1840 }
drh72f82862001-05-24 21:06:34 +00001841 }
drhbd03cae2001-06-02 02:40:57 +00001842 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00001843}
1844
1845/*
drhc39e0002004-05-07 23:50:57 +00001846** Return TRUE if the cursor is not pointing at an entry of the table.
1847**
1848** TRUE will be returned after a call to sqlite3BtreeNext() moves
1849** past the last entry in the table or sqlite3BtreePrev() moves past
1850** the first entry. TRUE is also returned if the table is empty.
1851*/
1852int sqlite3BtreeEof(BtCursor *pCur){
1853 return pCur->isValid==0;
1854}
1855
1856/*
drhbd03cae2001-06-02 02:40:57 +00001857** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00001858** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00001859** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00001860** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00001861*/
drh3aac2dd2004-04-26 14:10:20 +00001862int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00001863 int rc;
drh8178a752003-01-05 21:41:40 +00001864 MemPage *pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001865 assert( pRes!=0 );
drhc39e0002004-05-07 23:50:57 +00001866 if( pCur->isValid==0 ){
drh8c1238a2003-01-02 14:43:55 +00001867 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00001868 return SQLITE_OK;
drhecdc7532001-09-23 02:35:53 +00001869 }
drh8178a752003-01-05 21:41:40 +00001870 assert( pPage->isInit );
drh8178a752003-01-05 21:41:40 +00001871 assert( pCur->idx<pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001872 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00001873 if( pCur->idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00001874 if( !pPage->leaf ){
1875 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+6]));
drh5e2f8b92001-05-28 00:41:15 +00001876 if( rc ) return rc;
1877 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001878 *pRes = 0;
1879 return rc;
drh72f82862001-05-24 21:06:34 +00001880 }
drh5e2f8b92001-05-28 00:41:15 +00001881 do{
drh8856d6a2004-04-29 14:42:46 +00001882 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00001883 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00001884 pCur->isValid = 0;
drh5e2f8b92001-05-28 00:41:15 +00001885 return SQLITE_OK;
1886 }
drh8178a752003-01-05 21:41:40 +00001887 moveToParent(pCur);
1888 pPage = pCur->pPage;
1889 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00001890 *pRes = 0;
drh8178a752003-01-05 21:41:40 +00001891 return SQLITE_OK;
1892 }
1893 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00001894 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00001895 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00001896 }
drh5e2f8b92001-05-28 00:41:15 +00001897 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001898 return rc;
drh72f82862001-05-24 21:06:34 +00001899}
1900
drh3b7511c2001-05-26 13:15:44 +00001901/*
drh2dcc9aa2002-12-04 13:40:25 +00001902** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00001903** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00001904** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00001905** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00001906*/
drh3aac2dd2004-04-26 14:10:20 +00001907int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00001908 int rc;
1909 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00001910 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001911 if( pCur->isValid==0 ){
1912 *pRes = 1;
1913 return SQLITE_OK;
1914 }
drh8178a752003-01-05 21:41:40 +00001915 pPage = pCur->pPage;
drh8178a752003-01-05 21:41:40 +00001916 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00001917 assert( pCur->idx>=0 );
drha34b6762004-05-07 13:30:42 +00001918 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001919 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001920 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001921 if( rc ) return rc;
1922 rc = moveToRightmost(pCur);
1923 }else{
1924 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00001925 if( isRootPage(pPage) ){
drhc39e0002004-05-07 23:50:57 +00001926 pCur->isValid = 0;
1927 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001928 return SQLITE_OK;
1929 }
drh8178a752003-01-05 21:41:40 +00001930 moveToParent(pCur);
1931 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001932 }
1933 pCur->idx--;
1934 rc = SQLITE_OK;
1935 }
drh8178a752003-01-05 21:41:40 +00001936 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001937 return rc;
1938}
1939
1940/*
drh3b7511c2001-05-26 13:15:44 +00001941** Allocate a new page from the database file.
1942**
drha34b6762004-05-07 13:30:42 +00001943** The new page is marked as dirty. (In other words, sqlite3pager_write()
drh3b7511c2001-05-26 13:15:44 +00001944** has already been called on the new page.) The new page has also
1945** been referenced and the calling routine is responsible for calling
drha34b6762004-05-07 13:30:42 +00001946** sqlite3pager_unref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00001947**
1948** SQLITE_OK is returned on success. Any other return value indicates
1949** an error. *ppPage and *pPgno are undefined in the event of an error.
drha34b6762004-05-07 13:30:42 +00001950** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00001951**
drh199e3cf2002-07-18 11:01:47 +00001952** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1953** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00001954** attempt to keep related pages close to each other in the database file,
1955** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00001956*/
drh199e3cf2002-07-18 11:01:47 +00001957static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drh3aac2dd2004-04-26 14:10:20 +00001958 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00001959 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001960 int n; /* Number of pages on the freelist */
1961 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00001962
drh3aac2dd2004-04-26 14:10:20 +00001963 pPage1 = pBt->pPage1;
1964 n = get4byte(&pPage1->aData[36]);
1965 if( n>0 ){
drh91025292004-05-03 19:49:32 +00001966 /* There are pages on the freelist. Reuse one of those pages. */
drh3aac2dd2004-04-26 14:10:20 +00001967 MemPage *pTrunk;
drha34b6762004-05-07 13:30:42 +00001968 rc = sqlite3pager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00001969 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001970 put4byte(&pPage1->aData[36], n-1);
1971 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00001972 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00001973 rc = sqlite3pager_write(pTrunk->aData);
drh3b7511c2001-05-26 13:15:44 +00001974 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00001975 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00001976 return rc;
1977 }
drh3aac2dd2004-04-26 14:10:20 +00001978 k = get4byte(&pTrunk->aData[4]);
1979 if( k==0 ){
1980 /* The trunk has no leaves. So extract the trunk page itself and
1981 ** use it as the newly allocated page */
drha34b6762004-05-07 13:30:42 +00001982 *pPgno = get4byte(&pPage1->aData[32]);
drh3aac2dd2004-04-26 14:10:20 +00001983 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
1984 *ppPage = pTrunk;
drh30e58752002-03-02 20:41:57 +00001985 }else{
drh3aac2dd2004-04-26 14:10:20 +00001986 /* Extract a leaf from the trunk */
1987 int closest;
1988 unsigned char *aData = pTrunk->aData;
1989 if( nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00001990 int i, dist;
1991 closest = 0;
drh3aac2dd2004-04-26 14:10:20 +00001992 dist = get4byte(&aData[8]) - nearby;
drhbea00b92002-07-08 10:59:50 +00001993 if( dist<0 ) dist = -dist;
drh0d316a42002-08-11 20:10:47 +00001994 for(i=1; i<n; i++){
drh3aac2dd2004-04-26 14:10:20 +00001995 int d2 = get4byte(&aData[8+i*4]) - nearby;
drhbea00b92002-07-08 10:59:50 +00001996 if( d2<0 ) d2 = -d2;
1997 if( d2<dist ) closest = i;
1998 }
1999 }else{
2000 closest = 0;
2001 }
drh3aac2dd2004-04-26 14:10:20 +00002002 put4byte(&aData[4], n-1);
drha34b6762004-05-07 13:30:42 +00002003 *pPgno = get4byte(&aData[8+closest*4]);
drh9b171272004-05-08 02:03:22 +00002004 if( closest<k-1 ){
2005 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
2006 }
2007 put4byte(&pTrunk->aData[4], k-1);
drh3aac2dd2004-04-26 14:10:20 +00002008 rc = getPage(pBt, *pPgno, ppPage);
2009 releasePage(pTrunk);
drh30e58752002-03-02 20:41:57 +00002010 if( rc==SQLITE_OK ){
drh9b171272004-05-08 02:03:22 +00002011 sqlite3pager_dont_rollback((*ppPage)->aData);
drha34b6762004-05-07 13:30:42 +00002012 rc = sqlite3pager_write((*ppPage)->aData);
drh30e58752002-03-02 20:41:57 +00002013 }
2014 }
drh3b7511c2001-05-26 13:15:44 +00002015 }else{
drh3aac2dd2004-04-26 14:10:20 +00002016 /* There are no pages on the freelist, so create a new page at the
2017 ** end of the file */
drha34b6762004-05-07 13:30:42 +00002018 *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
drh3aac2dd2004-04-26 14:10:20 +00002019 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00002020 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002021 rc = sqlite3pager_write((*ppPage)->aData);
drh3b7511c2001-05-26 13:15:44 +00002022 }
2023 return rc;
2024}
2025
2026/*
drh3aac2dd2004-04-26 14:10:20 +00002027** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00002028**
drha34b6762004-05-07 13:30:42 +00002029** sqlite3pager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00002030*/
drh3aac2dd2004-04-26 14:10:20 +00002031static int freePage(MemPage *pPage){
2032 Btree *pBt = pPage->pBt;
2033 MemPage *pPage1 = pBt->pPage1;
2034 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00002035
drh3aac2dd2004-04-26 14:10:20 +00002036 /* Prepare the page for freeing */
2037 assert( pPage->pgno>1 );
2038 pPage->isInit = 0;
2039 releasePage(pPage->pParent);
2040 pPage->pParent = 0;
2041
drha34b6762004-05-07 13:30:42 +00002042 /* Increment the free page count on pPage1 */
2043 rc = sqlite3pager_write(pPage1->aData);
drh3aac2dd2004-04-26 14:10:20 +00002044 if( rc ) return rc;
2045 n = get4byte(&pPage1->aData[36]);
2046 put4byte(&pPage1->aData[36], n+1);
2047
2048 if( n==0 ){
2049 /* This is the first free page */
2050 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00002051 put4byte(&pPage1->aData[32], pPage->pgno);
drh3aac2dd2004-04-26 14:10:20 +00002052 }else{
2053 /* Other free pages already exist. Retrive the first trunk page
2054 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00002055 MemPage *pTrunk;
2056 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002057 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002058 k = get4byte(&pTrunk->aData[4]);
2059 if( k==pBt->pageSize/4 - 8 ){
2060 /* The trunk is full. Turn the page being freed into a new
2061 ** trunk page with no leaves. */
drha34b6762004-05-07 13:30:42 +00002062 rc = sqlite3pager_write(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00002063 if( rc ) return rc;
2064 put4byte(pPage->aData, pTrunk->pgno);
2065 put4byte(&pPage->aData[4], 0);
2066 put4byte(&pPage1->aData[32], pPage->pgno);
2067 }else{
2068 /* Add the newly freed page as a leaf on the current trunk */
drha34b6762004-05-07 13:30:42 +00002069 rc = sqlite3pager_write(pTrunk->aData);
drh3aac2dd2004-04-26 14:10:20 +00002070 if( rc ) return rc;
2071 put4byte(&pTrunk->aData[4], k+1);
2072 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drha34b6762004-05-07 13:30:42 +00002073 sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
drh3aac2dd2004-04-26 14:10:20 +00002074 }
2075 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002076 }
drh3b7511c2001-05-26 13:15:44 +00002077 return rc;
2078}
2079
2080/*
drh3aac2dd2004-04-26 14:10:20 +00002081** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00002082*/
drh3aac2dd2004-04-26 14:10:20 +00002083static int clearCell(MemPage *pPage, unsigned char *pCell){
2084 Btree *pBt = pPage->pBt;
drha34b6762004-05-07 13:30:42 +00002085 int rc, n, nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002086 u64 nData, nKey;
2087 Pgno ovflPgno;
drh3b7511c2001-05-26 13:15:44 +00002088
drh3aac2dd2004-04-26 14:10:20 +00002089 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
drha34b6762004-05-07 13:30:42 +00002090 assert( (nData&0x000000007fffffff)==nData );
2091 nPayload = (int)nData;
drh3aac2dd2004-04-26 14:10:20 +00002092 if( !pPage->intKey ){
2093 nPayload += nKey;
drh5e2f8b92001-05-28 00:41:15 +00002094 }
drh3aac2dd2004-04-26 14:10:20 +00002095 if( nPayload<=pBt->maxLocal ){
drha34b6762004-05-07 13:30:42 +00002096 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00002097 }
2098 ovflPgno = get4byte(&pCell[n+pBt->maxLocal]);
2099 while( ovflPgno!=0 ){
2100 MemPage *pOvfl;
2101 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00002102 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002103 ovflPgno = get4byte(pOvfl->aData);
drha34b6762004-05-07 13:30:42 +00002104 rc = freePage(pOvfl);
drhbd03cae2001-06-02 02:40:57 +00002105 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002106 sqlite3pager_unref(pOvfl->aData);
drh3b7511c2001-05-26 13:15:44 +00002107 }
drh5e2f8b92001-05-28 00:41:15 +00002108 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00002109}
2110
2111/*
drh91025292004-05-03 19:49:32 +00002112** Create the byte sequence used to represent a cell on page pPage
2113** and write that byte sequence into pCell[]. Overflow pages are
2114** allocated and filled in as necessary. The calling procedure
2115** is responsible for making sure sufficient space has been allocated
2116** for pCell[].
2117**
2118** Note that pCell does not necessary need to point to the pPage->aData
2119** area. pCell might point to some temporary storage. The cell will
2120** be constructed in this temporary area then copied into pPage->aData
2121** later.
drh3b7511c2001-05-26 13:15:44 +00002122*/
2123static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00002124 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00002125 unsigned char *pCell, /* Complete text of the cell */
drh3aac2dd2004-04-26 14:10:20 +00002126 const void *pKey, u64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00002127 const void *pData,int nData, /* The data */
2128 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00002129){
drh3b7511c2001-05-26 13:15:44 +00002130 int nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002131 const void *pSrc;
drha34b6762004-05-07 13:30:42 +00002132 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00002133 int spaceLeft;
2134 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00002135 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00002136 unsigned char *pPrior;
2137 unsigned char *pPayload;
2138 Btree *pBt = pPage->pBt;
2139 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00002140 int nHeader;
drh3b7511c2001-05-26 13:15:44 +00002141
drh91025292004-05-03 19:49:32 +00002142 /* Fill in the header. */
2143 nHeader = 2;
2144 if( !pPage->leaf ){
2145 nHeader += 4;
2146 }
2147 if( !pPage->zeroData ){
2148 nHeader += putVarint(&pCell[nHeader], nData);
2149 }
2150 nHeader += putVarint(&pCell[nHeader], nKey);
2151
2152 /* Fill in the payload */
2153 if( pPage->zeroData ){
2154 nData = 0;
2155 }
drh3aac2dd2004-04-26 14:10:20 +00002156 nPayload = nData;
2157 if( pPage->intKey ){
2158 pSrc = pData;
2159 nSrc = nData;
drh91025292004-05-03 19:49:32 +00002160 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00002161 }else{
2162 nPayload += nKey;
2163 pSrc = pKey;
2164 nSrc = nKey;
2165 }
drh4b70f112004-05-02 21:12:19 +00002166 if( nPayload>pBt->maxLocal ){
2167 *pnSize = nHeader + pBt->maxLocal + 4;
2168 }else{
2169 *pnSize = nHeader + nPayload;
2170 }
drh3aac2dd2004-04-26 14:10:20 +00002171 spaceLeft = pBt->maxLocal;
2172 pPayload = &pCell[nHeader];
2173 pPrior = &pPayload[pBt->maxLocal];
drh3b7511c2001-05-26 13:15:44 +00002174
drh3b7511c2001-05-26 13:15:44 +00002175 while( nPayload>0 ){
2176 if( spaceLeft==0 ){
drh3aac2dd2004-04-26 14:10:20 +00002177 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
drh3b7511c2001-05-26 13:15:44 +00002178 if( rc ){
drh9b171272004-05-08 02:03:22 +00002179 releasePage(pToRelease);
drh3aac2dd2004-04-26 14:10:20 +00002180 clearCell(pPage, pCell);
drh3b7511c2001-05-26 13:15:44 +00002181 return rc;
2182 }
drh3aac2dd2004-04-26 14:10:20 +00002183 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00002184 releasePage(pToRelease);
2185 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00002186 pPrior = pOvfl->aData;
2187 put4byte(pPrior, 0);
2188 pPayload = &pOvfl->aData[4];
2189 spaceLeft = pBt->pageSize - 4;
drh3b7511c2001-05-26 13:15:44 +00002190 }
2191 n = nPayload;
2192 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00002193 if( n>nSrc ) n = nSrc;
2194 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00002195 nPayload -= n;
drhde647132004-05-07 17:57:49 +00002196 pPayload += n;
drh9b171272004-05-08 02:03:22 +00002197 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00002198 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00002199 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00002200 if( nSrc==0 ){
2201 nSrc = nData;
2202 pSrc = pData;
2203 }
drhdd793422001-06-28 01:54:48 +00002204 }
drh9b171272004-05-08 02:03:22 +00002205 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00002206 return SQLITE_OK;
2207}
2208
2209/*
drhbd03cae2001-06-02 02:40:57 +00002210** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00002211** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00002212** pointer in the third argument.
2213*/
drh4b70f112004-05-02 21:12:19 +00002214static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00002215 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00002216 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00002217
drhdd793422001-06-28 01:54:48 +00002218 if( pgno==0 ) return;
drh4b70f112004-05-02 21:12:19 +00002219 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00002220 aData = sqlite3pager_lookup(pBt->pPager, pgno);
2221 pThis = (MemPage*)&aData[pBt->pageSize];
drh6019e162001-07-02 17:51:45 +00002222 if( pThis && pThis->isInit ){
drhdd793422001-06-28 01:54:48 +00002223 if( pThis->pParent!=pNewParent ){
drha34b6762004-05-07 13:30:42 +00002224 if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
drhdd793422001-06-28 01:54:48 +00002225 pThis->pParent = pNewParent;
drha34b6762004-05-07 13:30:42 +00002226 if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
drhdd793422001-06-28 01:54:48 +00002227 }
drh428ae8c2003-01-04 16:48:09 +00002228 pThis->idxParent = idx;
drha34b6762004-05-07 13:30:42 +00002229 sqlite3pager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00002230 }
2231}
2232
2233/*
drh4b70f112004-05-02 21:12:19 +00002234** Change the pParent pointer of all children of pPage to point back
2235** to pPage.
2236**
drhbd03cae2001-06-02 02:40:57 +00002237** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00002238** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00002239**
2240** This routine gets called after you memcpy() one page into
2241** another.
2242*/
drh4b70f112004-05-02 21:12:19 +00002243static void reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002244 int i;
drh4b70f112004-05-02 21:12:19 +00002245 Btree *pBt;
2246
drha34b6762004-05-07 13:30:42 +00002247 if( pPage->leaf ) return;
drh4b70f112004-05-02 21:12:19 +00002248 pBt = pPage->pBt;
drhbd03cae2001-06-02 02:40:57 +00002249 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002250 reparentPage(pBt, get4byte(&pPage->aCell[i][2]), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002251 }
drh4b70f112004-05-02 21:12:19 +00002252 reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+6]), pPage, i);
drh428ae8c2003-01-04 16:48:09 +00002253 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002254}
2255
2256/*
2257** Remove the i-th cell from pPage. This routine effects pPage only.
2258** The cell content is not freed or deallocated. It is assumed that
2259** the cell content has been copied someplace else. This routine just
2260** removes the reference to the cell from pPage.
2261**
2262** "sz" must be the number of bytes in the cell.
2263**
2264** Do not bother maintaining the integrity of the linked list of Cells.
drha34b6762004-05-07 13:30:42 +00002265** Only the pPage->aCell[] array is important. The relinkCellList()
drh8c42ca92001-06-22 19:15:00 +00002266** routine will be called soon after this routine in order to rebuild
2267** the linked list.
drh14acc042001-06-10 19:56:58 +00002268*/
drh4b70f112004-05-02 21:12:19 +00002269static void dropCell(MemPage *pPage, int idx, int sz){
drhde647132004-05-07 17:57:49 +00002270 int j, pc;
drh8c42ca92001-06-22 19:15:00 +00002271 assert( idx>=0 && idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002272 assert( sz==cellSize(pPage, pPage->aCell[idx]) );
drha34b6762004-05-07 13:30:42 +00002273 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002274 assert( pPage->aCell[idx]>=pPage->aData );
2275 assert( pPage->aCell[idx]<&pPage->aData[pPage->pBt->pageSize-sz] );
drhde647132004-05-07 17:57:49 +00002276 pc = Addr(pPage->aCell[idx]) - Addr(pPage->aData);
2277 assert( pc>pPage->hdrOffset && pc+sz<=pPage->pBt->pageSize );
2278 freeSpace(pPage, pc, sz);
drh7c717f72001-06-24 20:39:41 +00002279 for(j=idx; j<pPage->nCell-1; j++){
drh4b70f112004-05-02 21:12:19 +00002280 pPage->aCell[j] = pPage->aCell[j+1];
drh14acc042001-06-10 19:56:58 +00002281 }
2282 pPage->nCell--;
drh428ae8c2003-01-04 16:48:09 +00002283 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002284}
2285
2286/*
2287** Insert a new cell on pPage at cell index "i". pCell points to the
2288** content of the cell.
2289**
2290** If the cell content will fit on the page, then put it there. If it
drha34b6762004-05-07 13:30:42 +00002291** will not fit, then just make pPage->aCell[i] point to the content
drh14acc042001-06-10 19:56:58 +00002292** and set pPage->isOverfull.
2293**
2294** Do not bother maintaining the integrity of the linked list of Cells.
drha34b6762004-05-07 13:30:42 +00002295** Only the pPage->aCell[] array is important. The relinkCellList()
drh8c42ca92001-06-22 19:15:00 +00002296** routine will be called soon after this routine in order to rebuild
2297** the linked list.
drh14acc042001-06-10 19:56:58 +00002298*/
drh4b70f112004-05-02 21:12:19 +00002299static void insertCell(MemPage *pPage, int i, unsigned char *pCell, int sz){
drh14acc042001-06-10 19:56:58 +00002300 int idx, j;
2301 assert( i>=0 && i<=pPage->nCell );
drha34b6762004-05-07 13:30:42 +00002302 assert( sz==cellSize(pPage, pCell) );
2303 assert( sqlite3pager_iswriteable(pPage->aData) );
2304 idx = allocateSpace(pPage, sz);
drh4b70f112004-05-02 21:12:19 +00002305 resizeCellArray(pPage, pPage->nCell+1);
drh14acc042001-06-10 19:56:58 +00002306 for(j=pPage->nCell; j>i; j--){
drh4b70f112004-05-02 21:12:19 +00002307 pPage->aCell[j] = pPage->aCell[j-1];
drh14acc042001-06-10 19:56:58 +00002308 }
2309 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002310 if( idx<=0 ){
2311 pPage->isOverfull = 1;
drh4b70f112004-05-02 21:12:19 +00002312 pPage->aCell[i] = pCell;
drh14acc042001-06-10 19:56:58 +00002313 }else{
drh4b70f112004-05-02 21:12:19 +00002314 memcpy(&pPage->aData[idx], pCell, sz);
2315 pPage->aCell[i] = &pPage->aData[idx];
drh14acc042001-06-10 19:56:58 +00002316 }
drh428ae8c2003-01-04 16:48:09 +00002317 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002318}
2319
2320/*
2321** Rebuild the linked list of cells on a page so that the cells
drh4b70f112004-05-02 21:12:19 +00002322** occur in the order specified by the pPage->aCell[] array.
drh8c42ca92001-06-22 19:15:00 +00002323** Invoke this routine once to repair damage after one or more
2324** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002325*/
drh4b70f112004-05-02 21:12:19 +00002326static void relinkCellList(MemPage *pPage){
2327 int i, idxFrom;
drha34b6762004-05-07 13:30:42 +00002328 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002329 idxFrom = pPage->hdrOffset+3;
drh14acc042001-06-10 19:56:58 +00002330 for(i=0; i<pPage->nCell; i++){
drhde647132004-05-07 17:57:49 +00002331 int idx = Addr(pPage->aCell[i]) - Addr(pPage->aData);
drh4b70f112004-05-02 21:12:19 +00002332 assert( idx>pPage->hdrOffset && idx<pPage->pBt->pageSize );
2333 put2byte(&pPage->aData[idxFrom], idx);
2334 idxFrom = idx;
drh14acc042001-06-10 19:56:58 +00002335 }
drh4b70f112004-05-02 21:12:19 +00002336 put2byte(&pPage->aData[idxFrom], 0);
drh14acc042001-06-10 19:56:58 +00002337}
2338
2339/*
drhc8629a12004-05-08 20:07:40 +00002340** GCC does not define the offsetof() macro so we'll have to do it
2341** ourselves.
2342*/
2343#ifndef offsetof
2344#define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
2345#endif
2346
2347/*
drh91025292004-05-03 19:49:32 +00002348** Move the content of the page at pFrom over to pTo. The pFrom->aCell[]
drh4b70f112004-05-02 21:12:19 +00002349** pointers that point into pFrom->aData[] must be adjusted to point
2350** into pTo->aData[] instead. But some pFrom->aCell[] entries might
2351** not point to pFrom->aData[]. Those are unchanged.
drh91025292004-05-03 19:49:32 +00002352**
2353** Over this operation completes, the meta data for pFrom is zeroed.
drh14acc042001-06-10 19:56:58 +00002354*/
drhc8629a12004-05-08 20:07:40 +00002355static void movePage(MemPage *pTo, MemPage *pFrom){
drh14acc042001-06-10 19:56:58 +00002356 uptr from, to;
2357 int i;
drh4b70f112004-05-02 21:12:19 +00002358 int pageSize;
2359 int ofst;
2360
2361 assert( pTo->hdrOffset==0 );
2362 ofst = pFrom->hdrOffset;
drhc8629a12004-05-08 20:07:40 +00002363 pageSize = pFrom->pBt->pageSize;
drh91025292004-05-03 19:49:32 +00002364 sqliteFree(pTo->aCell);
drhc8629a12004-05-08 20:07:40 +00002365 memcpy(pTo->aData, &pFrom->aData[ofst], pageSize - ofst);
2366 memcpy(pTo, pFrom, offsetof(MemPage, aData));
2367 pFrom->isInit = 0;
2368 pFrom->aCell = 0;
drh4b70f112004-05-02 21:12:19 +00002369 assert( pTo->aData[5]<155 );
2370 pTo->aData[5] += ofst;
drh14acc042001-06-10 19:56:58 +00002371 pTo->isOverfull = pFrom->isOverfull;
drh4b70f112004-05-02 21:12:19 +00002372 to = Addr(pTo->aData);
drh91025292004-05-03 19:49:32 +00002373 from = Addr(&pFrom->aData[ofst]);
drh14acc042001-06-10 19:56:58 +00002374 for(i=0; i<pTo->nCell; i++){
drh91025292004-05-03 19:49:32 +00002375 uptr x = Addr(pTo->aCell[i]);
2376 if( x>from && x<from+pageSize-ofst ){
drh4b70f112004-05-02 21:12:19 +00002377 *((uptr*)&pTo->aCell[i]) = x + to - from;
drh14acc042001-06-10 19:56:58 +00002378 }
2379 }
drhbd03cae2001-06-02 02:40:57 +00002380}
2381
2382/*
drhc3b70572003-01-04 19:44:07 +00002383** The following parameters determine how many adjacent pages get involved
2384** in a balancing operation. NN is the number of neighbors on either side
2385** of the page that participate in the balancing operation. NB is the
2386** total number of pages that participate, including the target page and
2387** NN neighbors on either side.
2388**
2389** The minimum value of NN is 1 (of course). Increasing NN above 1
2390** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2391** in exchange for a larger degradation in INSERT and UPDATE performance.
2392** The value of NN appears to give the best results overall.
2393*/
2394#define NN 1 /* Number of neighbors on either side of pPage */
2395#define NB (NN*2+1) /* Total pages involved in the balance */
2396
2397/*
drh8b2f49b2001-06-08 00:21:52 +00002398** This routine redistributes Cells on pPage and up to two siblings
2399** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002400** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002401** though both siblings might come from one side if pPage is the first
2402** or last child of its parent. If pPage has fewer than two siblings
2403** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002404** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002405**
2406** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002407** one in an effort to keep pages between 66% and 100% full. The root page
2408** is special and is allowed to be less than 66% full. If pPage is
2409** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002410** or decreased by one, as necessary, to keep the root page from being
2411** overfull or empty.
2412**
drh4b70f112004-05-02 21:12:19 +00002413** This routine alwyas calls relinkCellList() on its input page regardless of
drh14acc042001-06-10 19:56:58 +00002414** whether or not it does any real balancing. Client routines will typically
2415** invoke insertCell() or dropCell() before calling this routine, so we
2416** need to call relinkCellList() to clean up the mess that those other
2417** routines left behind.
2418**
drh8b2f49b2001-06-08 00:21:52 +00002419** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00002420** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002421** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00002422** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00002423**
drh8c42ca92001-06-22 19:15:00 +00002424** In the course of balancing the siblings of pPage, the parent of pPage
2425** might become overfull or underfull. If that happens, then this routine
2426** is called recursively on the parent.
2427**
drh5e00f6c2001-09-13 13:46:56 +00002428** If this routine fails for any reason, it might leave the database
2429** in a corrupted state. So if this routine fails, the database should
2430** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002431*/
drh4b70f112004-05-02 21:12:19 +00002432static int balance(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00002433 MemPage *pParent; /* The parent of pPage */
drh4b70f112004-05-02 21:12:19 +00002434 Btree *pBt; /* The whole database */
drha34b6762004-05-07 13:30:42 +00002435 int nCell; /* Number of cells in aCell[] */
drh8b2f49b2001-06-08 00:21:52 +00002436 int nOld; /* Number of pages in apOld[] */
2437 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002438 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002439 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00002440 int idx; /* Index of pPage in pParent->aCell[] */
2441 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00002442 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00002443 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
2444 int usableSpace; /* Bytes in pPage beyond the header */
2445 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00002446 int subtotal; /* Subtotal of bytes in cells on one page */
drhc8629a12004-05-08 20:07:40 +00002447 MemPage *extraUnref = 0; /* Unref this page if not zero */
drhc3b70572003-01-04 19:44:07 +00002448 MemPage *apOld[NB]; /* pPage and up to two siblings */
2449 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00002450 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drhc3b70572003-01-04 19:44:07 +00002451 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2452 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2453 int idxDiv[NB]; /* Indices of divider cells in pParent */
drh4b70f112004-05-02 21:12:19 +00002454 u8 *apDiv[NB]; /* Divider cells in pParent */
2455 u8 aTemp[NB][MX_CELL_SIZE]; /* Temporary holding area for apDiv[] */
drha34b6762004-05-07 13:30:42 +00002456 int cntNew[NB+1]; /* Index in aCell[] of cell after i-th page */
drhc3b70572003-01-04 19:44:07 +00002457 int szNew[NB+1]; /* Combined size of cells place on i-th page */
drh4b70f112004-05-02 21:12:19 +00002458 u8 *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
drhc3b70572003-01-04 19:44:07 +00002459 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh4b70f112004-05-02 21:12:19 +00002460 u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)]; /* Space for apCopy[] */
drh8b2f49b2001-06-08 00:21:52 +00002461
drh14acc042001-06-10 19:56:58 +00002462 /*
2463 ** Return without doing any work if pPage is neither overfull nor
2464 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002465 */
drha34b6762004-05-07 13:30:42 +00002466 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002467 pBt = pPage->pBt;
2468 if( !pPage->isOverfull && pPage->nFree<pBt->pageSize/2 && pPage->nCell>=2){
2469 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002470 return SQLITE_OK;
2471 }
2472
2473 /*
drh4b70f112004-05-02 21:12:19 +00002474 ** Find the parent of the page to be balanced. If there is no parent,
2475 ** it means this page is the root page and special rules apply.
drh8b2f49b2001-06-08 00:21:52 +00002476 */
drh14acc042001-06-10 19:56:58 +00002477 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002478 if( pParent==0 ){
2479 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002480 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002481 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002482 if( pPage->nCell==0 ){
drh8856d6a2004-04-29 14:42:46 +00002483 if( pPage->leaf ){
2484 /* The table is completely empty */
2485 relinkCellList(pPage);
2486 }else{
2487 /* The root page is empty but has one child. Transfer the
2488 ** information from that one child into the root page if it
2489 ** will fit. This reduces the depth of the BTree by one.
2490 **
2491 ** If the root page is page 1, it has less space available than
drh4b70f112004-05-02 21:12:19 +00002492 ** its child (due to the 100 byte header that occurs at the beginning
2493 ** of the database fle), so it might not be able to hold all of the
2494 ** information currently contained in the child. If this is the
2495 ** case, then do not do the transfer. Leave page 1 empty except
2496 ** for the right-pointer to the child page. The child page becomes
2497 ** the virtual root of the tree.
drh8b2f49b2001-06-08 00:21:52 +00002498 */
drha34b6762004-05-07 13:30:42 +00002499 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+6]);
2500 assert( pgnoChild>0 && pgnoChild<=sqlite3pager_pagecount(pBt->pPager) );
drh8856d6a2004-04-29 14:42:46 +00002501 rc = getPage(pBt, pgnoChild, &pChild);
drh8b2f49b2001-06-08 00:21:52 +00002502 if( rc ) return rc;
drh8856d6a2004-04-29 14:42:46 +00002503 if( pPage->pgno==1 ){
drh4b70f112004-05-02 21:12:19 +00002504 rc = initPage(pChild, pPage);
drh8856d6a2004-04-29 14:42:46 +00002505 if( rc ) return rc;
2506 if( pChild->nFree>=100 ){
drh4b70f112004-05-02 21:12:19 +00002507 /* The child information will fit on the root page, so do the
2508 ** copy */
2509 zeroPage(pPage, pChild->aData[0]);
2510 resizeCellArray(pPage, pChild->nCell);
2511 for(i=0; i<pChild->nCell; i++){
2512 insertCell(pPage, i, pChild->aCell[i],
2513 cellSize(pChild, pChild->aCell[i]));
2514 }
2515 freePage(pChild);
2516 }else{
2517 /* The child has more information that will fit on the root.
2518 ** The tree is already balanced. Do nothing. */
drh8856d6a2004-04-29 14:42:46 +00002519 }
2520 }else{
drh4b70f112004-05-02 21:12:19 +00002521 memcpy(pPage, pChild, pBt->pageSize);
drh8856d6a2004-04-29 14:42:46 +00002522 pPage->isInit = 0;
drh4b70f112004-05-02 21:12:19 +00002523 pPage->pParent = 0;
2524 rc = initPage(pPage, 0);
drh8856d6a2004-04-29 14:42:46 +00002525 assert( rc==SQLITE_OK );
drh4b70f112004-05-02 21:12:19 +00002526 freePage(pChild);
drh5edc3122001-09-13 21:53:09 +00002527 }
drh4b70f112004-05-02 21:12:19 +00002528 reparentChildPages(pPage);
2529 releasePage(pChild);
drh8b2f49b2001-06-08 00:21:52 +00002530 }
2531 return SQLITE_OK;
2532 }
drh14acc042001-06-10 19:56:58 +00002533 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002534 /* It is OK for the root page to be less than half full.
2535 */
drha34b6762004-05-07 13:30:42 +00002536 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002537 return SQLITE_OK;
2538 }
drh14acc042001-06-10 19:56:58 +00002539 /*
2540 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002541 ** When this happens, Create a new child page and copy the
2542 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002543 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002544 ** child. Then fall thru to the code below which will cause
2545 ** the overfull child page to be split.
2546 */
drh4b70f112004-05-02 21:12:19 +00002547 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
drh14acc042001-06-10 19:56:58 +00002548 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002549 assert( sqlite3pager_iswriteable(pChild->aData) );
drhc8629a12004-05-08 20:07:40 +00002550 movePage(pChild, pPage);
2551 assert( pChild->aData[0]==pPage->aData[pPage->hdrOffset] );
drh14acc042001-06-10 19:56:58 +00002552 pChild->pParent = pPage;
drha34b6762004-05-07 13:30:42 +00002553 sqlite3pager_ref(pPage->aData);
drhc8629a12004-05-08 20:07:40 +00002554 pChild->idxParent = 0;
drh14acc042001-06-10 19:56:58 +00002555 pChild->isOverfull = 1;
drhc8629a12004-05-08 20:07:40 +00002556 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
drh4b70f112004-05-02 21:12:19 +00002557 put4byte(&pPage->aData[pPage->hdrOffset+6], pChild->pgno);
drh8b2f49b2001-06-08 00:21:52 +00002558 pParent = pPage;
2559 pPage = pChild;
drhc8629a12004-05-08 20:07:40 +00002560 extraUnref = pChild;
drh8b2f49b2001-06-08 00:21:52 +00002561 }
drha34b6762004-05-07 13:30:42 +00002562 rc = sqlite3pager_write(pParent->aData);
drh6019e162001-07-02 17:51:45 +00002563 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002564 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002565
drh8b2f49b2001-06-08 00:21:52 +00002566 /*
drh4b70f112004-05-02 21:12:19 +00002567 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00002568 ** to pPage. The "idx" variable is the index of that cell. If pPage
2569 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002570 */
drhbb49aba2003-01-04 18:53:27 +00002571 if( pParent->idxShift ){
drha34b6762004-05-07 13:30:42 +00002572 Pgno pgno;
drh4b70f112004-05-02 21:12:19 +00002573 pgno = pPage->pgno;
drha34b6762004-05-07 13:30:42 +00002574 assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00002575 for(idx=0; idx<pParent->nCell; idx++){
drha34b6762004-05-07 13:30:42 +00002576 if( get4byte(&pParent->aCell[idx][2])==pgno ){
drhbb49aba2003-01-04 18:53:27 +00002577 break;
2578 }
drh8b2f49b2001-06-08 00:21:52 +00002579 }
drh4b70f112004-05-02 21:12:19 +00002580 assert( idx<pParent->nCell
2581 || get4byte(&pParent->aData[pParent->hdrOffset+6])==pgno );
drhbb49aba2003-01-04 18:53:27 +00002582 }else{
2583 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002584 }
drh8b2f49b2001-06-08 00:21:52 +00002585
2586 /*
drh14acc042001-06-10 19:56:58 +00002587 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002588 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002589 */
drh14acc042001-06-10 19:56:58 +00002590 nOld = nNew = 0;
drha34b6762004-05-07 13:30:42 +00002591 sqlite3pager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00002592
2593 /*
drh4b70f112004-05-02 21:12:19 +00002594 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002595 ** the siblings. An attempt is made to find NN siblings on either
2596 ** side of pPage. More siblings are taken from one side, however, if
2597 ** pPage there are fewer than NN siblings on the other side. If pParent
2598 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002599 */
drhc3b70572003-01-04 19:44:07 +00002600 nxDiv = idx - NN;
2601 if( nxDiv + NB > pParent->nCell ){
2602 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002603 }
drhc3b70572003-01-04 19:44:07 +00002604 if( nxDiv<0 ){
2605 nxDiv = 0;
2606 }
drh8b2f49b2001-06-08 00:21:52 +00002607 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002608 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002609 if( k<pParent->nCell ){
2610 idxDiv[i] = k;
drh4b70f112004-05-02 21:12:19 +00002611 apDiv[i] = pParent->aCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002612 nDiv++;
drha34b6762004-05-07 13:30:42 +00002613 assert( !pParent->leaf );
2614 pgnoOld[i] = get4byte(&apDiv[i][2]);
drh14acc042001-06-10 19:56:58 +00002615 }else if( k==pParent->nCell ){
drh4b70f112004-05-02 21:12:19 +00002616 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+6]);
drh14acc042001-06-10 19:56:58 +00002617 }else{
2618 break;
drh8b2f49b2001-06-08 00:21:52 +00002619 }
drhde647132004-05-07 17:57:49 +00002620 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002621 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002622 apOld[i]->idxParent = k;
drh91025292004-05-03 19:49:32 +00002623 apCopy[i] = 0;
2624 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00002625 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002626 }
2627
2628 /*
drh14acc042001-06-10 19:56:58 +00002629 ** Make copies of the content of pPage and its siblings into aOld[].
2630 ** The rest of this function will use data from the copies rather
2631 ** that the original pages since the original pages will be in the
2632 ** process of being overwritten.
2633 */
2634 for(i=0; i<nOld; i++){
drhc8629a12004-05-08 20:07:40 +00002635 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
2636 p->aData = &((u8*)p)[-pBt->pageSize];
2637 p->aCell = 0;
2638 p->hdrOffset = 0;
2639 movePage(p, apOld[i]);
drh14acc042001-06-10 19:56:58 +00002640 }
2641
2642 /*
2643 ** Load pointers to all cells on sibling pages and the divider cells
2644 ** into the local apCell[] array. Make copies of the divider cells
2645 ** into aTemp[] and remove the the divider Cells from pParent.
drh4b70f112004-05-02 21:12:19 +00002646 **
2647 ** If the siblings are on leaf pages, then the child pointers of the
2648 ** divider cells are stripped from the cells before they are copied
2649 ** into aTemp[]. In this wall, all cells in apCell[] are without
2650 ** child pointers. If siblings are not leaves, then all cell in
2651 ** apCell[] include child pointers. Either way, all cells in apCell[]
2652 ** are alike.
drh8b2f49b2001-06-08 00:21:52 +00002653 */
2654 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00002655 leafCorrection = pPage->leaf*4;
drh8b2f49b2001-06-08 00:21:52 +00002656 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002657 MemPage *pOld = apCopy[i];
drh8b2f49b2001-06-08 00:21:52 +00002658 for(j=0; j<pOld->nCell; j++){
drh4b70f112004-05-02 21:12:19 +00002659 apCell[nCell] = pOld->aCell[j];
2660 szCell[nCell] = cellSize(pOld, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002661 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002662 }
2663 if( i<nOld-1 ){
drhc8629a12004-05-08 20:07:40 +00002664 szCell[nCell] = cellSize(pParent, apDiv[i]);
2665 memcpy(aTemp[i], apDiv[i], szCell[nCell]);
drh4b70f112004-05-02 21:12:19 +00002666 apCell[nCell] = &aTemp[i][leafCorrection];
2667 dropCell(pParent, nxDiv, szCell[nCell]);
drhc8629a12004-05-08 20:07:40 +00002668 szCell[nCell] -= leafCorrection;
2669 assert( get4byte(&aTemp[i][2])==pgnoOld[i] );
drh4b70f112004-05-02 21:12:19 +00002670 if( !pOld->leaf ){
2671 assert( leafCorrection==0 );
2672 /* The right pointer of the child page pOld becomes the left
2673 ** pointer of the divider cell */
2674 memcpy(&apCell[nCell][2], &pOld->aData[pOld->hdrOffset+6], 4);
2675 }else{
2676 assert( leafCorrection==4 );
2677 }
drh14acc042001-06-10 19:56:58 +00002678 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002679 }
2680 }
2681
2682 /*
drh6019e162001-07-02 17:51:45 +00002683 ** Figure out the number of pages needed to hold all nCell cells.
2684 ** Store this number in "k". Also compute szNew[] which is the total
2685 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00002686 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00002687 ** cntNew[k] should equal nCell.
2688 **
2689 ** This little patch of code is critical for keeping the tree
2690 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00002691 */
drh4b70f112004-05-02 21:12:19 +00002692 usableSpace = pBt->pageSize - 10 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00002693 for(subtotal=k=i=0; i<nCell; i++){
2694 subtotal += szCell[i];
drh4b70f112004-05-02 21:12:19 +00002695 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00002696 szNew[k] = subtotal - szCell[i];
2697 cntNew[k] = i;
2698 subtotal = 0;
2699 k++;
2700 }
2701 }
2702 szNew[k] = subtotal;
2703 cntNew[k] = nCell;
2704 k++;
2705 for(i=k-1; i>0; i--){
drh4b70f112004-05-02 21:12:19 +00002706 while( szNew[i]<usableSpace/2 ){
drh6019e162001-07-02 17:51:45 +00002707 cntNew[i-1]--;
2708 assert( cntNew[i-1]>0 );
2709 szNew[i] += szCell[cntNew[i-1]];
2710 szNew[i-1] -= szCell[cntNew[i-1]-1];
2711 }
2712 }
2713 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00002714
2715 /*
drh6b308672002-07-08 02:16:37 +00002716 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00002717 */
drh4b70f112004-05-02 21:12:19 +00002718 assert( pPage->pgno>1 );
2719 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00002720 for(i=0; i<k; i++){
drhc8629a12004-05-08 20:07:40 +00002721 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00002722 if( i<nOld ){
drhc8629a12004-05-08 20:07:40 +00002723 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00002724 pgnoNew[i] = pgnoOld[i];
2725 apOld[i] = 0;
drhc8629a12004-05-08 20:07:40 +00002726 sqlite3pager_write(pNew->aData);
drh6b308672002-07-08 02:16:37 +00002727 }else{
drhc8629a12004-05-08 20:07:40 +00002728 rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00002729 if( rc ) goto balance_cleanup;
drhc8629a12004-05-08 20:07:40 +00002730 apNew[i] = pNew;
drh6b308672002-07-08 02:16:37 +00002731 }
drh14acc042001-06-10 19:56:58 +00002732 nNew++;
drhc8629a12004-05-08 20:07:40 +00002733 zeroPage(pNew, pageFlags);
drh8b2f49b2001-06-08 00:21:52 +00002734 }
2735
drh6b308672002-07-08 02:16:37 +00002736 /* Free any old pages that were not reused as new pages.
2737 */
2738 while( i<nOld ){
drh4b70f112004-05-02 21:12:19 +00002739 rc = freePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002740 if( rc ) goto balance_cleanup;
drhc8629a12004-05-08 20:07:40 +00002741 releasePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002742 apOld[i] = 0;
2743 i++;
2744 }
2745
drh8b2f49b2001-06-08 00:21:52 +00002746 /*
drhf9ffac92002-03-02 19:00:31 +00002747 ** Put the new pages in accending order. This helps to
2748 ** keep entries in the disk file in order so that a scan
2749 ** of the table is a linear scan through the file. That
2750 ** in turn helps the operating system to deliver pages
2751 ** from the disk more rapidly.
2752 **
2753 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00002754 ** n is never more than NB (a small constant), that should
2755 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00002756 **
drhc3b70572003-01-04 19:44:07 +00002757 ** When NB==3, this one optimization makes the database
2758 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00002759 */
2760 for(i=0; i<k-1; i++){
2761 int minV = pgnoNew[i];
2762 int minI = i;
2763 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00002764 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00002765 minI = j;
2766 minV = pgnoNew[j];
2767 }
2768 }
2769 if( minI>i ){
2770 int t;
2771 MemPage *pT;
2772 t = pgnoNew[i];
2773 pT = apNew[i];
2774 pgnoNew[i] = pgnoNew[minI];
2775 apNew[i] = apNew[minI];
2776 pgnoNew[minI] = t;
2777 apNew[minI] = pT;
2778 }
2779 }
2780
2781 /*
drh14acc042001-06-10 19:56:58 +00002782 ** Evenly distribute the data in apCell[] across the new pages.
2783 ** Insert divider cells into pParent as necessary.
2784 */
2785 j = 0;
2786 for(i=0; i<nNew; i++){
2787 MemPage *pNew = apNew[i];
drh4b70f112004-05-02 21:12:19 +00002788 assert( pNew->pgno==pgnoNew[i] );
2789 resizeCellArray(pNew, cntNew[i] - j);
drh6019e162001-07-02 17:51:45 +00002790 while( j<cntNew[i] ){
2791 assert( pNew->nFree>=szCell[j] );
drha34b6762004-05-07 13:30:42 +00002792 insertCell(pNew, pNew->nCell, apCell[j], szCell[j]);
drh14acc042001-06-10 19:56:58 +00002793 j++;
2794 }
drh6019e162001-07-02 17:51:45 +00002795 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00002796 assert( !pNew->isOverfull );
drh4b70f112004-05-02 21:12:19 +00002797 relinkCellList(pNew);
drh14acc042001-06-10 19:56:58 +00002798 if( i<nNew-1 && j<nCell ){
drh4b70f112004-05-02 21:12:19 +00002799 u8 *pCell = apCell[j];
2800 if( !pNew->leaf ){
2801 memcpy(&pNew->aData[6], &apCell[j][2], 4);
2802 }else{
2803 pCell -= 4;
2804 }
2805 insertCell(pParent, nxDiv, pCell, szCell[j]+leafCorrection);
2806 put4byte(&pParent->aCell[nxDiv][2], pNew->pgno);
drh14acc042001-06-10 19:56:58 +00002807 j++;
2808 nxDiv++;
2809 }
2810 }
drh6019e162001-07-02 17:51:45 +00002811 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00002812 if( (pageFlags & PTF_LEAF)==0 ){
2813 memcpy(&apNew[nNew-1]->aData[6], &apCopy[nOld-1]->aData[6], 4);
drh14acc042001-06-10 19:56:58 +00002814 }
drh4b70f112004-05-02 21:12:19 +00002815 if( nxDiv==pParent->nCell ){
2816 /* Right-most sibling is the right-most child of pParent */
2817 put4byte(&pParent->aData[pParent->hdrOffset+6], pgnoNew[nNew-1]);
2818 }else{
2819 /* Right-most sibling is the left child of the first entry in pParent
2820 ** past the right-most divider entry */
drha34b6762004-05-07 13:30:42 +00002821 put4byte(&pParent->aCell[nxDiv][2], pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00002822 }
2823
2824 /*
2825 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00002826 */
2827 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00002828 reparentChildPages(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00002829 }
drh4b70f112004-05-02 21:12:19 +00002830 reparentChildPages(pParent);
drh8b2f49b2001-06-08 00:21:52 +00002831
2832 /*
drh14acc042001-06-10 19:56:58 +00002833 ** balance the parent page.
drh8b2f49b2001-06-08 00:21:52 +00002834 */
drhc8629a12004-05-08 20:07:40 +00002835 assert( pPage->isInit );
2836 assert( pParent->isInit );
drh4b70f112004-05-02 21:12:19 +00002837 rc = balance(pParent);
drhc8629a12004-05-08 20:07:40 +00002838
drh8b2f49b2001-06-08 00:21:52 +00002839
2840 /*
drh14acc042001-06-10 19:56:58 +00002841 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00002842 */
drh14acc042001-06-10 19:56:58 +00002843balance_cleanup:
drh8b2f49b2001-06-08 00:21:52 +00002844 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00002845 releasePage(apOld[i]);
2846 if( apCopy[i] ){
drh91025292004-05-03 19:49:32 +00002847 sqliteFree(apCopy[i]->aCell);
2848 }
drh8b2f49b2001-06-08 00:21:52 +00002849 }
drh14acc042001-06-10 19:56:58 +00002850 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00002851 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00002852 }
drh91025292004-05-03 19:49:32 +00002853 releasePage(pParent);
drhc8629a12004-05-08 20:07:40 +00002854 releasePage(extraUnref);
drh8b2f49b2001-06-08 00:21:52 +00002855 return rc;
2856}
2857
2858/*
drhf74b8d92002-09-01 23:20:45 +00002859** This routine checks all cursors that point to the same table
2860** as pCur points to. If any of those cursors were opened with
2861** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
2862** cursors point to the same table were opened with wrFlag==1
2863** then this routine returns SQLITE_OK.
2864**
2865** In addition to checking for read-locks (where a read-lock
2866** means a cursor opened with wrFlag==0) this routine also moves
2867** all cursors other than pCur so that they are pointing to the
2868** first Cell on root page. This is necessary because an insert
2869** or delete might change the number of cells on a page or delete
2870** a page entirely and we do not want to leave any cursors
2871** pointing to non-existant pages or cells.
2872*/
2873static int checkReadLocks(BtCursor *pCur){
2874 BtCursor *p;
2875 assert( pCur->wrFlag );
2876 for(p=pCur->pShared; p!=pCur; p=p->pShared){
2877 assert( p );
2878 assert( p->pgnoRoot==pCur->pgnoRoot );
drha34b6762004-05-07 13:30:42 +00002879 assert( p->pPage->pgno==sqlite3pager_pagenumber(p->pPage->aData) );
drhf74b8d92002-09-01 23:20:45 +00002880 if( p->wrFlag==0 ) return SQLITE_LOCKED;
drh91025292004-05-03 19:49:32 +00002881 if( p->pPage->pgno!=p->pgnoRoot ){
drhf74b8d92002-09-01 23:20:45 +00002882 moveToRoot(p);
2883 }
2884 }
2885 return SQLITE_OK;
2886}
2887
2888/*
drh3b7511c2001-05-26 13:15:44 +00002889** Insert a new record into the BTree. The key is given by (pKey,nKey)
2890** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00002891** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00002892** is left pointing at a random location.
2893**
2894** For an INTKEY table, only the nKey value of the key is used. pKey is
2895** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00002896*/
drh3aac2dd2004-04-26 14:10:20 +00002897int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00002898 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4b70f112004-05-02 21:12:19 +00002899 const void *pKey, u64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00002900 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00002901){
drh3b7511c2001-05-26 13:15:44 +00002902 int rc;
2903 int loc;
drh14acc042001-06-10 19:56:58 +00002904 int szNew;
drh3b7511c2001-05-26 13:15:44 +00002905 MemPage *pPage;
2906 Btree *pBt = pCur->pBt;
drha34b6762004-05-07 13:30:42 +00002907 unsigned char *oldCell;
2908 unsigned char newCell[MX_CELL_SIZE];
drh3b7511c2001-05-26 13:15:44 +00002909
drhc39e0002004-05-07 23:50:57 +00002910 if( pCur->status ){
2911 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00002912 }
drhf74b8d92002-09-01 23:20:45 +00002913 if( !pBt->inTrans || nKey+nData==0 ){
2914 /* Must start a transaction before doing an insert */
2915 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002916 }
drhf74b8d92002-09-01 23:20:45 +00002917 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00002918 if( !pCur->wrFlag ){
2919 return SQLITE_PERM; /* Cursor not open for writing */
2920 }
drhf74b8d92002-09-01 23:20:45 +00002921 if( checkReadLocks(pCur) ){
2922 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2923 }
drh3aac2dd2004-04-26 14:10:20 +00002924 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00002925 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00002926 pPage = pCur->pPage;
drh7aa128d2002-06-21 13:09:16 +00002927 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00002928 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00002929 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002930 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
drh3b7511c2001-05-26 13:15:44 +00002931 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002932 assert( szNew==cellSize(pPage, newCell) );
drh3b7511c2001-05-26 13:15:44 +00002933 if( loc==0 ){
drha34b6762004-05-07 13:30:42 +00002934 int szOld;
2935 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002936 oldCell = pPage->aCell[pCur->idx];
2937 if( !pPage->leaf ){
2938 memcpy(&newCell[2], &oldCell[2], 4);
2939 }
2940 szOld = cellSize(pPage, oldCell);
2941 rc = clearCell(pPage, oldCell);
drh5e2f8b92001-05-28 00:41:15 +00002942 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002943 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00002944 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00002945 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00002946 pCur->idx++;
2947 }else{
drh4b70f112004-05-02 21:12:19 +00002948 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00002949 }
drha34b6762004-05-07 13:30:42 +00002950 insertCell(pPage, pCur->idx, newCell, szNew);
drh4b70f112004-05-02 21:12:19 +00002951 rc = balance(pPage);
drh23e11ca2004-05-04 17:27:28 +00002952 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
drh3fc190c2001-09-14 03:24:23 +00002953 /* fflush(stdout); */
drh4b70f112004-05-02 21:12:19 +00002954 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00002955 return rc;
2956}
2957
2958/*
drh4b70f112004-05-02 21:12:19 +00002959** Delete the entry that the cursor is pointing to. The cursor
2960** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00002961*/
drh3aac2dd2004-04-26 14:10:20 +00002962int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00002963 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00002964 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00002965 int rc;
drh8c42ca92001-06-22 19:15:00 +00002966 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00002967 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00002968
drh7aa128d2002-06-21 13:09:16 +00002969 assert( pPage->isInit );
drhc39e0002004-05-07 23:50:57 +00002970 if( pCur->status ){
2971 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00002972 }
drhf74b8d92002-09-01 23:20:45 +00002973 if( !pBt->inTrans ){
2974 /* Must start a transaction before doing a delete */
2975 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002976 }
drhf74b8d92002-09-01 23:20:45 +00002977 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00002978 if( pCur->idx >= pPage->nCell ){
2979 return SQLITE_ERROR; /* The cursor is not pointing to anything */
2980 }
drhecdc7532001-09-23 02:35:53 +00002981 if( !pCur->wrFlag ){
2982 return SQLITE_PERM; /* Did not open this cursor for writing */
2983 }
drhf74b8d92002-09-01 23:20:45 +00002984 if( checkReadLocks(pCur) ){
2985 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2986 }
drha34b6762004-05-07 13:30:42 +00002987 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00002988 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002989 pCell = pPage->aCell[pCur->idx];
2990 if( !pPage->leaf ){
2991 pgnoChild = get4byte(&pCell[2]);
2992 }
2993 clearCell(pPage, pCell);
2994 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00002995 /*
drh5e00f6c2001-09-13 13:46:56 +00002996 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00002997 ** do something we will leave a hole on an internal page.
2998 ** We have to fill the hole by moving in a cell from a leaf. The
2999 ** next Cell after the one to be deleted is guaranteed to exist and
3000 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00003001 */
drh14acc042001-06-10 19:56:58 +00003002 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00003003 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00003004 int szNext;
drh8c1238a2003-01-02 14:43:55 +00003005 int notUsed;
drhc8629a12004-05-08 20:07:40 +00003006 unsigned char tempbuf[4];
drh14acc042001-06-10 19:56:58 +00003007 getTempCursor(pCur, &leafCur);
drh3aac2dd2004-04-26 14:10:20 +00003008 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00003009 if( rc!=SQLITE_OK ){
drh8a6ac0a2004-02-14 17:35:07 +00003010 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3011 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003012 }
drha34b6762004-05-07 13:30:42 +00003013 rc = sqlite3pager_write(leafCur.pPage->aData);
drh6019e162001-07-02 17:51:45 +00003014 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003015 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3016 pNext = leafCur.pPage->aCell[leafCur.idx];
3017 szNext = cellSize(leafCur.pPage, pNext);
drhc8629a12004-05-08 20:07:40 +00003018 memcpy(tempbuf, &pNext[-2], 4);
drh4b70f112004-05-02 21:12:19 +00003019 put4byte(&pNext[-2], pgnoChild);
drhc8629a12004-05-08 20:07:40 +00003020 insertCell(pPage, pCur->idx, &pNext[-4], szNext+4);
drh4b70f112004-05-02 21:12:19 +00003021 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003022 if( rc ) return rc;
drhc8629a12004-05-08 20:07:40 +00003023 memcpy(&pNext[-2], tempbuf, 4);
drh4b70f112004-05-02 21:12:19 +00003024 dropCell(leafCur.pPage, leafCur.idx, szNext);
3025 rc = balance(leafCur.pPage);
drh8c42ca92001-06-22 19:15:00 +00003026 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00003027 }else{
drh4b70f112004-05-02 21:12:19 +00003028 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3029 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003030 }
drh4b70f112004-05-02 21:12:19 +00003031 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003032 return rc;
drh3b7511c2001-05-26 13:15:44 +00003033}
drh8b2f49b2001-06-08 00:21:52 +00003034
3035/*
drhc6b52df2002-01-04 03:09:29 +00003036** Create a new BTree table. Write into *piTable the page
3037** number for the root page of the new table.
3038**
3039** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00003040** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00003041** are restricted to having a 4-byte integer key and arbitrary data and
3042** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00003043** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00003044*/
drh3aac2dd2004-04-26 14:10:20 +00003045int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
drh8b2f49b2001-06-08 00:21:52 +00003046 MemPage *pRoot;
3047 Pgno pgnoRoot;
3048 int rc;
3049 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003050 /* Must start a transaction first */
3051 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003052 }
drh5df72a52002-06-06 23:16:05 +00003053 if( pBt->readOnly ){
3054 return SQLITE_READONLY;
3055 }
drhbea00b92002-07-08 10:59:50 +00003056 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
drh8b2f49b2001-06-08 00:21:52 +00003057 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003058 assert( sqlite3pager_iswriteable(pRoot->aData) );
drhde647132004-05-07 17:57:49 +00003059 zeroPage(pRoot, flags | PTF_LEAF);
drha34b6762004-05-07 13:30:42 +00003060 sqlite3pager_unref(pRoot->aData);
drh8b2f49b2001-06-08 00:21:52 +00003061 *piTable = (int)pgnoRoot;
3062 return SQLITE_OK;
3063}
3064
3065/*
3066** Erase the given database page and all its children. Return
3067** the page to the freelist.
3068*/
drh4b70f112004-05-02 21:12:19 +00003069static int clearDatabasePage(
3070 Btree *pBt, /* The BTree that contains the table */
3071 Pgno pgno, /* Page number to clear */
3072 MemPage *pParent, /* Parent page. NULL for the root */
3073 int freePageFlag /* Deallocate page if true */
3074){
drh8b2f49b2001-06-08 00:21:52 +00003075 MemPage *pPage;
3076 int rc;
drh4b70f112004-05-02 21:12:19 +00003077 unsigned char *pCell;
3078 int i;
drh8b2f49b2001-06-08 00:21:52 +00003079
drhde647132004-05-07 17:57:49 +00003080 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
drh8b2f49b2001-06-08 00:21:52 +00003081 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003082 rc = sqlite3pager_write(pPage->aData);
drh6019e162001-07-02 17:51:45 +00003083 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003084 for(i=0; i<pPage->nCell; i++){
3085 pCell = pPage->aCell[i];
3086 if( !pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003087 rc = clearDatabasePage(pBt, get4byte(&pCell[2]), pPage->pParent, 1);
drh8b2f49b2001-06-08 00:21:52 +00003088 if( rc ) return rc;
3089 }
drh4b70f112004-05-02 21:12:19 +00003090 rc = clearCell(pPage, pCell);
drh8b2f49b2001-06-08 00:21:52 +00003091 if( rc ) return rc;
3092 }
drha34b6762004-05-07 13:30:42 +00003093 if( !pPage->leaf ){
3094 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[6]), pPage->pParent, 1);
drh2aa679f2001-06-25 02:11:07 +00003095 if( rc ) return rc;
3096 }
3097 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00003098 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003099 }else{
drh4b70f112004-05-02 21:12:19 +00003100 zeroPage(pPage, pPage->aData[0]);
drh2aa679f2001-06-25 02:11:07 +00003101 }
drh4b70f112004-05-02 21:12:19 +00003102 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003103 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003104}
3105
3106/*
3107** Delete all information from a single table in the database.
3108*/
drh3aac2dd2004-04-26 14:10:20 +00003109int sqlite3BtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003110 int rc;
drhf74b8d92002-09-01 23:20:45 +00003111 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003112 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003113 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003114 }
drhf74b8d92002-09-01 23:20:45 +00003115 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3116 if( pCur->pgnoRoot==(Pgno)iTable ){
3117 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3118 moveToRoot(pCur);
3119 }
drhecdc7532001-09-23 02:35:53 +00003120 }
drha34b6762004-05-07 13:30:42 +00003121 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
drh8b2f49b2001-06-08 00:21:52 +00003122 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003123 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00003124 }
drh8c42ca92001-06-22 19:15:00 +00003125 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003126}
3127
3128/*
3129** Erase all information in a table and add the root of the table to
3130** the freelist. Except, the root of the principle table (the one on
3131** page 2) is never added to the freelist.
3132*/
drh3aac2dd2004-04-26 14:10:20 +00003133int sqlite3BtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003134 int rc;
3135 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00003136 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003137 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003138 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003139 }
drhf74b8d92002-09-01 23:20:45 +00003140 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3141 if( pCur->pgnoRoot==(Pgno)iTable ){
3142 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
3143 }
drh5df72a52002-06-06 23:16:05 +00003144 }
drha34b6762004-05-07 13:30:42 +00003145 rc = getPage(pBt, (Pgno)iTable, &pPage);
drh2aa679f2001-06-25 02:11:07 +00003146 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003147 rc = sqlite3BtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00003148 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003149 if( iTable>1 ){
drha34b6762004-05-07 13:30:42 +00003150 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003151 }else{
drha34b6762004-05-07 13:30:42 +00003152 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
drh8b2f49b2001-06-08 00:21:52 +00003153 }
drh4b70f112004-05-02 21:12:19 +00003154 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00003155 return rc;
3156}
3157
drh001bbcb2003-03-19 03:14:00 +00003158
drh8b2f49b2001-06-08 00:21:52 +00003159/*
drh23e11ca2004-05-04 17:27:28 +00003160** Read the meta-information out of a database file. Meta[0]
3161** is the number of free pages currently in the database. Meta[1]
3162** through meta[15] are available for use by higher layers.
drh8b2f49b2001-06-08 00:21:52 +00003163*/
drh3aac2dd2004-04-26 14:10:20 +00003164int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00003165 int rc;
drh4b70f112004-05-02 21:12:19 +00003166 unsigned char *pP1;
drh8b2f49b2001-06-08 00:21:52 +00003167
drh23e11ca2004-05-04 17:27:28 +00003168 assert( idx>=0 && idx<=15 );
drha34b6762004-05-07 13:30:42 +00003169 rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00003170 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003171 *pMeta = get4byte(&pP1[36 + idx*4]);
drha34b6762004-05-07 13:30:42 +00003172 sqlite3pager_unref(pP1);
drh8b2f49b2001-06-08 00:21:52 +00003173 return SQLITE_OK;
3174}
3175
3176/*
drh23e11ca2004-05-04 17:27:28 +00003177** Write meta-information back into the database. Meta[0] is
3178** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00003179*/
drh3aac2dd2004-04-26 14:10:20 +00003180int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
drh4b70f112004-05-02 21:12:19 +00003181 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00003182 int rc;
drh23e11ca2004-05-04 17:27:28 +00003183 assert( idx>=1 && idx<=15 );
drh8b2f49b2001-06-08 00:21:52 +00003184 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003185 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003186 }
drhde647132004-05-07 17:57:49 +00003187 assert( pBt->pPage1!=0 );
3188 pP1 = pBt->pPage1->aData;
drha34b6762004-05-07 13:30:42 +00003189 rc = sqlite3pager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00003190 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003191 put4byte(&pP1[36 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00003192 return SQLITE_OK;
3193}
drh8c42ca92001-06-22 19:15:00 +00003194
drh5eddca62001-06-30 21:53:53 +00003195/******************************************************************************
3196** The complete implementation of the BTree subsystem is above this line.
3197** All the code the follows is for testing and troubleshooting the BTree
3198** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003199******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003200
drh8c42ca92001-06-22 19:15:00 +00003201/*
3202** Print a disassembly of the given page on standard output. This routine
3203** is used for debugging and testing only.
3204*/
drhaaab5722002-02-19 13:39:21 +00003205#ifdef SQLITE_TEST
drh23e11ca2004-05-04 17:27:28 +00003206int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003207 int rc;
3208 MemPage *pPage;
drhc8629a12004-05-08 20:07:40 +00003209 int i, j, c;
drh8c42ca92001-06-22 19:15:00 +00003210 int nFree;
3211 u16 idx;
drhab9f7f12004-05-08 10:56:11 +00003212 int hdr;
3213 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00003214 char range[20];
3215 unsigned char payload[20];
drhab9f7f12004-05-08 10:56:11 +00003216
drh4b70f112004-05-02 21:12:19 +00003217 rc = getPage(pBt, (Pgno)pgno, &pPage);
drh8c42ca92001-06-22 19:15:00 +00003218 if( rc ){
3219 return rc;
3220 }
drhab9f7f12004-05-08 10:56:11 +00003221 hdr = pPage->hdrOffset;
3222 data = pPage->aData;
drhc8629a12004-05-08 20:07:40 +00003223 c = data[hdr];
3224 pPage->intKey = (c & PTF_INTKEY)!=0;
3225 pPage->zeroData = (c & PTF_ZERODATA)!=0;
3226 pPage->leaf = (c & PTF_LEAF)!=0;
drha34b6762004-05-07 13:30:42 +00003227 printf("PAGE %d: flags=0x%02x frag=%d\n", pgno,
drhab9f7f12004-05-08 10:56:11 +00003228 data[hdr], data[hdr+5]);
drh8c42ca92001-06-22 19:15:00 +00003229 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003230 assert( hdr == (pgno==1 ? 100 : 0) );
3231 idx = get2byte(&data[hdr+3]);
drh4b70f112004-05-02 21:12:19 +00003232 while( idx>0 && idx<=pBt->pageSize ){
3233 u64 nData, nKey;
3234 int nHeader;
3235 Pgno child;
drhab9f7f12004-05-08 10:56:11 +00003236 unsigned char *pCell = &data[idx];
drh4b70f112004-05-02 21:12:19 +00003237 int sz = cellSize(pPage, pCell);
drh8c42ca92001-06-22 19:15:00 +00003238 sprintf(range,"%d..%d", idx, idx+sz-1);
drh4b70f112004-05-02 21:12:19 +00003239 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3240 if( pPage->leaf ){
3241 child = 0;
3242 }else{
3243 child = get4byte(&pCell[2]);
3244 }
drha34b6762004-05-07 13:30:42 +00003245 sz = nData;
3246 if( !pPage->intKey ) sz += nKey;
drh8c42ca92001-06-22 19:15:00 +00003247 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
drha34b6762004-05-07 13:30:42 +00003248 memcpy(payload, &pCell[nHeader], sz);
drh8c42ca92001-06-22 19:15:00 +00003249 for(j=0; j<sz; j++){
3250 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3251 }
3252 payload[sz] = 0;
3253 printf(
drha34b6762004-05-07 13:30:42 +00003254 "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4lld payload=%s\n",
3255 i, range, child, nKey, nData, payload
drh8c42ca92001-06-22 19:15:00 +00003256 );
drh4b70f112004-05-02 21:12:19 +00003257 if( pPage->isInit && pPage->aCell[i]!=pCell ){
3258 printf("**** aCell[%d] does not match on prior entry ****\n", i);
drh2aa679f2001-06-25 02:11:07 +00003259 }
drh7c717f72001-06-24 20:39:41 +00003260 i++;
drh4b70f112004-05-02 21:12:19 +00003261 idx = get2byte(pCell);
drh8c42ca92001-06-22 19:15:00 +00003262 }
3263 if( idx!=0 ){
3264 printf("ERROR: next cell index out of range: %d\n", idx);
3265 }
drh4b70f112004-05-02 21:12:19 +00003266 if( !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003267 printf("right_child: %d\n", get4byte(&data[6]));
drh4b70f112004-05-02 21:12:19 +00003268 }
drh8c42ca92001-06-22 19:15:00 +00003269 nFree = 0;
3270 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003271 idx = get2byte(&data[hdr+1]);
drhc39e0002004-05-07 23:50:57 +00003272 while( idx>0 && idx<pPage->pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003273 int sz = get2byte(&data[idx+2]);
drh4b70f112004-05-02 21:12:19 +00003274 sprintf(range,"%d..%d", idx, idx+sz-1);
3275 nFree += sz;
drh8c42ca92001-06-22 19:15:00 +00003276 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00003277 i, range, sz, nFree);
drhab9f7f12004-05-08 10:56:11 +00003278 idx = get2byte(&data[idx]);
drh2aa679f2001-06-25 02:11:07 +00003279 i++;
drh8c42ca92001-06-22 19:15:00 +00003280 }
3281 if( idx!=0 ){
3282 printf("ERROR: next freeblock index out of range: %d\n", idx);
3283 }
drha34b6762004-05-07 13:30:42 +00003284 if( recursive && !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003285 idx = get2byte(&data[hdr+3]);
drha34b6762004-05-07 13:30:42 +00003286 while( idx>0 && idx<pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003287 unsigned char *pCell = &data[idx];
drha34b6762004-05-07 13:30:42 +00003288 sqlite3BtreePageDump(pBt, get4byte(&pCell[2]), 1);
3289 idx = get2byte(pCell);
drh6019e162001-07-02 17:51:45 +00003290 }
drhab9f7f12004-05-08 10:56:11 +00003291 sqlite3BtreePageDump(pBt, get4byte(&data[hdr+6]), 1);
drh6019e162001-07-02 17:51:45 +00003292 }
drhab9f7f12004-05-08 10:56:11 +00003293 sqlite3pager_unref(data);
drh8c42ca92001-06-22 19:15:00 +00003294 return SQLITE_OK;
3295}
drhaaab5722002-02-19 13:39:21 +00003296#endif
drh8c42ca92001-06-22 19:15:00 +00003297
drhaaab5722002-02-19 13:39:21 +00003298#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003299/*
drha34b6762004-05-07 13:30:42 +00003300** Return the flag byte at the beginning of the page that the cursor
3301** is currently pointing to.
3302*/
3303int sqlite3BtreeFlags(BtCursor *pCur){
3304 return pCur->pPage->aData[pCur->pPage->hdrOffset];
3305}
3306#endif
3307
3308#ifdef SQLITE_TEST
3309/*
drh2aa679f2001-06-25 02:11:07 +00003310** Fill aResult[] with information about the entry and page that the
3311** cursor is pointing to.
3312**
3313** aResult[0] = The page number
3314** aResult[1] = The entry number
3315** aResult[2] = Total number of entries on this page
3316** aResult[3] = Size of this entry
3317** aResult[4] = Number of free bytes on this page
3318** aResult[5] = Number of free blocks on the page
3319** aResult[6] = Page number of the left child of this entry
3320** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003321**
3322** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003323*/
drha34b6762004-05-07 13:30:42 +00003324int sqlite3BtreeCursorDump(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003325 int cnt, idx;
3326 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00003327 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003328 aResult[0] = sqlite3pager_pagenumber(pPage->aData);
drh91025292004-05-03 19:49:32 +00003329 assert( aResult[0]==pPage->pgno );
drh8c42ca92001-06-22 19:15:00 +00003330 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003331 aResult[2] = pPage->nCell;
3332 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh4b70f112004-05-02 21:12:19 +00003333 aResult[3] = cellSize(pPage, pPage->aCell[pCur->idx]);
3334 aResult[6] = pPage->leaf ? 0 : get4byte(&pPage->aCell[pCur->idx][2]);
drh2aa679f2001-06-25 02:11:07 +00003335 }else{
3336 aResult[3] = 0;
3337 aResult[6] = 0;
3338 }
3339 aResult[4] = pPage->nFree;
3340 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00003341 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhc39e0002004-05-07 23:50:57 +00003342 while( idx>0 && idx<pPage->pBt->pageSize ){
drh2aa679f2001-06-25 02:11:07 +00003343 cnt++;
drh4b70f112004-05-02 21:12:19 +00003344 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003345 }
3346 aResult[5] = cnt;
drh4b70f112004-05-02 21:12:19 +00003347 aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh8c42ca92001-06-22 19:15:00 +00003348 return SQLITE_OK;
3349}
drhaaab5722002-02-19 13:39:21 +00003350#endif
drhdd793422001-06-28 01:54:48 +00003351
drhdd793422001-06-28 01:54:48 +00003352/*
drh5eddca62001-06-30 21:53:53 +00003353** Return the pager associated with a BTree. This routine is used for
3354** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003355*/
drh3aac2dd2004-04-26 14:10:20 +00003356Pager *sqlite3BtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003357 return pBt->pPager;
3358}
drh5eddca62001-06-30 21:53:53 +00003359
3360/*
3361** This structure is passed around through all the sanity checking routines
3362** in order to keep track of some global state information.
3363*/
drhaaab5722002-02-19 13:39:21 +00003364typedef struct IntegrityCk IntegrityCk;
3365struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003366 Btree *pBt; /* The tree being checked out */
3367 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3368 int nPage; /* Number of pages in the database */
3369 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00003370 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003371};
3372
3373/*
3374** Append a message to the error message string.
3375*/
drhaaab5722002-02-19 13:39:21 +00003376static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003377 if( pCheck->zErrMsg ){
3378 char *zOld = pCheck->zErrMsg;
3379 pCheck->zErrMsg = 0;
danielk19774adee202004-05-08 08:23:19 +00003380 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003381 sqliteFree(zOld);
3382 }else{
danielk19774adee202004-05-08 08:23:19 +00003383 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003384 }
3385}
3386
3387/*
3388** Add 1 to the reference count for page iPage. If this is the second
3389** reference to the page, add an error message to pCheck->zErrMsg.
3390** Return 1 if there are 2 ore more references to the page and 0 if
3391** if this is the first reference to the page.
3392**
3393** Also check that the page number is in bounds.
3394*/
drhaaab5722002-02-19 13:39:21 +00003395static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003396 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003397 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003398 char zBuf[100];
3399 sprintf(zBuf, "invalid page number %d", iPage);
3400 checkAppendMsg(pCheck, zContext, zBuf);
3401 return 1;
3402 }
3403 if( pCheck->anRef[iPage]==1 ){
3404 char zBuf[100];
3405 sprintf(zBuf, "2nd reference to page %d", iPage);
3406 checkAppendMsg(pCheck, zContext, zBuf);
3407 return 1;
3408 }
3409 return (pCheck->anRef[iPage]++)>1;
3410}
3411
3412/*
3413** Check the integrity of the freelist or of an overflow page list.
3414** Verify that the number of pages on the list is N.
3415*/
drh30e58752002-03-02 20:41:57 +00003416static void checkList(
3417 IntegrityCk *pCheck, /* Integrity checking context */
3418 int isFreeList, /* True for a freelist. False for overflow page list */
3419 int iPage, /* Page number for first page in the list */
3420 int N, /* Expected number of pages in the list */
3421 char *zContext /* Context for error messages */
3422){
3423 int i;
drh5eddca62001-06-30 21:53:53 +00003424 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003425 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00003426 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00003427 if( iPage<1 ){
3428 sprintf(zMsg, "%d pages missing from overflow list", N+1);
3429 checkAppendMsg(pCheck, zContext, zMsg);
3430 break;
3431 }
3432 if( checkRef(pCheck, iPage, zContext) ) break;
drha34b6762004-05-07 13:30:42 +00003433 if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
drh5eddca62001-06-30 21:53:53 +00003434 sprintf(zMsg, "failed to get page %d", iPage);
3435 checkAppendMsg(pCheck, zContext, zMsg);
3436 break;
3437 }
drh30e58752002-03-02 20:41:57 +00003438 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00003439 int n = get4byte(&pOvfl[4]);
drh0d316a42002-08-11 20:10:47 +00003440 for(i=0; i<n; i++){
drh4b70f112004-05-02 21:12:19 +00003441 checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
drh30e58752002-03-02 20:41:57 +00003442 }
drh0d316a42002-08-11 20:10:47 +00003443 N -= n;
drh30e58752002-03-02 20:41:57 +00003444 }
drh4b70f112004-05-02 21:12:19 +00003445 iPage = get4byte(pOvfl);
drha34b6762004-05-07 13:30:42 +00003446 sqlite3pager_unref(pOvfl);
drh5eddca62001-06-30 21:53:53 +00003447 }
3448}
3449
3450/*
drh1bffb9c2002-02-03 17:37:36 +00003451** Return negative if zKey1<zKey2.
3452** Return zero if zKey1==zKey2.
3453** Return positive if zKey1>zKey2.
3454*/
3455static int keyCompare(
3456 const char *zKey1, int nKey1,
3457 const char *zKey2, int nKey2
3458){
3459 int min = nKey1>nKey2 ? nKey2 : nKey1;
3460 int c = memcmp(zKey1, zKey2, min);
3461 if( c==0 ){
3462 c = nKey1 - nKey2;
3463 }
3464 return c;
3465}
3466
3467/*
drh5eddca62001-06-30 21:53:53 +00003468** Do various sanity checks on a single page of a tree. Return
3469** the tree depth. Root pages return 0. Parents of root pages
3470** return 1, and so forth.
3471**
3472** These checks are done:
3473**
3474** 1. Make sure that cells and freeblocks do not overlap
3475** but combine to completely cover the page.
3476** 2. Make sure cell keys are in order.
3477** 3. Make sure no key is less than or equal to zLowerBound.
3478** 4. Make sure no key is greater than or equal to zUpperBound.
3479** 5. Check the integrity of overflow pages.
3480** 6. Recursively call checkTreePage on all children.
3481** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003482** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003483** the root of the tree.
3484*/
3485static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003486 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003487 int iPage, /* Page number of the page to check */
3488 MemPage *pParent, /* Parent page */
3489 char *zParentContext, /* Parent context */
3490 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003491 int nLower, /* Number of characters in zLowerBound */
3492 char *zUpperBound, /* All keys should be less than this, if not NULL */
3493 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003494){
3495 MemPage *pPage;
3496 int i, rc, depth, d2, pgno;
3497 char *zKey1, *zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003498 int nKey1, nKey2;
drh5eddca62001-06-30 21:53:53 +00003499 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003500 Btree *pBt;
drh5eddca62001-06-30 21:53:53 +00003501 char zMsg[100];
3502 char zContext[100];
drhd0ba1932004-02-10 01:54:28 +00003503 char hit[SQLITE_USABLE_SIZE];
drh5eddca62001-06-30 21:53:53 +00003504
3505 /* Check that the page exists
3506 */
drh0d316a42002-08-11 20:10:47 +00003507 cur.pBt = pBt = pCheck->pBt;
drh5eddca62001-06-30 21:53:53 +00003508 if( iPage==0 ) return 0;
3509 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3510 sprintf(zContext, "On tree page %d: ", iPage);
drh4b70f112004-05-02 21:12:19 +00003511 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003512 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3513 checkAppendMsg(pCheck, zContext, zMsg);
3514 return 0;
3515 }
drh4b70f112004-05-02 21:12:19 +00003516 if( (rc = initPage(pPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003517 sprintf(zMsg, "initPage() returns error code %d", rc);
3518 checkAppendMsg(pCheck, zContext, zMsg);
drh91025292004-05-03 19:49:32 +00003519 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003520 return 0;
3521 }
3522
drh4b70f112004-05-02 21:12:19 +00003523#if 0
3524
drh5eddca62001-06-30 21:53:53 +00003525 /* Check out all the cells.
3526 */
3527 depth = 0;
drh1bffb9c2002-02-03 17:37:36 +00003528 if( zLowerBound ){
3529 zKey1 = sqliteMalloc( nLower+1 );
3530 memcpy(zKey1, zLowerBound, nLower);
3531 zKey1[nLower] = 0;
3532 }else{
3533 zKey1 = 0;
3534 }
3535 nKey1 = nLower;
drh5eddca62001-06-30 21:53:53 +00003536 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003537 for(i=0; i<pPage->nCell; i++){
drha34b6762004-05-07 13:30:42 +00003538 Cell *pCell = pPage->aCell[i];
drh5eddca62001-06-30 21:53:53 +00003539 int sz;
3540
3541 /* Check payload overflow pages
3542 */
drh0d316a42002-08-11 20:10:47 +00003543 nKey2 = NKEY(pBt, pCell->h);
3544 sz = nKey2 + NDATA(pBt, pCell->h);
drh5eddca62001-06-30 21:53:53 +00003545 sprintf(zContext, "On page %d cell %d: ", iPage, i);
3546 if( sz>MX_LOCAL_PAYLOAD ){
3547 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
drh0d316a42002-08-11 20:10:47 +00003548 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00003549 }
3550
3551 /* Check that keys are in the right order
3552 */
3553 cur.idx = i;
drh8c1238a2003-01-02 14:43:55 +00003554 zKey2 = sqliteMallocRaw( nKey2+1 );
drh1bffb9c2002-02-03 17:37:36 +00003555 getPayload(&cur, 0, nKey2, zKey2);
3556 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
drh5eddca62001-06-30 21:53:53 +00003557 checkAppendMsg(pCheck, zContext, "Key is out of order");
3558 }
3559
3560 /* Check sanity of left child page.
3561 */
drh0d316a42002-08-11 20:10:47 +00003562 pgno = SWAB32(pBt, pCell->h.leftChild);
drh1bffb9c2002-02-03 17:37:36 +00003563 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
drh5eddca62001-06-30 21:53:53 +00003564 if( i>0 && d2!=depth ){
3565 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3566 }
3567 depth = d2;
3568 sqliteFree(zKey1);
3569 zKey1 = zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003570 nKey1 = nKey2;
drh5eddca62001-06-30 21:53:53 +00003571 }
drh0d316a42002-08-11 20:10:47 +00003572 pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
drh5eddca62001-06-30 21:53:53 +00003573 sprintf(zContext, "On page %d at right child: ", iPage);
drh1bffb9c2002-02-03 17:37:36 +00003574 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
drh5eddca62001-06-30 21:53:53 +00003575 sqliteFree(zKey1);
3576
3577 /* Check for complete coverage of the page
3578 */
3579 memset(hit, 0, sizeof(hit));
3580 memset(hit, 1, sizeof(PageHdr));
drhd0ba1932004-02-10 01:54:28 +00003581 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003582 Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3583 int j;
drh0d316a42002-08-11 20:10:47 +00003584 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3585 i = SWAB16(pBt, pCell->h.iNext);
drh5eddca62001-06-30 21:53:53 +00003586 }
drhd0ba1932004-02-10 01:54:28 +00003587 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003588 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3589 int j;
drh0d316a42002-08-11 20:10:47 +00003590 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3591 i = SWAB16(pBt,pFBlk->iNext);
drh5eddca62001-06-30 21:53:53 +00003592 }
drhd0ba1932004-02-10 01:54:28 +00003593 for(i=0; i<SQLITE_USABLE_SIZE; i++){
drh5eddca62001-06-30 21:53:53 +00003594 if( hit[i]==0 ){
3595 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3596 checkAppendMsg(pCheck, zMsg, 0);
3597 break;
3598 }else if( hit[i]>1 ){
3599 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3600 checkAppendMsg(pCheck, zMsg, 0);
3601 break;
3602 }
3603 }
3604
drh6019e162001-07-02 17:51:45 +00003605#endif
3606
drh4b70f112004-05-02 21:12:19 +00003607 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003608 return depth;
3609}
3610
3611/*
3612** This routine does a complete check of the given BTree file. aRoot[] is
3613** an array of pages numbers were each page number is the root page of
3614** a table. nRoot is the number of entries in aRoot.
3615**
3616** If everything checks out, this routine returns NULL. If something is
3617** amiss, an error message is written into memory obtained from malloc()
3618** and a pointer to that error message is returned. The calling function
3619** is responsible for freeing the error message when it is done.
3620*/
drh3aac2dd2004-04-26 14:10:20 +00003621char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003622 int i;
3623 int nRef;
drhaaab5722002-02-19 13:39:21 +00003624 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003625
drha34b6762004-05-07 13:30:42 +00003626 nRef = *sqlite3pager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003627 if( lockBtree(pBt)!=SQLITE_OK ){
3628 return sqliteStrDup("Unable to acquire a read lock on the database");
3629 }
drh5eddca62001-06-30 21:53:53 +00003630 sCheck.pBt = pBt;
3631 sCheck.pPager = pBt->pPager;
drha34b6762004-05-07 13:30:42 +00003632 sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003633 if( sCheck.nPage==0 ){
3634 unlockBtreeIfUnused(pBt);
3635 return 0;
3636 }
drh8c1238a2003-01-02 14:43:55 +00003637 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drh5eddca62001-06-30 21:53:53 +00003638 sCheck.anRef[1] = 1;
3639 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3640 sCheck.zErrMsg = 0;
3641
3642 /* Check the integrity of the freelist
3643 */
drha34b6762004-05-07 13:30:42 +00003644 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
3645 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003646
3647 /* Check all the tables.
3648 */
3649 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00003650 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00003651 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00003652 }
3653
3654 /* Make sure every page in the file is referenced
3655 */
3656 for(i=1; i<=sCheck.nPage; i++){
3657 if( sCheck.anRef[i]==0 ){
3658 char zBuf[100];
3659 sprintf(zBuf, "Page %d is never used", i);
3660 checkAppendMsg(&sCheck, zBuf, 0);
3661 }
3662 }
3663
3664 /* Make sure this analysis did not leave any unref() pages
3665 */
drh5e00f6c2001-09-13 13:46:56 +00003666 unlockBtreeIfUnused(pBt);
drha34b6762004-05-07 13:30:42 +00003667 if( nRef != *sqlite3pager_stats(pBt->pPager) ){
drh5eddca62001-06-30 21:53:53 +00003668 char zBuf[100];
3669 sprintf(zBuf,
3670 "Outstanding page count goes from %d to %d during this analysis",
drha34b6762004-05-07 13:30:42 +00003671 nRef, *sqlite3pager_stats(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00003672 );
3673 checkAppendMsg(&sCheck, zBuf, 0);
3674 }
3675
3676 /* Clean up and report errors.
3677 */
3678 sqliteFree(sCheck.anRef);
3679 return sCheck.zErrMsg;
3680}
paulb95a8862003-04-01 21:16:41 +00003681
drh73509ee2003-04-06 20:44:45 +00003682/*
3683** Return the full pathname of the underlying database file.
3684*/
drh3aac2dd2004-04-26 14:10:20 +00003685const char *sqlite3BtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00003686 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00003687 return sqlite3pager_filename(pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00003688}
3689
3690/*
drhf7c57532003-04-25 13:22:51 +00003691** Copy the complete content of pBtFrom into pBtTo. A transaction
3692** must be active for both files.
3693**
3694** The size of file pBtFrom may be reduced by this operation.
3695** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00003696*/
drh3aac2dd2004-04-26 14:10:20 +00003697int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
drhf7c57532003-04-25 13:22:51 +00003698 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00003699 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00003700
3701 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
drhf7c57532003-04-25 13:22:51 +00003702 if( pBtTo->pCursor ) return SQLITE_BUSY;
drhc39e0002004-05-07 23:50:57 +00003703 memcpy(pBtTo->pPage1, pBtFrom->pPage1, pBtFrom->pageSize);
drha34b6762004-05-07 13:30:42 +00003704 rc = sqlite3pager_overwrite(pBtTo->pPager, 1, pBtFrom->pPage1);
3705 nToPage = sqlite3pager_pagecount(pBtTo->pPager);
3706 nPage = sqlite3pager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00003707 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00003708 void *pPage;
drha34b6762004-05-07 13:30:42 +00003709 rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
drhf7c57532003-04-25 13:22:51 +00003710 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003711 rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
drh2e6d11b2003-04-25 15:37:57 +00003712 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003713 sqlite3pager_unref(pPage);
drhf7c57532003-04-25 13:22:51 +00003714 }
drh2e6d11b2003-04-25 15:37:57 +00003715 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3716 void *pPage;
drha34b6762004-05-07 13:30:42 +00003717 rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
drh2e6d11b2003-04-25 15:37:57 +00003718 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003719 rc = sqlite3pager_write(pPage);
3720 sqlite3pager_unref(pPage);
3721 sqlite3pager_dont_write(pBtTo->pPager, i);
drh2e6d11b2003-04-25 15:37:57 +00003722 }
3723 if( !rc && nPage<nToPage ){
drha34b6762004-05-07 13:30:42 +00003724 rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
drh2e6d11b2003-04-25 15:37:57 +00003725 }
drhf7c57532003-04-25 13:22:51 +00003726 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003727 sqlite3BtreeRollback(pBtTo);
drhf7c57532003-04-25 13:22:51 +00003728 }
3729 return rc;
drh73509ee2003-04-06 20:44:45 +00003730}