blob: 9947e0e0d123b8c10de4d21c27022c752145dda5 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drh3644f082004-05-10 18:45:09 +000012** $Id: btree.c,v 1.123 2004/05/10 18:45:10 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
56** page contain a special header that describes the file. The format
57** of that header is as follows:
58**
59** OFFSET SIZE DESCRIPTION
drhde647132004-05-07 17:57:49 +000060** 0 16 Header string: "SQLite format 3\000"
drh9e572e62004-04-23 23:43:10 +000061** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
64** 20 2 Bytes of unused space at the end of each page
65** 22 2 Maximum allowed local payload per entry
66** 24 8 File change counter
67** 32 4 First freelist page
68** 36 4 Number of freelist pages in the file
69** 40 60 15 4-byte meta values passed to higher layers
70**
71** All of the integer values are big-endian (most significant byte first).
72** The file change counter is incremented every time the database is changed.
73** This allows other processes to know when the file has changed and thus
74** when they need to flush their cache.
75**
76** Each btree page begins with a header described below. Note that the
77** header for page one begins at byte 100. For all other btree pages, the
78** header begins on byte zero.
79**
80** OFFSET SIZE DESCRIPTION
81** 0 1 Flags. 01: leaf, 02: zerodata, 04: intkey, F8: type
82** 1 2 byte offset to the first freeblock
83** 3 2 byte offset to the first cell
84** 5 1 number of fragmented free bytes
85** 6 4 Right child (the Ptr(N+1) value). Omitted if leaf
86**
87** The flags define the format of this btree page. The leaf flag means that
88** this page has no children. The zerodata flag means that this page carries
89** only keys and no data. The intkey flag means that the key is a single
90** variable length integer at the beginning of the payload.
91**
92** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
93** byte are used. The integer consists of all bytes that have bit 8 set and
94** the first byte with bit 8 clear. Unlike fixed-length values, variable-
95** length integers are little-endian. Examples:
96**
97** 0x00 becomes 0x00000000
98** 0x1b becomes 0x0000001b
99** 0x9b 0x4a becomes 0x00000dca
100** 0x80 0x1b becomes 0x0000001b
101** 0xf8 0xac 0xb1 0x91 0x01 becomes 0x12345678
102** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
103**
104** Variable length integers are used for rowids and to hold the number of
105** bytes of key and data in a btree cell.
106**
107** Unused space within a btree page is collected into a linked list of
108** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
109** to the first freeblock is given in the header. Freeblocks occur in
110** increasing order. Because a freeblock is 4 bytes in size, the minimum
111** size allocation on a btree page is 4 bytes. Because a freeblock must be
112** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
113** exist on the freeblock chain. The total number of such fragmented bytes
114** is recorded in the page header at offset 5.
115**
116** SIZE DESCRIPTION
117** 2 Byte offset of the next freeblock
118** 2 Bytes in this freeblock
119**
120** Cells are of variable length. The first cell begins on the byte defined
121** in the page header. Cells do not necessarily occur in order - they can
122** skip around on the page.
123**
124** SIZE DESCRIPTION
125** 2 Byte offset of the next cell. 0 if this is the last cell
drh3aac2dd2004-04-26 14:10:20 +0000126** 4 Page number of the left child. Omitted if leaf flag is set.
127** var Number of bytes of data. Omitted if the zerodata flag is set.
128** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000129** * Payload
130** 4 First page of the overflow chain. Omitted if no overflow
131**
132** Overflow pages form a linked list. Each page except the last is completely
133** filled with data (pagesize - 4 bytes). The last page can have as little
134** as 1 byte of data.
135**
136** SIZE DESCRIPTION
137** 4 Page number of next overflow page
138** * Data
139**
140** Freelist pages come in two subtypes: trunk pages and leaf pages. The
141** file header points to first in a linked list of trunk page. Each trunk
142** page points to multiple leaf pages. The content of a leaf page is
143** unspecified. A trunk page looks like this:
144**
145** SIZE DESCRIPTION
146** 4 Page number of next trunk page
147** 4 Number of leaf pointers on this page
148** * zero or more pages numbers of leaves
drha059ad02001-04-17 20:09:11 +0000149*/
150#include "sqliteInt.h"
151#include "pager.h"
152#include "btree.h"
153#include <assert.h>
154
drh4b70f112004-05-02 21:12:19 +0000155
156/* Maximum page size. The upper bound on this value is 65536 (a limit
157** imposed by the 2-byte offset at the beginning of each cell.) The
158** maximum page size determines the amount of stack space allocated
159** by many of the routines in this module. On embedded architectures
160** or any machine where memory and especially stack memory is limited,
161** one may wish to chose a smaller value for the maximum page size.
162*/
163#ifndef MX_PAGE_SIZE
164# define MX_PAGE_SIZE 1024
165#endif
166
167/* Individual entries or "cells" are limited in size so that at least
168** this many cells will fit on one page. Changing this value will result
169** in an incompatible database.
170*/
171#define MN_CELLS_PER_PAGE 4
172
173/* The following value is the maximum cell size assuming a maximum page
174** size give above.
175*/
176#define MX_CELL_SIZE ((MX_PAGE_SIZE-10)/MN_CELLS_PER_PAGE)
177
178/* The maximum number of cells on a single page of the database. This
179** assumes a minimum cell size of 3 bytes. Such small cells will be
180** exceedingly rare, but they are possible.
181*/
182#define MX_CELL ((MX_PAGE_SIZE-10)/3)
183
paulb95a8862003-04-01 21:16:41 +0000184/* Forward declarations */
drh3aac2dd2004-04-26 14:10:20 +0000185typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000186
drh8c42ca92001-06-22 19:15:00 +0000187/*
drhbd03cae2001-06-02 02:40:57 +0000188** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000189** SQLite database in order to identify the file as a real database.
drhde647132004-05-07 17:57:49 +0000190** 123456789 123456 */
191static const char zMagicHeader[] = "SQLite format 3";
drh08ed44e2001-04-29 23:32:55 +0000192
193/*
drh4b70f112004-05-02 21:12:19 +0000194** Page type flags. An ORed combination of these flags appear as the
195** first byte of every BTree page.
drh8c42ca92001-06-22 19:15:00 +0000196*/
drhde647132004-05-07 17:57:49 +0000197#define PTF_INTKEY 0x01
drh9e572e62004-04-23 23:43:10 +0000198#define PTF_ZERODATA 0x02
drhde647132004-05-07 17:57:49 +0000199#define PTF_LEAF 0x04
drh4b70f112004-05-02 21:12:19 +0000200/* Idea for the future: PTF_LEAFDATA */
drh8c42ca92001-06-22 19:15:00 +0000201
202/*
drh9e572e62004-04-23 23:43:10 +0000203** As each page of the file is loaded into memory, an instance of the following
204** structure is appended and initialized to zero. This structure stores
205** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000206**
drh72f82862001-05-24 21:06:34 +0000207** The pParent field points back to the parent page. This allows us to
208** walk up the BTree from any leaf to the root. Care must be taken to
209** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000210** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000211*/
212struct MemPage {
drhde647132004-05-07 17:57:49 +0000213 u32 notUsed;
drh3aac2dd2004-04-26 14:10:20 +0000214 u8 isInit; /* True if previously initialized */
drh9e572e62004-04-23 23:43:10 +0000215 u8 idxShift; /* True if Cell indices have changed */
drh3aac2dd2004-04-26 14:10:20 +0000216 u8 isOverfull; /* Some aCell[] do not fit on page */
drh9e572e62004-04-23 23:43:10 +0000217 u8 intKey; /* True if intkey flag is set */
218 u8 leaf; /* True if leaf flag is set */
219 u8 zeroData; /* True if zero data flag is set */
220 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
drhda200cc2004-05-09 11:51:38 +0000221 u8 needRelink; /* True if need to run relinkCellList() */
drh3aac2dd2004-04-26 14:10:20 +0000222 int idxParent; /* Index in pParent->aCell[] of this node */
drh9e572e62004-04-23 23:43:10 +0000223 int nFree; /* Number of free bytes on the page */
drh306dc212001-05-21 13:45:10 +0000224 int nCell; /* Number of entries on this page */
drh4b70f112004-05-02 21:12:19 +0000225 int nCellAlloc; /* Number of slots allocated in aCell[] */
drh9e572e62004-04-23 23:43:10 +0000226 unsigned char **aCell; /* Pointer to start of each cell */
drhc8629a12004-05-08 20:07:40 +0000227 struct Btree *pBt; /* Pointer back to BTree structure */
228
229 unsigned char *aData; /* Pointer back to the start of the page */
230 Pgno pgno; /* Page number for this page */
231 MemPage *pParent; /* The parent of this page. NULL for root */
drh8c42ca92001-06-22 19:15:00 +0000232};
drh7e3b0a02001-04-28 16:52:40 +0000233
234/*
drh3b7511c2001-05-26 13:15:44 +0000235** The in-memory image of a disk page has the auxiliary information appended
236** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
237** that extra information.
238*/
drh3aac2dd2004-04-26 14:10:20 +0000239#define EXTRA_SIZE sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000240
241/*
drha059ad02001-04-17 20:09:11 +0000242** Everything we need to know about an open database
243*/
244struct Btree {
245 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000246 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000247 MemPage *pPage1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000248 u8 inTrans; /* True if a transaction is in progress */
drh3aac2dd2004-04-26 14:10:20 +0000249 u8 inStmt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000250 u8 readOnly; /* True if the underlying file is readonly */
drh9e572e62004-04-23 23:43:10 +0000251 int pageSize; /* Number of usable bytes on each page */
252 int maxLocal; /* Maximum local payload */
drha059ad02001-04-17 20:09:11 +0000253};
254typedef Btree Bt;
255
drh365d68f2001-05-11 11:02:46 +0000256/*
257** A cursor is a pointer to a particular entry in the BTree.
258** The entry is identified by its MemPage and the index in
drha34b6762004-05-07 13:30:42 +0000259** MemPage.aCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000260*/
drh72f82862001-05-24 21:06:34 +0000261struct BtCursor {
drh5e2f8b92001-05-28 00:41:15 +0000262 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000263 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000264 BtCursor *pShared; /* Loop of cursors with the same root page */
drh3aac2dd2004-04-26 14:10:20 +0000265 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
266 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000267 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000268 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000269 int idx; /* Index of the entry in pPage->aCell[] */
drhecdc7532001-09-23 02:35:53 +0000270 u8 wrFlag; /* True if writable */
drh23e11ca2004-05-04 17:27:28 +0000271 u8 iMatch; /* compare result from last sqlite3BtreeMoveto() */
drhc39e0002004-05-07 23:50:57 +0000272 u8 isValid; /* TRUE if points to a valid entry */
273 u8 status; /* Set to SQLITE_ABORT if cursors is invalidated */
drh365d68f2001-05-11 11:02:46 +0000274};
drh7e3b0a02001-04-28 16:52:40 +0000275
drha059ad02001-04-17 20:09:11 +0000276/*
drh3aac2dd2004-04-26 14:10:20 +0000277** Read or write a two-, four-, and eight-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000278*/
drh9e572e62004-04-23 23:43:10 +0000279static u32 get2byte(unsigned char *p){
280 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000281}
drh9e572e62004-04-23 23:43:10 +0000282static u32 get4byte(unsigned char *p){
283 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
284}
drh3aac2dd2004-04-26 14:10:20 +0000285static u64 get8byte(unsigned char *p){
drh9e572e62004-04-23 23:43:10 +0000286 u64 v = get4byte(p);
287 return (v<<32) | get4byte(&p[4]);
288}
289static void put2byte(unsigned char *p, u32 v){
290 p[0] = v>>8;
291 p[1] = v;
292}
293static void put4byte(unsigned char *p, u32 v){
294 p[0] = v>>24;
295 p[1] = v>>16;
296 p[2] = v>>8;
297 p[3] = v;
298}
299static void put8byte(unsigned char *p, u64 v){
300 put4byte(&p[4], v>>32);
301 put4byte(p, v);
302}
303
304/*
305** Read a variable-length integer. Store the result in *pResult.
306** Return the number of bytes in the integer.
307*/
308static unsigned int getVarint(unsigned char *p, u64 *pResult){
309 u64 x = p[0] & 0x7f;
310 int n = 0;
311 while( (p[n++]&0x80)!=0 ){
312 x |= (p[n]&0x7f)<<(n*7);
313 }
314 *pResult = x;
315 return n;
316}
317
318/*
319** Write a variable length integer with value v into p[]. Return
320** the number of bytes written.
321*/
322static unsigned int putVarint(unsigned char *p, u64 v){
323 int i = 0;
324 do{
drhde647132004-05-07 17:57:49 +0000325 p[i++] = (v & 0x7f) | 0x80;
drh9e572e62004-04-23 23:43:10 +0000326 v >>= 7;
327 }while( v!=0 );
drhde647132004-05-07 17:57:49 +0000328 p[i-1] &= 0x7f;
drh9e572e62004-04-23 23:43:10 +0000329 return i;
drh0d316a42002-08-11 20:10:47 +0000330}
331
332/*
drh3aac2dd2004-04-26 14:10:20 +0000333** Parse a cell header and fill in the CellInfo structure.
334*/
335static void parseCellHeader(
336 MemPage *pPage, /* Page containing the cell */
337 unsigned char *pCell, /* The cell */
338 u64 *pnData, /* Number of bytes of data in payload */
339 u64 *pnKey, /* Number of bytes of key, or key value for intKey */
340 int *pnHeader /* Size of header in bytes. Offset to payload */
341){
342 int n;
343 if( pPage->leaf ){
344 n = 2;
345 }else{
346 n = 6;
347 }
348 if( pPage->zeroData ){
349 *pnData = 0;
350 }else{
351 n += getVarint(&pCell[n], pnData);
352 }
drhde647132004-05-07 17:57:49 +0000353 n += getVarint(&pCell[n], pnKey);
drh3aac2dd2004-04-26 14:10:20 +0000354 *pnHeader = n;
355}
356
357/*
drh3b7511c2001-05-26 13:15:44 +0000358** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000359** database page. The number returned includes the Cell header,
360** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000361** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000362** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000363*/
drh9e572e62004-04-23 23:43:10 +0000364static int cellSize(MemPage *pPage, unsigned char *pCell){
drh3aac2dd2004-04-26 14:10:20 +0000365 int n;
drh9e572e62004-04-23 23:43:10 +0000366 u64 nData, nKey;
drh3aac2dd2004-04-26 14:10:20 +0000367 int nPayload, maxPayload;
368
369 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
370 nPayload = (int)nData;
371 if( !pPage->intKey ){
372 nPayload += (int)nKey;
drh3b7511c2001-05-26 13:15:44 +0000373 }
drh3aac2dd2004-04-26 14:10:20 +0000374 maxPayload = pPage->pBt->maxLocal;
drh9e572e62004-04-23 23:43:10 +0000375 if( nPayload>maxPayload ){
376 nPayload = maxPayload + 4;
377 }
378 return n + nPayload;
drh3b7511c2001-05-26 13:15:44 +0000379}
380
381/*
drhda200cc2004-05-09 11:51:38 +0000382** Do sanity checking on a page. Throw an exception if anything is
383** not right.
384**
385** This routine is used for internal error checking only. It is omitted
386** from most builds.
387*/
388#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
389static void _pageIntegrity(MemPage *pPage){
390 int pageSize;
391 u8 *data;
392 int i, idx, c, pc, hdr, nFree;
393 u8 used[MX_PAGE_SIZE];
394
395 pageSize = pPage->pBt->pageSize;
396 assert( pPage->aData==&((unsigned char*)pPage)[-pageSize] );
397 hdr = pPage->hdrOffset;
398 assert( hdr==(pPage->pgno==1 ? 100 : 0) );
399 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
400 c = pPage->aData[hdr];
401 if( pPage->isInit ){
402 assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
403 assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
404 assert( pPage->intKey == ((c & PTF_INTKEY)!=0) );
405 }
406 data = pPage->aData;
407 memset(used, 0, pageSize);
408 for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
409 nFree = 0;
410 pc = get2byte(&data[hdr+1]);
411 while( pc ){
412 int size;
413 assert( pc>0 && pc<pageSize-4 );
414 size = get2byte(&data[pc+2]);
415 assert( pc+size<=pageSize );
416 nFree += size;
417 for(i=pc; i<pc+size; i++){
418 assert( used[i]==0 );
419 used[i] = 1;
420 }
421 pc = get2byte(&data[pc]);
422 }
423 assert( pPage->isInit==0 || pPage->nFree==nFree+data[hdr+5] );
424 idx = 0;
425 pc = get2byte(&data[hdr+3]);
426 while( pc ){
427 int size;
428 assert( pPage->isInit==0 || idx<pPage->nCell );
429 assert( pc>0 && pc<pageSize-4 );
430 assert( pPage->isInit==0 || pPage->aCell[idx]==&data[pc] );
431 size = cellSize(pPage, &data[pc]);
432 assert( pc+size<=pageSize );
433 for(i=pc; i<pc+size; i++){
434 assert( used[i]==0 );
435 used[i] = 1;
436 }
437 pc = get2byte(&data[pc]);
438 idx++;
439 }
440 assert( idx==pPage->nCell );
441 nFree = 0;
442 for(i=0; i<pageSize; i++){
443 assert( used[i]<=1 );
444 if( used[i]==0 ) nFree++;
445 }
446 assert( nFree==data[hdr+5] );
447}
448#define pageIntegrity(X) _pageIntegrity(X)
449#else
450# define pageIntegrity(X)
451#endif
452
453/*
drh72f82862001-05-24 21:06:34 +0000454** Defragment the page given. All Cells are moved to the
455** beginning of the page and all free space is collected
456** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000457*/
drh9e572e62004-04-23 23:43:10 +0000458static void defragmentPage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000459 int pc, i, n, addr;
460 int start, hdr, size;
drh9e572e62004-04-23 23:43:10 +0000461 int leftover;
462 unsigned char *oldPage;
drh23e11ca2004-05-04 17:27:28 +0000463 unsigned char newPage[MX_PAGE_SIZE];
drh2af926b2001-05-15 00:39:25 +0000464
drha34b6762004-05-07 13:30:42 +0000465 assert( sqlite3pager_iswriteable(pPage->aData) );
drh9e572e62004-04-23 23:43:10 +0000466 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000467 assert( pPage->pBt->pageSize <= MX_PAGE_SIZE );
drhda200cc2004-05-09 11:51:38 +0000468 assert( !pPage->needRelink );
469 assert( !pPage->isOverfull );
drh9e572e62004-04-23 23:43:10 +0000470 oldPage = pPage->aData;
471 hdr = pPage->hdrOffset;
drh3aac2dd2004-04-26 14:10:20 +0000472 addr = 3+hdr;
drh9e572e62004-04-23 23:43:10 +0000473 n = 6+hdr;
474 if( !pPage->leaf ){
475 n += 4;
drh2af926b2001-05-15 00:39:25 +0000476 }
drhc39e0002004-05-07 23:50:57 +0000477 memcpy(&newPage[hdr], &oldPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000478 start = n;
drh3aac2dd2004-04-26 14:10:20 +0000479 pc = get2byte(&oldPage[addr]);
drh9e572e62004-04-23 23:43:10 +0000480 i = 0;
481 while( pc>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000482 assert( n<pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000483 size = cellSize(pPage, &oldPage[pc]);
484 memcpy(&newPage[n], &oldPage[pc], size);
drh3aac2dd2004-04-26 14:10:20 +0000485 put2byte(&newPage[addr],n);
drhda200cc2004-05-09 11:51:38 +0000486 assert( pPage->aCell[i]==&oldPage[pc] );
drhc39e0002004-05-07 23:50:57 +0000487 pPage->aCell[i++] = &oldPage[n];
drhda200cc2004-05-09 11:51:38 +0000488 addr = n;
drh9e572e62004-04-23 23:43:10 +0000489 n += size;
drh9e572e62004-04-23 23:43:10 +0000490 pc = get2byte(&oldPage[pc]);
drh2aa679f2001-06-25 02:11:07 +0000491 }
drhc39e0002004-05-07 23:50:57 +0000492 assert( i==pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +0000493 leftover = pPage->pBt->pageSize - n;
drh9e572e62004-04-23 23:43:10 +0000494 assert( leftover>=0 );
495 assert( pPage->nFree==leftover );
496 if( leftover<4 ){
497 oldPage[hdr+5] = leftover;
498 leftover = 0;
drh3aac2dd2004-04-26 14:10:20 +0000499 n = pPage->pBt->pageSize;
drh9e572e62004-04-23 23:43:10 +0000500 }
drhc39e0002004-05-07 23:50:57 +0000501 memcpy(&oldPage[hdr], &newPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000502 if( leftover==0 ){
drhc39e0002004-05-07 23:50:57 +0000503 put2byte(&oldPage[hdr+1], 0);
drh9e572e62004-04-23 23:43:10 +0000504 }else if( leftover>=4 ){
drhc39e0002004-05-07 23:50:57 +0000505 put2byte(&oldPage[hdr+1], n);
drh9e572e62004-04-23 23:43:10 +0000506 put2byte(&oldPage[n], 0);
507 put2byte(&oldPage[n+2], leftover);
508 memset(&oldPage[n+4], 0, leftover-4);
509 }
drhc39e0002004-05-07 23:50:57 +0000510 oldPage[hdr+5] = 0;
drh365d68f2001-05-11 11:02:46 +0000511}
512
drha059ad02001-04-17 20:09:11 +0000513/*
drh9e572e62004-04-23 23:43:10 +0000514** Allocate nByte bytes of space on a page. If nByte is less than
515** 4 it is rounded up to 4.
drhbd03cae2001-06-02 02:40:57 +0000516**
drh9e572e62004-04-23 23:43:10 +0000517** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000518** the new allocation. Or return 0 if there is not enough free
519** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000520**
drh72f82862001-05-24 21:06:34 +0000521** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000522** nBytes of contiguous free space, then this routine automatically
523** calls defragementPage() to consolidate all free space before
524** allocating the new chunk.
drh9e572e62004-04-23 23:43:10 +0000525**
526** Algorithm: Carve a piece off of the first freeblock that is
527** nByte in size or that larger.
drh7e3b0a02001-04-28 16:52:40 +0000528*/
drh9e572e62004-04-23 23:43:10 +0000529static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000530 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000531 int size;
drh24cd67e2004-05-10 16:18:47 +0000532 int nFrag;
drh9e572e62004-04-23 23:43:10 +0000533 unsigned char *data;
drh44ce7e22003-06-17 02:57:17 +0000534#ifndef NDEBUG
535 int cnt = 0;
536#endif
drh72f82862001-05-24 21:06:34 +0000537
drh9e572e62004-04-23 23:43:10 +0000538 data = pPage->aData;
drha34b6762004-05-07 13:30:42 +0000539 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000540 assert( pPage->pBt );
541 if( nByte<4 ) nByte = 4;
drh14acc042001-06-10 19:56:58 +0000542 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
drh9e572e62004-04-23 23:43:10 +0000543 hdr = pPage->hdrOffset;
drh24cd67e2004-05-10 16:18:47 +0000544 nFrag = data[hdr+5];
545 if( nFrag>=60 || nFrag>pPage->nFree-nByte ){
drh9e572e62004-04-23 23:43:10 +0000546 defragmentPage(pPage);
drh2af926b2001-05-15 00:39:25 +0000547 }
drh3aac2dd2004-04-26 14:10:20 +0000548 addr = hdr+1;
549 pc = get2byte(&data[addr]);
550 assert( addr<pc );
drha34b6762004-05-07 13:30:42 +0000551 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000552 while( (size = get2byte(&data[pc+2]))<nByte ){
553 addr = pc;
554 pc = get2byte(&data[addr]);
drha34b6762004-05-07 13:30:42 +0000555 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000556 assert( pc>=addr+size+4 || pc==0 );
drh9e572e62004-04-23 23:43:10 +0000557 if( pc==0 ){
558 assert( (cnt++)==0 );
559 defragmentPage(pPage);
560 assert( data[hdr+5]==0 );
drh3aac2dd2004-04-26 14:10:20 +0000561 addr = pPage->hdrOffset+1;
562 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000563 }
564 }
565 assert( pc>0 && size>=nByte );
drha34b6762004-05-07 13:30:42 +0000566 assert( pc+size<=pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000567 if( size>nByte+4 ){
drhde647132004-05-07 17:57:49 +0000568 int newStart = pc+nByte;
569 put2byte(&data[addr], newStart);
570 put2byte(&data[newStart], get2byte(&data[pc]));
571 put2byte(&data[newStart+2], size-nByte);
drh2af926b2001-05-15 00:39:25 +0000572 }else{
drh3aac2dd2004-04-26 14:10:20 +0000573 put2byte(&data[addr], get2byte(&data[pc]));
drh9e572e62004-04-23 23:43:10 +0000574 data[hdr+5] += size-nByte;
drh2af926b2001-05-15 00:39:25 +0000575 }
576 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000577 assert( pPage->nFree>=0 );
578 return pc;
drh7e3b0a02001-04-28 16:52:40 +0000579}
580
581/*
drh9e572e62004-04-23 23:43:10 +0000582** Return a section of the pPage->aData to the freelist.
583** The first byte of the new free block is pPage->aDisk[start]
584** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000585**
586** Most of the effort here is involved in coalesing adjacent
587** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000588*/
drh9e572e62004-04-23 23:43:10 +0000589static void freeSpace(MemPage *pPage, int start, int size){
590 int end = start + size; /* End of the segment being freed */
drha34b6762004-05-07 13:30:42 +0000591 int addr, pbegin;
drh9e572e62004-04-23 23:43:10 +0000592#ifndef NDEBUG
593 int tsize = 0; /* Total size of all freeblocks */
594#endif
595 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000596
drh9e572e62004-04-23 23:43:10 +0000597 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000598 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000599 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
600 assert( end<=pPage->pBt->pageSize );
601 if( size<4 ) size = 4;
602
603 /* Add the space back into the linked list of freeblocks */
drh3aac2dd2004-04-26 14:10:20 +0000604 addr = pPage->hdrOffset + 1;
605 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drh9e572e62004-04-23 23:43:10 +0000606 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000607 assert( pbegin>addr );
608 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000609 }
drh9e572e62004-04-23 23:43:10 +0000610 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000611 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000612 put2byte(&data[addr], start);
613 put2byte(&data[start], pbegin);
614 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000615 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000616
617 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000618 addr = pPage->hdrOffset + 1;
619 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000620 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000621 assert( pbegin>addr );
drh9e572e62004-04-23 23:43:10 +0000622 assert( pbegin<pPage->pBt->pageSize-4 );
623 pnext = get2byte(&data[pbegin]);
624 psize = get2byte(&data[pbegin+2]);
625 if( pbegin + psize + 3 >= pnext && pnext>0 ){
626 int frag = pnext - (pbegin+psize);
627 assert( frag<=data[pPage->hdrOffset+5] );
628 data[pPage->hdrOffset+5] -= frag;
629 put2byte(&data[pbegin], get2byte(&data[pnext]));
630 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
631 }else{
632 assert( (tsize += psize)>0 );
drh3aac2dd2004-04-26 14:10:20 +0000633 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000634 }
635 }
636 assert( tsize+data[pPage->hdrOffset+5]==pPage->nFree );
drh7e3b0a02001-04-28 16:52:40 +0000637}
638
drh9e572e62004-04-23 23:43:10 +0000639/*
drh4b70f112004-05-02 21:12:19 +0000640** Resize the aCell[] array of the given page so that it is able to
641** hold at least nNewSz entries.
642**
643** Return SQLITE_OK or SQLITE_NOMEM.
644*/
645static int resizeCellArray(MemPage *pPage, int nNewSz){
drha34b6762004-05-07 13:30:42 +0000646 if( pPage->nCellAlloc<nNewSz ){
drh4b70f112004-05-02 21:12:19 +0000647 pPage->aCell = sqliteRealloc(pPage->aCell, nNewSz*sizeof(pPage->aCell[0]) );
danielk197724b03fd2004-05-10 10:34:34 +0000648 if( sqlite3_malloc_failed ) return SQLITE_NOMEM;
drha34b6762004-05-07 13:30:42 +0000649 pPage->nCellAlloc = nNewSz;
drh4b70f112004-05-02 21:12:19 +0000650 }
651 return SQLITE_OK;
652}
653
654/*
drh7e3b0a02001-04-28 16:52:40 +0000655** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000656**
drhbd03cae2001-06-02 02:40:57 +0000657** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +0000658** is the parent of the page being initialized. The root of a
659** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000660**
drh72f82862001-05-24 21:06:34 +0000661** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000662** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000663** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
664** guarantee that the page is well-formed. It only shows that
665** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000666*/
drh9e572e62004-04-23 23:43:10 +0000667static int initPage(
drh3aac2dd2004-04-26 14:10:20 +0000668 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +0000669 MemPage *pParent /* The parent. Might be NULL */
670){
drh3aac2dd2004-04-26 14:10:20 +0000671 int c, pc, i, hdr;
drha34b6762004-05-07 13:30:42 +0000672 unsigned char *data;
673 int pageSize;
drh9e572e62004-04-23 23:43:10 +0000674 int sumCell = 0; /* Total size of all cells */
drh2af926b2001-05-15 00:39:25 +0000675
drh3aac2dd2004-04-26 14:10:20 +0000676 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000677 assert( pParent==0 || pParent->pBt==pPage->pBt );
drha34b6762004-05-07 13:30:42 +0000678 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
drhde647132004-05-07 17:57:49 +0000679 assert( pPage->aData == &((unsigned char*)pPage)[-pPage->pBt->pageSize] );
drhda200cc2004-05-09 11:51:38 +0000680 assert( pPage->pParent==0 || pPage->pParent==pParent );
681 if( pPage->pParent==0 && pParent!=0 ){
682 pPage->pParent = pParent;
drha34b6762004-05-07 13:30:42 +0000683 sqlite3pager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +0000684 }
drhda200cc2004-05-09 11:51:38 +0000685 if( pPage->isInit ) return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +0000686 pPage->nCell = pPage->nCellAlloc = 0;
drhde647132004-05-07 17:57:49 +0000687 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
688 hdr = pPage->hdrOffset;
drha34b6762004-05-07 13:30:42 +0000689 data = pPage->aData;
690 c = data[hdr];
drh9e572e62004-04-23 23:43:10 +0000691 pPage->intKey = (c & PTF_INTKEY)!=0;
692 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh4b70f112004-05-02 21:12:19 +0000693 pPage->leaf = (c & PTF_LEAF)!=0;
drhc8629a12004-05-08 20:07:40 +0000694 pPage->isOverfull = 0;
drhda200cc2004-05-09 11:51:38 +0000695 pPage->needRelink = 0;
drhc8629a12004-05-08 20:07:40 +0000696 pPage->idxShift = 0;
drha34b6762004-05-07 13:30:42 +0000697 pageSize = pPage->pBt->pageSize;
drh2af926b2001-05-15 00:39:25 +0000698
drh9e572e62004-04-23 23:43:10 +0000699 /* Initialize the cell count and cell pointers */
700 pc = get2byte(&data[hdr+3]);
701 while( pc>0 ){
drha34b6762004-05-07 13:30:42 +0000702 if( pc>=pageSize ) return SQLITE_CORRUPT;
703 if( pPage->nCell>pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000704 pPage->nCell++;
705 pc = get2byte(&data[pc]);
706 }
drh4b70f112004-05-02 21:12:19 +0000707 if( resizeCellArray(pPage, pPage->nCell) ){
drh9e572e62004-04-23 23:43:10 +0000708 return SQLITE_NOMEM;
709 }
710 pc = get2byte(&data[hdr+3]);
711 for(i=0; pc>0; i++){
712 pPage->aCell[i] = &data[pc];
drh9e572e62004-04-23 23:43:10 +0000713 sumCell += cellSize(pPage, &data[pc]);
drhde647132004-05-07 17:57:49 +0000714 pc = get2byte(&data[pc]);
drh9e572e62004-04-23 23:43:10 +0000715 }
716
717 /* Compute the total free space on the page */
718 pPage->nFree = data[hdr+5];
719 pc = get2byte(&data[hdr+1]);
720 while( pc>0 ){
721 int next, size;
drha34b6762004-05-07 13:30:42 +0000722 if( pc>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000723 next = get2byte(&data[pc]);
724 size = get2byte(&data[pc+2]);
drh3aac2dd2004-04-26 14:10:20 +0000725 if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000726 pPage->nFree += size;
727 pc = next;
728 }
drha34b6762004-05-07 13:30:42 +0000729 if( pPage->nFree>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000730
731 /* Sanity check: Cells and freespace and header must sum to the size
732 ** a page. */
drha34b6762004-05-07 13:30:42 +0000733 if( sumCell+pPage->nFree+hdr+10-pPage->leaf*4 != pageSize ){
734 return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000735 }
736
drhde647132004-05-07 17:57:49 +0000737 pPage->isInit = 1;
drhda200cc2004-05-09 11:51:38 +0000738 pageIntegrity(pPage);
drh9e572e62004-04-23 23:43:10 +0000739 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000740}
741
742/*
drh8b2f49b2001-06-08 00:21:52 +0000743** Set up a raw page so that it looks like a database page holding
744** no entries.
drhbd03cae2001-06-02 02:40:57 +0000745*/
drh9e572e62004-04-23 23:43:10 +0000746static void zeroPage(MemPage *pPage, int flags){
747 unsigned char *data = pPage->aData;
748 Btree *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +0000749 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +0000750 int first;
751
drhda200cc2004-05-09 11:51:38 +0000752 assert( sqlite3pager_pagenumber(data)==pPage->pgno );
753 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000754 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000755 memset(&data[hdr], 0, pBt->pageSize - hdr);
756 data[hdr] = flags;
drhde647132004-05-07 17:57:49 +0000757 first = hdr + 6 + 4*((flags&PTF_LEAF)==0);
drh9e572e62004-04-23 23:43:10 +0000758 put2byte(&data[hdr+1], first);
759 put2byte(&data[first+2], pBt->pageSize - first);
760 sqliteFree(pPage->aCell);
761 pPage->aCell = 0;
drh8c42ca92001-06-22 19:15:00 +0000762 pPage->nCell = 0;
drhde647132004-05-07 17:57:49 +0000763 pPage->nCellAlloc = 0;
drh9e572e62004-04-23 23:43:10 +0000764 pPage->nFree = pBt->pageSize - first;
765 pPage->intKey = (flags & PTF_INTKEY)!=0;
766 pPage->leaf = (flags & PTF_LEAF)!=0;
767 pPage->zeroData = (flags & PTF_ZERODATA)!=0;
768 pPage->hdrOffset = hdr;
drhda200cc2004-05-09 11:51:38 +0000769 pPage->isOverfull = 0;
770 pPage->needRelink = 0;
771 pPage->idxShift = 0;
772 pPage->isInit = 1;
773 pageIntegrity(pPage);
drhbd03cae2001-06-02 02:40:57 +0000774}
775
776/*
drh3aac2dd2004-04-26 14:10:20 +0000777** Get a page from the pager. Initialize the MemPage.pBt and
778** MemPage.aData elements if needed.
779*/
780static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
781 int rc;
782 unsigned char *aData;
783 MemPage *pPage;
drha34b6762004-05-07 13:30:42 +0000784 rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
drh3aac2dd2004-04-26 14:10:20 +0000785 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +0000786 pPage = (MemPage*)&aData[pBt->pageSize];
drh3aac2dd2004-04-26 14:10:20 +0000787 pPage->aData = aData;
788 pPage->pBt = pBt;
789 pPage->pgno = pgno;
drhde647132004-05-07 17:57:49 +0000790 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
drh3aac2dd2004-04-26 14:10:20 +0000791 *ppPage = pPage;
792 return SQLITE_OK;
793}
794
795/*
drhde647132004-05-07 17:57:49 +0000796** Get a page from the pager and initialize it. This routine
797** is just a convenience wrapper around separate calls to
798** getPage() and initPage().
799*/
800static int getAndInitPage(
801 Btree *pBt, /* The database file */
802 Pgno pgno, /* Number of the page to get */
803 MemPage **ppPage, /* Write the page pointer here */
804 MemPage *pParent /* Parent of the page */
805){
806 int rc;
807 rc = getPage(pBt, pgno, ppPage);
808 if( rc==SQLITE_OK ){
809 rc = initPage(*ppPage, pParent);
810 }
811 return rc;
812}
813
814/*
drh3aac2dd2004-04-26 14:10:20 +0000815** Release a MemPage. This should be called once for each prior
816** call to getPage.
817*/
drh4b70f112004-05-02 21:12:19 +0000818static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000819 if( pPage ){
820 assert( pPage->aData );
821 assert( pPage->pBt );
822 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000823 sqlite3pager_unref(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +0000824 }
825}
826
827/*
drh72f82862001-05-24 21:06:34 +0000828** This routine is called when the reference count for a page
829** reaches zero. We need to unref the pParent pointer when that
830** happens.
831*/
832static void pageDestructor(void *pData){
drh9e572e62004-04-23 23:43:10 +0000833 MemPage *pPage = (MemPage*)&((char*)pData)[SQLITE_PAGE_SIZE];
drhda200cc2004-05-09 11:51:38 +0000834 assert( pPage->isInit==0 || pPage->needRelink==0 );
drh72f82862001-05-24 21:06:34 +0000835 if( pPage->pParent ){
836 MemPage *pParent = pPage->pParent;
837 pPage->pParent = 0;
drha34b6762004-05-07 13:30:42 +0000838 releasePage(pParent);
drh72f82862001-05-24 21:06:34 +0000839 }
drh9e572e62004-04-23 23:43:10 +0000840 sqliteFree(pPage->aCell);
841 pPage->aCell = 0;
drh3aac2dd2004-04-26 14:10:20 +0000842 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +0000843}
844
845/*
drh306dc212001-05-21 13:45:10 +0000846** Open a new database.
847**
848** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000849** for accessing the database. We do not open the database file
850** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000851**
852** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000853** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +0000854** database file will be deleted when sqlite3BtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000855*/
drh23e11ca2004-05-04 17:27:28 +0000856int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +0000857 const char *zFilename, /* Name of the file containing the BTree database */
858 Btree **ppBtree, /* Pointer to new Btree object written here */
859 int nCache, /* Number of cache pages */
860 int flags /* Options */
drh6019e162001-07-02 17:51:45 +0000861){
drha059ad02001-04-17 20:09:11 +0000862 Btree *pBt;
drha34b6762004-05-07 13:30:42 +0000863 int rc;
drha059ad02001-04-17 20:09:11 +0000864
drhd62d3d02003-01-24 12:14:20 +0000865 /*
866 ** The following asserts make sure that structures used by the btree are
867 ** the right size. This is to guard against size changes that result
868 ** when compiling on a different architecture.
869 */
drh9e572e62004-04-23 23:43:10 +0000870 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +0000871 assert( sizeof(u32)==4 );
872 assert( sizeof(u16)==2 );
873 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +0000874 assert( sizeof(ptr)==sizeof(char*) );
875 assert( sizeof(uptr)==sizeof(ptr) );
876
drha059ad02001-04-17 20:09:11 +0000877 pBt = sqliteMalloc( sizeof(*pBt) );
878 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000879 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000880 return SQLITE_NOMEM;
881 }
drh6019e162001-07-02 17:51:45 +0000882 if( nCache<10 ) nCache = 10;
drha34b6762004-05-07 13:30:42 +0000883 rc = sqlite3pager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
drh3aac2dd2004-04-26 14:10:20 +0000884 (flags & BTREE_OMIT_JOURNAL)==0);
drha059ad02001-04-17 20:09:11 +0000885 if( rc!=SQLITE_OK ){
drha34b6762004-05-07 13:30:42 +0000886 if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000887 sqliteFree(pBt);
888 *ppBtree = 0;
889 return rc;
890 }
drha34b6762004-05-07 13:30:42 +0000891 sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000892 pBt->pCursor = 0;
drha34b6762004-05-07 13:30:42 +0000893 pBt->pPage1 = 0;
894 pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
drh4b70f112004-05-02 21:12:19 +0000895 pBt->pageSize = SQLITE_PAGE_SIZE; /* FIX ME - read from header */
drh3a4c1412004-05-09 20:40:11 +0000896
897 /* maxLocal is the maximum amount of payload to store locally for
898 ** a cell. Make sure it is small enough so that at least MN_CELLS_PER_PAGE
899 ** will fit on one page. We assume a 10-byte page header. Besides
900 ** the payload, the cell must store:
901 ** 2-byte pointer to next cell
902 ** 4-byte child pointer
903 ** 9-byte nKey value
904 ** 4-byte nData value
905 ** 4-byte overflow page pointer
906 */
907 pBt->maxLocal = (pBt->pageSize-10)/MN_CELLS_PER_PAGE - 23;
908 assert( pBt->maxLocal + 23 <= MX_CELL_SIZE );
drha059ad02001-04-17 20:09:11 +0000909 *ppBtree = pBt;
910 return SQLITE_OK;
911}
912
913/*
914** Close an open database and invalidate all cursors.
915*/
drh3aac2dd2004-04-26 14:10:20 +0000916int sqlite3BtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000917 while( pBt->pCursor ){
drh3aac2dd2004-04-26 14:10:20 +0000918 sqlite3BtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000919 }
drha34b6762004-05-07 13:30:42 +0000920 sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000921 sqliteFree(pBt);
922 return SQLITE_OK;
923}
924
925/*
drhda47d772002-12-02 04:25:19 +0000926** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000927**
928** The maximum number of cache pages is set to the absolute
929** value of mxPage. If mxPage is negative, the pager will
930** operate asynchronously - it will not stop to do fsync()s
931** to insure data is written to the disk surface before
932** continuing. Transactions still work if synchronous is off,
933** and the database cannot be corrupted if this program
934** crashes. But if the operating system crashes or there is
935** an abrupt power failure when synchronous is off, the database
936** could be left in an inconsistent and unrecoverable state.
937** Synchronous is on by default so database corruption is not
938** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000939*/
drh23e11ca2004-05-04 17:27:28 +0000940int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
drha34b6762004-05-07 13:30:42 +0000941 sqlite3pager_set_cachesize(pBt->pPager, mxPage);
drhf57b14a2001-09-14 18:54:08 +0000942 return SQLITE_OK;
943}
944
945/*
drh973b6e32003-02-12 14:09:42 +0000946** Change the way data is synced to disk in order to increase or decrease
947** how well the database resists damage due to OS crashes and power
948** failures. Level 1 is the same as asynchronous (no syncs() occur and
949** there is a high probability of damage) Level 2 is the default. There
950** is a very low but non-zero probability of damage. Level 3 reduces the
951** probability of damage to near zero but with a write performance reduction.
952*/
drh3aac2dd2004-04-26 14:10:20 +0000953int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
drha34b6762004-05-07 13:30:42 +0000954 sqlite3pager_set_safety_level(pBt->pPager, level);
drh973b6e32003-02-12 14:09:42 +0000955 return SQLITE_OK;
956}
957
958/*
drha34b6762004-05-07 13:30:42 +0000959** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +0000960** also acquire a readlock on that file.
961**
962** SQLITE_OK is returned on success. If the file is not a
963** well-formed database file, then SQLITE_CORRUPT is returned.
964** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
965** is returned if we run out of memory. SQLITE_PROTOCOL is returned
966** if there is a locking protocol violation.
967*/
968static int lockBtree(Btree *pBt){
969 int rc;
drh3aac2dd2004-04-26 14:10:20 +0000970 MemPage *pPage1;
drha34b6762004-05-07 13:30:42 +0000971 if( pBt->pPage1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000972 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +0000973 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +0000974
drh306dc212001-05-21 13:45:10 +0000975
976 /* Do some checking to help insure the file we opened really is
977 ** a valid database file.
978 */
drha34b6762004-05-07 13:30:42 +0000979 if( sqlite3pager_pagecount(pBt->pPager)>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000980 if( memcmp(pPage1->aData, zMagicHeader, 16)!=0 ){
drhc602f9a2004-02-12 19:01:04 +0000981 rc = SQLITE_NOTADB;
drh72f82862001-05-24 21:06:34 +0000982 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +0000983 }
drh9e572e62004-04-23 23:43:10 +0000984 /*** TBD: Other header checks such as page size ****/
drh306dc212001-05-21 13:45:10 +0000985 }
drh3aac2dd2004-04-26 14:10:20 +0000986 pBt->pPage1 = pPage1;
drh306dc212001-05-21 13:45:10 +0000987 return rc;
988
drh72f82862001-05-24 21:06:34 +0000989page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +0000990 releasePage(pPage1);
991 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +0000992 return rc;
drh306dc212001-05-21 13:45:10 +0000993}
994
995/*
drhb8ca3072001-12-05 00:21:20 +0000996** If there are no outstanding cursors and we are not in the middle
997** of a transaction but there is a read lock on the database, then
998** this routine unrefs the first page of the database file which
999** has the effect of releasing the read lock.
1000**
1001** If there are any outstanding cursors, this routine is a no-op.
1002**
1003** If there is a transaction in progress, this routine is a no-op.
1004*/
1005static void unlockBtreeIfUnused(Btree *pBt){
drh3aac2dd2004-04-26 14:10:20 +00001006 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->pPage1!=0 ){
1007 releasePage(pBt->pPage1);
1008 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +00001009 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001010 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001011 }
1012}
1013
1014/*
drh9e572e62004-04-23 23:43:10 +00001015** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001016** file.
drh8b2f49b2001-06-08 00:21:52 +00001017*/
1018static int newDatabase(Btree *pBt){
drh9e572e62004-04-23 23:43:10 +00001019 MemPage *pP1;
1020 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001021 int rc;
drhde647132004-05-07 17:57:49 +00001022 if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001023 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001024 assert( pP1!=0 );
1025 data = pP1->aData;
drha34b6762004-05-07 13:30:42 +00001026 rc = sqlite3pager_write(data);
drh8b2f49b2001-06-08 00:21:52 +00001027 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001028 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1029 assert( sizeof(zMagicHeader)==16 );
1030 put2byte(&data[16], SQLITE_PAGE_SIZE);
1031 data[18] = 1;
1032 data[19] = 1;
1033 put2byte(&data[22], (SQLITE_PAGE_SIZE-10)/4-12);
1034 zeroPage(pP1, PTF_INTKEY|PTF_LEAF);
drh8b2f49b2001-06-08 00:21:52 +00001035 return SQLITE_OK;
1036}
1037
1038/*
drh72f82862001-05-24 21:06:34 +00001039** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001040**
1041** A transaction must be started before attempting any changes
1042** to the database. None of the following routines will work
1043** unless a transaction is started first:
1044**
drh23e11ca2004-05-04 17:27:28 +00001045** sqlite3BtreeCreateTable()
1046** sqlite3BtreeCreateIndex()
1047** sqlite3BtreeClearTable()
1048** sqlite3BtreeDropTable()
1049** sqlite3BtreeInsert()
1050** sqlite3BtreeDelete()
1051** sqlite3BtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +00001052*/
drh3aac2dd2004-04-26 14:10:20 +00001053int sqlite3BtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001054 int rc;
1055 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +00001056 if( pBt->readOnly ) return SQLITE_READONLY;
drh3aac2dd2004-04-26 14:10:20 +00001057 if( pBt->pPage1==0 ){
drh7e3b0a02001-04-28 16:52:40 +00001058 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +00001059 if( rc!=SQLITE_OK ){
1060 return rc;
1061 }
drha059ad02001-04-17 20:09:11 +00001062 }
drha34b6762004-05-07 13:30:42 +00001063 rc = sqlite3pager_begin(pBt->pPage1->aData);
drhf74b8d92002-09-01 23:20:45 +00001064 if( rc==SQLITE_OK ){
1065 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +00001066 }
drhb8ca3072001-12-05 00:21:20 +00001067 if( rc==SQLITE_OK ){
1068 pBt->inTrans = 1;
drh3aac2dd2004-04-26 14:10:20 +00001069 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001070 }else{
1071 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001072 }
drhb8ca3072001-12-05 00:21:20 +00001073 return rc;
drha059ad02001-04-17 20:09:11 +00001074}
1075
1076/*
drh2aa679f2001-06-25 02:11:07 +00001077** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00001078**
1079** This will release the write lock on the database file. If there
1080** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001081*/
drh3aac2dd2004-04-26 14:10:20 +00001082int sqlite3BtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001083 int rc;
drha34b6762004-05-07 13:30:42 +00001084 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +00001085 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001086 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00001087 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001088 return rc;
1089}
1090
1091/*
drhc39e0002004-05-07 23:50:57 +00001092** Invalidate all cursors
1093*/
1094static void invalidateCursors(Btree *pBt){
1095 BtCursor *pCur;
1096 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1097 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001098 if( pPage /* && !pPage->isInit */ ){
1099 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001100 releasePage(pPage);
1101 pCur->pPage = 0;
1102 pCur->isValid = 0;
1103 pCur->status = SQLITE_ABORT;
1104 }
1105 }
1106}
1107
drhda200cc2004-05-09 11:51:38 +00001108#ifdef SQLITE_TEST
1109/*
1110** Print debugging information about all cursors to standard output.
1111*/
1112void sqlite3BtreeCursorList(Btree *pBt){
1113 BtCursor *pCur;
1114 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1115 MemPage *pPage = pCur->pPage;
1116 char *zMode = pCur->wrFlag ? "rw" : "ro";
1117 printf("CURSOR %08x rooted at %4d(%s) currently at %d.%d%s\n",
1118 (int)pCur, pCur->pgnoRoot, zMode,
1119 pPage ? pPage->pgno : 0, pCur->idx,
1120 pCur->isValid ? "" : " eof"
1121 );
1122 }
1123}
1124#endif
1125
drhc39e0002004-05-07 23:50:57 +00001126/*
drhecdc7532001-09-23 02:35:53 +00001127** Rollback the transaction in progress. All cursors will be
1128** invalided by this operation. Any attempt to use a cursor
1129** that was open at the beginning of this operation will result
1130** in an error.
drh5e00f6c2001-09-13 13:46:56 +00001131**
1132** This will release the write lock on the database file. If there
1133** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001134*/
drh3aac2dd2004-04-26 14:10:20 +00001135int sqlite3BtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001136 int rc;
drh24cd67e2004-05-10 16:18:47 +00001137 MemPage *pPage1;
drh7c717f72001-06-24 20:39:41 +00001138 if( pBt->inTrans==0 ) return SQLITE_OK;
1139 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001140 pBt->inStmt = 0;
drh24cd67e2004-05-10 16:18:47 +00001141 if( pBt->readOnly ){
1142 rc = SQLITE_OK;
1143 }else{
1144 rc = sqlite3pager_rollback(pBt->pPager);
1145 /* The rollback may have destroyed the pPage1->aData value. So
1146 ** call getPage() on page 1 again to make sure pPage1->aData is
1147 ** set correctly. */
1148 if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
1149 releasePage(pPage1);
1150 }
1151 }
drhc39e0002004-05-07 23:50:57 +00001152 invalidateCursors(pBt);
drh5e00f6c2001-09-13 13:46:56 +00001153 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001154 return rc;
1155}
1156
1157/*
drh663fc632002-02-02 18:49:19 +00001158** Set the checkpoint for the current transaction. The checkpoint serves
1159** as a sub-transaction that can be rolled back independently of the
1160** main transaction. You must start a transaction before starting a
1161** checkpoint. The checkpoint is ended automatically if the transaction
1162** commits or rolls back.
1163**
1164** Only one checkpoint may be active at a time. It is an error to try
1165** to start a new checkpoint if another checkpoint is already active.
1166*/
drh3aac2dd2004-04-26 14:10:20 +00001167int sqlite3BtreeBeginStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001168 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001169 if( !pBt->inTrans || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00001170 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00001171 }
drha34b6762004-05-07 13:30:42 +00001172 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00001173 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00001174 return rc;
1175}
1176
1177
1178/*
1179** Commit a checkpoint to transaction currently in progress. If no
1180** checkpoint is active, this is a no-op.
1181*/
drh3aac2dd2004-04-26 14:10:20 +00001182int sqlite3BtreeCommitStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001183 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001184 if( pBt->inStmt && !pBt->readOnly ){
drha34b6762004-05-07 13:30:42 +00001185 rc = sqlite3pager_stmt_commit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00001186 }else{
1187 rc = SQLITE_OK;
1188 }
drh3aac2dd2004-04-26 14:10:20 +00001189 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001190 return rc;
1191}
1192
1193/*
1194** Rollback the checkpoint to the current transaction. If there
1195** is no active checkpoint or transaction, this routine is a no-op.
1196**
1197** All cursors will be invalided by this operation. Any attempt
1198** to use a cursor that was open at the beginning of this operation
1199** will result in an error.
1200*/
drh3aac2dd2004-04-26 14:10:20 +00001201int sqlite3BtreeRollbackStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001202 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001203 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drha34b6762004-05-07 13:30:42 +00001204 rc = sqlite3pager_stmt_rollback(pBt->pPager);
drhc39e0002004-05-07 23:50:57 +00001205 invalidateCursors(pBt);
drh3aac2dd2004-04-26 14:10:20 +00001206 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001207 return rc;
1208}
1209
1210/*
drh3aac2dd2004-04-26 14:10:20 +00001211** Default key comparison function to be used if no comparison function
1212** is specified on the sqlite3BtreeCursor() call.
1213*/
1214static int dfltCompare(
1215 void *NotUsed, /* User data is not used */
1216 int n1, const void *p1, /* First key to compare */
1217 int n2, const void *p2 /* Second key to compare */
1218){
1219 int c;
1220 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1221 if( c==0 ){
1222 c = n1 - n2;
1223 }
1224 return c;
1225}
1226
1227/*
drh8b2f49b2001-06-08 00:21:52 +00001228** Create a new cursor for the BTree whose root is on the page
1229** iTable. The act of acquiring a cursor gets a read lock on
1230** the database file.
drh1bee3d72001-10-15 00:44:35 +00001231**
1232** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001233** If wrFlag==1, then the cursor can be used for reading or for
1234** writing if other conditions for writing are also met. These
1235** are the conditions that must be met in order for writing to
1236** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001237**
drhf74b8d92002-09-01 23:20:45 +00001238** 1: The cursor must have been opened with wrFlag==1
1239**
1240** 2: No other cursors may be open with wrFlag==0 on the same table
1241**
1242** 3: The database must be writable (not on read-only media)
1243**
1244** 4: There must be an active transaction.
1245**
1246** Condition 2 warrants further discussion. If any cursor is opened
1247** on a table with wrFlag==0, that prevents all other cursors from
1248** writing to that table. This is a kind of "read-lock". When a cursor
1249** is opened with wrFlag==0 it is guaranteed that the table will not
1250** change as long as the cursor is open. This allows the cursor to
1251** do a sequential scan of the table without having to worry about
1252** entries being inserted or deleted during the scan. Cursors should
1253** be opened with wrFlag==0 only if this read-lock property is needed.
1254** That is to say, cursors should be opened with wrFlag==0 only if they
drh23e11ca2004-05-04 17:27:28 +00001255** intend to use the sqlite3BtreeNext() system call. All other cursors
drhf74b8d92002-09-01 23:20:45 +00001256** should be opened with wrFlag==1 even if they never really intend
1257** to write.
1258**
drh6446c4d2001-12-15 14:22:18 +00001259** No checking is done to make sure that page iTable really is the
1260** root page of a b-tree. If it is not, then the cursor acquired
1261** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00001262**
1263** The comparison function must be logically the same for every cursor
1264** on a particular table. Changing the comparison function will result
1265** in incorrect operations. If the comparison function is NULL, a
1266** default comparison function is used. The comparison function is
1267** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00001268*/
drh3aac2dd2004-04-26 14:10:20 +00001269int sqlite3BtreeCursor(
1270 Btree *pBt, /* The btree */
1271 int iTable, /* Root page of table to open */
1272 int wrFlag, /* 1 to write. 0 read-only */
1273 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1274 void *pArg, /* First arg to xCompare() */
1275 BtCursor **ppCur /* Write new cursor here */
1276){
drha059ad02001-04-17 20:09:11 +00001277 int rc;
drhf74b8d92002-09-01 23:20:45 +00001278 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001279
drha0c9a112004-03-10 13:42:37 +00001280 if( pBt->readOnly && wrFlag ){
1281 *ppCur = 0;
1282 return SQLITE_READONLY;
1283 }
drh4b70f112004-05-02 21:12:19 +00001284 if( pBt->pPage1==0 ){
drha059ad02001-04-17 20:09:11 +00001285 rc = lockBtree(pBt);
1286 if( rc!=SQLITE_OK ){
1287 *ppCur = 0;
1288 return rc;
1289 }
1290 }
1291 pCur = sqliteMalloc( sizeof(*pCur) );
1292 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001293 rc = SQLITE_NOMEM;
1294 goto create_cursor_exception;
1295 }
drh8b2f49b2001-06-08 00:21:52 +00001296 pCur->pgnoRoot = (Pgno)iTable;
drh24cd67e2004-05-10 16:18:47 +00001297 if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
1298 rc = SQLITE_EMPTY;
1299 goto create_cursor_exception;
1300 }
drhde647132004-05-07 17:57:49 +00001301 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00001302 if( rc!=SQLITE_OK ){
1303 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001304 }
drh3aac2dd2004-04-26 14:10:20 +00001305 pCur->xCompare = xCmp ? xCmp : dfltCompare;
1306 pCur->pArg = pArg;
drh14acc042001-06-10 19:56:58 +00001307 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001308 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001309 pCur->idx = 0;
drha059ad02001-04-17 20:09:11 +00001310 pCur->pNext = pBt->pCursor;
1311 if( pCur->pNext ){
1312 pCur->pNext->pPrev = pCur;
1313 }
drh14acc042001-06-10 19:56:58 +00001314 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001315 pRing = pBt->pCursor;
1316 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1317 if( pRing ){
1318 pCur->pShared = pRing->pShared;
1319 pRing->pShared = pCur;
1320 }else{
1321 pCur->pShared = pCur;
1322 }
drha059ad02001-04-17 20:09:11 +00001323 pBt->pCursor = pCur;
drhc39e0002004-05-07 23:50:57 +00001324 pCur->isValid = 0;
1325 pCur->status = SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001326 *ppCur = pCur;
1327 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001328
1329create_cursor_exception:
1330 *ppCur = 0;
1331 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00001332 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001333 sqliteFree(pCur);
1334 }
drh5e00f6c2001-09-13 13:46:56 +00001335 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001336 return rc;
drha059ad02001-04-17 20:09:11 +00001337}
1338
1339/*
drh5e00f6c2001-09-13 13:46:56 +00001340** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001341** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001342*/
drh3aac2dd2004-04-26 14:10:20 +00001343int sqlite3BtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001344 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001345 if( pCur->pPrev ){
1346 pCur->pPrev->pNext = pCur->pNext;
1347 }else{
1348 pBt->pCursor = pCur->pNext;
1349 }
1350 if( pCur->pNext ){
1351 pCur->pNext->pPrev = pCur->pPrev;
1352 }
drh3aac2dd2004-04-26 14:10:20 +00001353 releasePage(pCur->pPage);
drhf74b8d92002-09-01 23:20:45 +00001354 if( pCur->pShared!=pCur ){
1355 BtCursor *pRing = pCur->pShared;
1356 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1357 pRing->pShared = pCur->pShared;
1358 }
drh5e00f6c2001-09-13 13:46:56 +00001359 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001360 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001361 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001362}
1363
drh7e3b0a02001-04-28 16:52:40 +00001364/*
drh5e2f8b92001-05-28 00:41:15 +00001365** Make a temporary cursor by filling in the fields of pTempCur.
1366** The temporary cursor is not on the cursor list for the Btree.
1367*/
drh14acc042001-06-10 19:56:58 +00001368static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001369 memcpy(pTempCur, pCur, sizeof(*pCur));
1370 pTempCur->pNext = 0;
1371 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001372 if( pTempCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001373 sqlite3pager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001374 }
drh5e2f8b92001-05-28 00:41:15 +00001375}
1376
1377/*
drhbd03cae2001-06-02 02:40:57 +00001378** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001379** function above.
1380*/
drh14acc042001-06-10 19:56:58 +00001381static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001382 if( pCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001383 sqlite3pager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001384 }
drh5e2f8b92001-05-28 00:41:15 +00001385}
1386
1387/*
drh3aac2dd2004-04-26 14:10:20 +00001388** Set *pSize to the size of the buffer needed to hold the value of
1389** the key for the current entry. If the cursor is not pointing
1390** to a valid entry, *pSize is set to 0.
1391**
drh4b70f112004-05-02 21:12:19 +00001392** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00001393** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00001394*/
drh3aac2dd2004-04-26 14:10:20 +00001395int sqlite3BtreeKeySize(BtCursor *pCur, u64 *pSize){
drh2af926b2001-05-15 00:39:25 +00001396 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001397 unsigned char *cell;
drh2af926b2001-05-15 00:39:25 +00001398
drhc39e0002004-05-07 23:50:57 +00001399 if( !pCur->isValid ){
drh72f82862001-05-24 21:06:34 +00001400 *pSize = 0;
1401 }else{
drhc39e0002004-05-07 23:50:57 +00001402 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001403 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001404 assert( pPage!=0 );
1405 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1406 cell = pPage->aCell[pCur->idx];
drh3aac2dd2004-04-26 14:10:20 +00001407 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001408 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001409 cell += 4; /* Skip the child pointer */
1410 }
1411 if( !pPage->zeroData ){
drha34b6762004-05-07 13:30:42 +00001412 while( (0x80&*(cell++))!=0 ){} /* Skip the data size number */
drh3aac2dd2004-04-26 14:10:20 +00001413 }
drha34b6762004-05-07 13:30:42 +00001414 getVarint(cell, pSize);
drh72f82862001-05-24 21:06:34 +00001415 }
1416 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001417}
drh2af926b2001-05-15 00:39:25 +00001418
drh72f82862001-05-24 21:06:34 +00001419/*
1420** Read payload information from the entry that the pCur cursor is
1421** pointing to. Begin reading the payload at "offset" and read
1422** a total of "amt" bytes. Put the result in zBuf.
1423**
1424** This routine does not make a distinction between key and data.
1425** It just reads bytes from the payload area.
1426*/
drh3aac2dd2004-04-26 14:10:20 +00001427static int getPayload(
1428 BtCursor *pCur, /* Cursor pointing to entry to read from */
1429 int offset, /* Begin reading this far into payload */
1430 int amt, /* Read this many bytes */
1431 unsigned char *pBuf, /* Write the bytes into this buffer */
1432 int skipKey /* offset begins at data if this is true */
1433){
1434 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001435 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001436 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001437 MemPage *pPage;
1438 Btree *pBt;
1439 u64 nData, nKey;
1440 int maxLocal, ovflSize;
1441
drh72f82862001-05-24 21:06:34 +00001442 assert( pCur!=0 && pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001443 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001444 pBt = pCur->pBt;
1445 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001446 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00001447 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1448 aPayload = pPage->aCell[pCur->idx];
1449 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001450 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001451 aPayload += 4; /* Skip the child pointer */
1452 }
1453 if( pPage->zeroData ){
1454 nData = 0;
1455 }else{
1456 aPayload += getVarint(aPayload, &nData);
1457 }
drha34b6762004-05-07 13:30:42 +00001458 aPayload += getVarint(aPayload, &nKey);
drh3aac2dd2004-04-26 14:10:20 +00001459 if( pPage->intKey ){
1460 nKey = 0;
1461 }
1462 assert( offset>=0 );
1463 if( skipKey ){
1464 offset += nKey;
1465 }
1466 if( offset+amt > nKey+nData ){
drha34b6762004-05-07 13:30:42 +00001467 return SQLITE_ERROR;
drh3aac2dd2004-04-26 14:10:20 +00001468 }
drha34b6762004-05-07 13:30:42 +00001469 maxLocal = pBt->maxLocal;
drh3aac2dd2004-04-26 14:10:20 +00001470 if( offset<maxLocal ){
drh2af926b2001-05-15 00:39:25 +00001471 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001472 if( a+offset>maxLocal ){
1473 a = maxLocal - offset;
drh2af926b2001-05-15 00:39:25 +00001474 }
drha34b6762004-05-07 13:30:42 +00001475 memcpy(pBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001476 if( a==amt ){
1477 return SQLITE_OK;
1478 }
drh2aa679f2001-06-25 02:11:07 +00001479 offset = 0;
drha34b6762004-05-07 13:30:42 +00001480 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00001481 amt -= a;
drhdd793422001-06-28 01:54:48 +00001482 }else{
drh3aac2dd2004-04-26 14:10:20 +00001483 offset -= maxLocal;
drhbd03cae2001-06-02 02:40:57 +00001484 }
1485 if( amt>0 ){
drha34b6762004-05-07 13:30:42 +00001486 nextPage = get4byte(&aPayload[maxLocal]);
drh2af926b2001-05-15 00:39:25 +00001487 }
drh3aac2dd2004-04-26 14:10:20 +00001488 ovflSize = pBt->pageSize - 4;
drh2af926b2001-05-15 00:39:25 +00001489 while( amt>0 && nextPage ){
drha34b6762004-05-07 13:30:42 +00001490 rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
drh2af926b2001-05-15 00:39:25 +00001491 if( rc!=0 ){
1492 return rc;
1493 }
drha34b6762004-05-07 13:30:42 +00001494 nextPage = get4byte(aPayload);
drh3aac2dd2004-04-26 14:10:20 +00001495 if( offset<ovflSize ){
drh2af926b2001-05-15 00:39:25 +00001496 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001497 if( a + offset > ovflSize ){
1498 a = ovflSize - offset;
drh2af926b2001-05-15 00:39:25 +00001499 }
drh9b171272004-05-08 02:03:22 +00001500 memcpy(pBuf, &aPayload[offset+4], a);
drh2aa679f2001-06-25 02:11:07 +00001501 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001502 amt -= a;
drha34b6762004-05-07 13:30:42 +00001503 pBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001504 }else{
drh3aac2dd2004-04-26 14:10:20 +00001505 offset -= ovflSize;
drh2af926b2001-05-15 00:39:25 +00001506 }
drha34b6762004-05-07 13:30:42 +00001507 sqlite3pager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00001508 }
drha7fcb052001-12-14 15:09:55 +00001509 if( amt>0 ){
1510 return SQLITE_CORRUPT;
1511 }
1512 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001513}
1514
drh72f82862001-05-24 21:06:34 +00001515/*
drh3aac2dd2004-04-26 14:10:20 +00001516** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001517** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001518** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00001519**
drh3aac2dd2004-04-26 14:10:20 +00001520** Return SQLITE_OK on success or an error code if anything goes
1521** wrong. An error is returned if "offset+amt" is larger than
1522** the available payload.
drh72f82862001-05-24 21:06:34 +00001523*/
drha34b6762004-05-07 13:30:42 +00001524int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh8c1238a2003-01-02 14:43:55 +00001525 assert( amt>=0 );
1526 assert( offset>=0 );
drhc39e0002004-05-07 23:50:57 +00001527 if( pCur->isValid==0 ){
1528 return pCur->status;
drh3aac2dd2004-04-26 14:10:20 +00001529 }
drhc39e0002004-05-07 23:50:57 +00001530 assert( pCur->pPage!=0 );
1531 assert( pCur->pPage->intKey==0 );
1532 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001533 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1534}
1535
1536/*
1537** Return a pointer to the key of record that cursor pCur
1538** is point to if the entire key is in contiguous memory.
1539** If the key is split up among multiple tables, return 0.
1540** If pCur is not pointing to a valid entry return 0.
1541**
1542** The pointer returned is ephemeral. The key may move
1543** or be destroyed on the next call to any Btree routine.
1544**
1545** This routine is used to do quick key comparisons in the
1546** common case where the entire key fits in the payload area
drh4b70f112004-05-02 21:12:19 +00001547** of a cell and does not overflow onto secondary pages. If
1548** this routine returns 0 for a valid cursor, the caller will
1549** need to allocate a buffer big enough to hold the whole key
1550** then use sqlite3BtreeKey() to copy the key value into the
1551** buffer. That is substantially slower. But fortunately,
1552** most keys are small enough to fit in the payload area so
1553** the slower method is rarely needed.
drh3aac2dd2004-04-26 14:10:20 +00001554*/
1555void *sqlite3BtreeKeyFetch(BtCursor *pCur){
1556 unsigned char *aPayload;
1557 MemPage *pPage;
1558 Btree *pBt;
1559 u64 nData, nKey;
1560
drhc39e0002004-05-07 23:50:57 +00001561 assert( pCur!=0 );
1562 if( !pCur->isValid ){
1563 return 0;
1564 }
1565 assert( pCur->pPage!=0 );
1566 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001567 pBt = pCur->pBt;
1568 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001569 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00001570 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drhc39e0002004-05-07 23:50:57 +00001571 assert( pPage->intKey==0 );
drh3aac2dd2004-04-26 14:10:20 +00001572 aPayload = pPage->aCell[pCur->idx];
1573 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001574 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001575 aPayload += 4; /* Skip the child pointer */
1576 }
1577 if( !pPage->zeroData ){
1578 aPayload += getVarint(aPayload, &nData);
1579 }
drha34b6762004-05-07 13:30:42 +00001580 aPayload += getVarint(aPayload, &nKey);
drhc39e0002004-05-07 23:50:57 +00001581 if( nKey>pBt->maxLocal ){
drh5e00f6c2001-09-13 13:46:56 +00001582 return 0;
drh72f82862001-05-24 21:06:34 +00001583 }
drh3aac2dd2004-04-26 14:10:20 +00001584 return aPayload;
drh72f82862001-05-24 21:06:34 +00001585}
1586
drh3aac2dd2004-04-26 14:10:20 +00001587
drh72f82862001-05-24 21:06:34 +00001588/*
drhbd03cae2001-06-02 02:40:57 +00001589** Set *pSize to the number of bytes of data in the entry the
1590** cursor currently points to. Always return SQLITE_OK.
1591** Failure is not possible. If the cursor is not currently
1592** pointing to an entry (which can happen, for example, if
1593** the database is empty) then *pSize is set to 0.
drh72f82862001-05-24 21:06:34 +00001594*/
drh3aac2dd2004-04-26 14:10:20 +00001595int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drh72f82862001-05-24 21:06:34 +00001596 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001597 unsigned char *cell;
1598 u64 size;
drh72f82862001-05-24 21:06:34 +00001599
drhc39e0002004-05-07 23:50:57 +00001600 if( !pCur->isValid ){
1601 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1602 }
drh72f82862001-05-24 21:06:34 +00001603 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001604 assert( pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001605 assert( pPage->isInit );
drhda200cc2004-05-09 11:51:38 +00001606 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001607 if( pPage->zeroData ){
drh72f82862001-05-24 21:06:34 +00001608 *pSize = 0;
1609 }else{
drhc39e0002004-05-07 23:50:57 +00001610 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001611 cell = pPage->aCell[pCur->idx];
1612 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001613 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001614 cell += 4; /* Skip the child pointer */
1615 }
drha34b6762004-05-07 13:30:42 +00001616 getVarint(cell, &size);
drh3aac2dd2004-04-26 14:10:20 +00001617 assert( (size & 0x00000000ffffffff)==size );
drha34b6762004-05-07 13:30:42 +00001618 *pSize = (u32)size;
drh72f82862001-05-24 21:06:34 +00001619 }
1620 return SQLITE_OK;
1621}
1622
1623/*
drh3aac2dd2004-04-26 14:10:20 +00001624** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001625** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001626** begins at "offset".
1627**
1628** Return SQLITE_OK on success or an error code if anything goes
1629** wrong. An error is returned if "offset+amt" is larger than
1630** the available payload.
drh72f82862001-05-24 21:06:34 +00001631*/
drh3aac2dd2004-04-26 14:10:20 +00001632int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhc39e0002004-05-07 23:50:57 +00001633 if( !pCur->isValid ){
1634 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1635 }
drh8c1238a2003-01-02 14:43:55 +00001636 assert( amt>=0 );
1637 assert( offset>=0 );
1638 assert( pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001639 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001640 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00001641}
1642
drh72f82862001-05-24 21:06:34 +00001643/*
drh8178a752003-01-05 21:41:40 +00001644** Move the cursor down to a new child page. The newPgno argument is the
1645** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001646*/
drh3aac2dd2004-04-26 14:10:20 +00001647static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00001648 int rc;
1649 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00001650 MemPage *pOldPage;
drh0d316a42002-08-11 20:10:47 +00001651 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001652
drhc39e0002004-05-07 23:50:57 +00001653 assert( pCur->isValid );
drhde647132004-05-07 17:57:49 +00001654 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001655 if( rc ) return rc;
drhda200cc2004-05-09 11:51:38 +00001656 pageIntegrity(pNewPage);
drh428ae8c2003-01-04 16:48:09 +00001657 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00001658 pOldPage = pCur->pPage;
1659 pOldPage->idxShift = 0;
1660 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00001661 pCur->pPage = pNewPage;
1662 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001663 if( pNewPage->nCell<1 ){
1664 return SQLITE_CORRUPT;
1665 }
drh72f82862001-05-24 21:06:34 +00001666 return SQLITE_OK;
1667}
1668
1669/*
drh8856d6a2004-04-29 14:42:46 +00001670** Return true if the page is the virtual root of its table.
1671**
1672** The virtual root page is the root page for most tables. But
1673** for the table rooted on page 1, sometime the real root page
1674** is empty except for the right-pointer. In such cases the
1675** virtual root page is the page that the right-pointer of page
1676** 1 is pointing to.
1677*/
1678static int isRootPage(MemPage *pPage){
1679 MemPage *pParent = pPage->pParent;
drhda200cc2004-05-09 11:51:38 +00001680 if( pParent==0 ) return 1;
1681 if( pParent->pgno>1 ) return 0;
1682 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
drh8856d6a2004-04-29 14:42:46 +00001683 return 0;
1684}
1685
1686/*
drh5e2f8b92001-05-28 00:41:15 +00001687** Move the cursor up to the parent page.
1688**
1689** pCur->idx is set to the cell index that contains the pointer
1690** to the page we are coming from. If we are coming from the
1691** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001692** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001693*/
drh8178a752003-01-05 21:41:40 +00001694static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001695 Pgno oldPgno;
1696 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001697 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001698 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001699
drhc39e0002004-05-07 23:50:57 +00001700 assert( pCur->isValid );
drh8178a752003-01-05 21:41:40 +00001701 pPage = pCur->pPage;
1702 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00001703 assert( !isRootPage(pPage) );
drhda200cc2004-05-09 11:51:38 +00001704 pageIntegrity(pPage);
drh8178a752003-01-05 21:41:40 +00001705 pParent = pPage->pParent;
1706 assert( pParent!=0 );
drhda200cc2004-05-09 11:51:38 +00001707 pageIntegrity(pParent);
drh8178a752003-01-05 21:41:40 +00001708 idxParent = pPage->idxParent;
drha34b6762004-05-07 13:30:42 +00001709 sqlite3pager_ref(pParent->aData);
drh3aac2dd2004-04-26 14:10:20 +00001710 oldPgno = pPage->pgno;
1711 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00001712 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001713 assert( pParent->idxShift==0 );
1714 if( pParent->idxShift==0 ){
1715 pCur->idx = idxParent;
1716#ifndef NDEBUG
1717 /* Verify that pCur->idx is the correct index to point back to the child
1718 ** page we just came from
1719 */
drh428ae8c2003-01-04 16:48:09 +00001720 if( pCur->idx<pParent->nCell ){
drha34b6762004-05-07 13:30:42 +00001721 assert( get4byte(&pParent->aCell[idxParent][2])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001722 }else{
drha34b6762004-05-07 13:30:42 +00001723 assert( get4byte(&pParent->aData[pParent->hdrOffset+6])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001724 }
1725#endif
1726 }else{
1727 /* The MemPage.idxShift flag indicates that cell indices might have
1728 ** changed since idxParent was set and hence idxParent might be out
1729 ** of date. So recompute the parent cell index by scanning all cells
1730 ** and locating the one that points to the child we just came from.
1731 */
1732 int i;
1733 pCur->idx = pParent->nCell;
drh428ae8c2003-01-04 16:48:09 +00001734 for(i=0; i<pParent->nCell; i++){
drh3aac2dd2004-04-26 14:10:20 +00001735 if( get4byte(&pParent->aCell[i][2])==oldPgno ){
drh428ae8c2003-01-04 16:48:09 +00001736 pCur->idx = i;
1737 break;
1738 }
drh72f82862001-05-24 21:06:34 +00001739 }
1740 }
1741}
1742
1743/*
1744** Move the cursor to the root page
1745*/
drh5e2f8b92001-05-28 00:41:15 +00001746static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00001747 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00001748 int rc;
drh0d316a42002-08-11 20:10:47 +00001749 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001750
drhde647132004-05-07 17:57:49 +00001751 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
drhc39e0002004-05-07 23:50:57 +00001752 if( rc ){
1753 pCur->isValid = 0;
1754 return rc;
1755 }
drh3aac2dd2004-04-26 14:10:20 +00001756 releasePage(pCur->pPage);
drhda200cc2004-05-09 11:51:38 +00001757 pageIntegrity(pRoot);
drh3aac2dd2004-04-26 14:10:20 +00001758 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00001759 pCur->idx = 0;
drh8856d6a2004-04-29 14:42:46 +00001760 if( pRoot->nCell==0 && !pRoot->leaf ){
1761 Pgno subpage;
1762 assert( pRoot->pgno==1 );
1763 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+6]);
1764 assert( subpage>0 );
drh3644f082004-05-10 18:45:09 +00001765 pCur->isValid = 1;
drh4b70f112004-05-02 21:12:19 +00001766 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00001767 }
drhc39e0002004-05-07 23:50:57 +00001768 pCur->isValid = pCur->pPage->nCell>0;
drh8856d6a2004-04-29 14:42:46 +00001769 return rc;
drh72f82862001-05-24 21:06:34 +00001770}
drh2af926b2001-05-15 00:39:25 +00001771
drh5e2f8b92001-05-28 00:41:15 +00001772/*
1773** Move the cursor down to the left-most leaf entry beneath the
1774** entry to which it is currently pointing.
1775*/
1776static int moveToLeftmost(BtCursor *pCur){
1777 Pgno pgno;
1778 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001779 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00001780
drhc39e0002004-05-07 23:50:57 +00001781 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001782 while( !(pPage = pCur->pPage)->leaf ){
drha34b6762004-05-07 13:30:42 +00001783 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1784 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001785 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001786 if( rc ) return rc;
1787 }
1788 return SQLITE_OK;
1789}
1790
drh2dcc9aa2002-12-04 13:40:25 +00001791/*
1792** Move the cursor down to the right-most leaf entry beneath the
1793** page to which it is currently pointing. Notice the difference
1794** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1795** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1796** finds the right-most entry beneath the *page*.
1797*/
1798static int moveToRightmost(BtCursor *pCur){
1799 Pgno pgno;
1800 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001801 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001802
drhc39e0002004-05-07 23:50:57 +00001803 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001804 while( !(pPage = pCur->pPage)->leaf ){
1805 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
1806 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001807 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001808 if( rc ) return rc;
1809 }
drh3aac2dd2004-04-26 14:10:20 +00001810 pCur->idx = pPage->nCell - 1;
drh2dcc9aa2002-12-04 13:40:25 +00001811 return SQLITE_OK;
1812}
1813
drh5e00f6c2001-09-13 13:46:56 +00001814/* Move the cursor to the first entry in the table. Return SQLITE_OK
1815** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001816** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001817*/
drh3aac2dd2004-04-26 14:10:20 +00001818int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001819 int rc;
drhc39e0002004-05-07 23:50:57 +00001820 if( pCur->status ){
1821 return pCur->status;
1822 }
drh5e00f6c2001-09-13 13:46:56 +00001823 rc = moveToRoot(pCur);
1824 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001825 if( pCur->isValid==0 ){
1826 assert( pCur->pPage->nCell==0 );
drh5e00f6c2001-09-13 13:46:56 +00001827 *pRes = 1;
1828 return SQLITE_OK;
1829 }
drhc39e0002004-05-07 23:50:57 +00001830 assert( pCur->pPage->nCell>0 );
drh5e00f6c2001-09-13 13:46:56 +00001831 *pRes = 0;
1832 rc = moveToLeftmost(pCur);
1833 return rc;
1834}
drh5e2f8b92001-05-28 00:41:15 +00001835
drh9562b552002-02-19 15:00:07 +00001836/* Move the cursor to the last entry in the table. Return SQLITE_OK
1837** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001838** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001839*/
drh3aac2dd2004-04-26 14:10:20 +00001840int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001841 int rc;
drhc39e0002004-05-07 23:50:57 +00001842 if( pCur->status ){
1843 return pCur->status;
1844 }
drh9562b552002-02-19 15:00:07 +00001845 rc = moveToRoot(pCur);
1846 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001847 if( pCur->isValid==0 ){
1848 assert( pCur->pPage->nCell==0 );
drh9562b552002-02-19 15:00:07 +00001849 *pRes = 1;
1850 return SQLITE_OK;
1851 }
drhc39e0002004-05-07 23:50:57 +00001852 assert( pCur->isValid );
drh9562b552002-02-19 15:00:07 +00001853 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001854 rc = moveToRightmost(pCur);
drh9562b552002-02-19 15:00:07 +00001855 return rc;
1856}
1857
drh3aac2dd2004-04-26 14:10:20 +00001858/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00001859** Return a success code.
1860**
drh3aac2dd2004-04-26 14:10:20 +00001861** For INTKEY tables, only the nKey parameter is used. pKey is
1862** ignored. For other tables, nKey is the number of bytes of data
1863** in nKey. The comparison function specified when the cursor was
1864** created is used to compare keys.
1865**
drh5e2f8b92001-05-28 00:41:15 +00001866** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001867** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001868** were present. The cursor might point to an entry that comes
1869** before or after the key.
1870**
drhbd03cae2001-06-02 02:40:57 +00001871** The result of comparing the key with the entry to which the
1872** cursor is left pointing is stored in pCur->iMatch. The same
1873** value is also written to *pRes if pRes!=NULL. The meaning of
1874** this value is as follows:
1875**
1876** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001877** is smaller than pKey or if the table is empty
1878** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001879**
1880** *pRes==0 The cursor is left pointing at an entry that
1881** exactly matches pKey.
1882**
1883** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001884** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001885*/
drh3aac2dd2004-04-26 14:10:20 +00001886int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, u64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001887 int rc;
drhc39e0002004-05-07 23:50:57 +00001888
1889 if( pCur->status ){
1890 return pCur->status;
1891 }
drh5e2f8b92001-05-28 00:41:15 +00001892 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001893 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001894 assert( pCur->pPage );
1895 assert( pCur->pPage->isInit );
1896 if( pCur->isValid==0 ){
1897 assert( pCur->pPage->nCell==0 );
1898 return SQLITE_OK;
1899 }
drh72f82862001-05-24 21:06:34 +00001900 for(;;){
1901 int lwr, upr;
1902 Pgno chldPg;
1903 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001904 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001905 lwr = 0;
1906 upr = pPage->nCell-1;
drhda200cc2004-05-09 11:51:38 +00001907 pageIntegrity(pPage);
drh72f82862001-05-24 21:06:34 +00001908 while( lwr<=upr ){
drh3aac2dd2004-04-26 14:10:20 +00001909 void *pCellKey;
1910 u64 nCellKey;
drh72f82862001-05-24 21:06:34 +00001911 pCur->idx = (lwr+upr)/2;
drhde647132004-05-07 17:57:49 +00001912 sqlite3BtreeKeySize(pCur, &nCellKey);
drh3aac2dd2004-04-26 14:10:20 +00001913 if( pPage->intKey ){
1914 if( nCellKey<nKey ){
1915 c = -1;
1916 }else if( nCellKey>nKey ){
1917 c = +1;
1918 }else{
1919 c = 0;
1920 }
1921 }else if( (pCellKey = sqlite3BtreeKeyFetch(pCur))!=0 ){
1922 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1923 }else{
1924 pCellKey = sqliteMalloc( nCellKey );
1925 if( pCellKey==0 ) return SQLITE_NOMEM;
1926 rc = sqlite3BtreeKey(pCur, 0, nCellKey, pCellKey);
1927 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1928 sqliteFree(pCellKey);
1929 if( rc ) return rc;
1930 }
drh72f82862001-05-24 21:06:34 +00001931 if( c==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001932 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001933 if( pRes ) *pRes = 0;
1934 return SQLITE_OK;
1935 }
1936 if( c<0 ){
1937 lwr = pCur->idx+1;
1938 }else{
1939 upr = pCur->idx-1;
1940 }
1941 }
1942 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00001943 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00001944 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00001945 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00001946 }else if( lwr>=pPage->nCell ){
1947 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh72f82862001-05-24 21:06:34 +00001948 }else{
drh3aac2dd2004-04-26 14:10:20 +00001949 chldPg = get4byte(&pPage->aCell[lwr][2]);
drh72f82862001-05-24 21:06:34 +00001950 }
1951 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001952 pCur->iMatch = c;
drhc39e0002004-05-07 23:50:57 +00001953 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001954 if( pRes ) *pRes = c;
1955 return SQLITE_OK;
1956 }
drh428ae8c2003-01-04 16:48:09 +00001957 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00001958 rc = moveToChild(pCur, chldPg);
drhc39e0002004-05-07 23:50:57 +00001959 if( rc ){
1960 return rc;
1961 }
drh72f82862001-05-24 21:06:34 +00001962 }
drhbd03cae2001-06-02 02:40:57 +00001963 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00001964}
1965
1966/*
drhc39e0002004-05-07 23:50:57 +00001967** Return TRUE if the cursor is not pointing at an entry of the table.
1968**
1969** TRUE will be returned after a call to sqlite3BtreeNext() moves
1970** past the last entry in the table or sqlite3BtreePrev() moves past
1971** the first entry. TRUE is also returned if the table is empty.
1972*/
1973int sqlite3BtreeEof(BtCursor *pCur){
1974 return pCur->isValid==0;
1975}
1976
1977/*
drhbd03cae2001-06-02 02:40:57 +00001978** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00001979** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00001980** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00001981** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00001982*/
drh3aac2dd2004-04-26 14:10:20 +00001983int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00001984 int rc;
drh8178a752003-01-05 21:41:40 +00001985 MemPage *pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001986 assert( pRes!=0 );
drhc39e0002004-05-07 23:50:57 +00001987 if( pCur->isValid==0 ){
drh8c1238a2003-01-02 14:43:55 +00001988 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00001989 return SQLITE_OK;
drhecdc7532001-09-23 02:35:53 +00001990 }
drh8178a752003-01-05 21:41:40 +00001991 assert( pPage->isInit );
drh8178a752003-01-05 21:41:40 +00001992 assert( pCur->idx<pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001993 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00001994 if( pCur->idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00001995 if( !pPage->leaf ){
1996 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+6]));
drh5e2f8b92001-05-28 00:41:15 +00001997 if( rc ) return rc;
1998 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001999 *pRes = 0;
2000 return rc;
drh72f82862001-05-24 21:06:34 +00002001 }
drh5e2f8b92001-05-28 00:41:15 +00002002 do{
drh8856d6a2004-04-29 14:42:46 +00002003 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00002004 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00002005 pCur->isValid = 0;
drh5e2f8b92001-05-28 00:41:15 +00002006 return SQLITE_OK;
2007 }
drh8178a752003-01-05 21:41:40 +00002008 moveToParent(pCur);
2009 pPage = pCur->pPage;
2010 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00002011 *pRes = 0;
drh8178a752003-01-05 21:41:40 +00002012 return SQLITE_OK;
2013 }
2014 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00002015 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00002016 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00002017 }
drh5e2f8b92001-05-28 00:41:15 +00002018 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00002019 return rc;
drh72f82862001-05-24 21:06:34 +00002020}
2021
drh3b7511c2001-05-26 13:15:44 +00002022/*
drh2dcc9aa2002-12-04 13:40:25 +00002023** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00002024** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00002025** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00002026** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00002027*/
drh3aac2dd2004-04-26 14:10:20 +00002028int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00002029 int rc;
2030 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00002031 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00002032 if( pCur->isValid==0 ){
2033 *pRes = 1;
2034 return SQLITE_OK;
2035 }
drh8178a752003-01-05 21:41:40 +00002036 pPage = pCur->pPage;
drh8178a752003-01-05 21:41:40 +00002037 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00002038 assert( pCur->idx>=0 );
drha34b6762004-05-07 13:30:42 +00002039 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00002040 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00002041 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00002042 if( rc ) return rc;
2043 rc = moveToRightmost(pCur);
2044 }else{
2045 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00002046 if( isRootPage(pPage) ){
drhc39e0002004-05-07 23:50:57 +00002047 pCur->isValid = 0;
2048 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00002049 return SQLITE_OK;
2050 }
drh8178a752003-01-05 21:41:40 +00002051 moveToParent(pCur);
2052 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00002053 }
2054 pCur->idx--;
2055 rc = SQLITE_OK;
2056 }
drh8178a752003-01-05 21:41:40 +00002057 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00002058 return rc;
2059}
2060
2061/*
drh3a4c1412004-05-09 20:40:11 +00002062** The TRACE macro will print high-level status information about the
2063** btree operation when the global variable sqlite3_btree_trace is
2064** enabled.
2065*/
2066#if SQLITE_TEST
2067# define TRACE(X) if( sqlite3_btree_trace ){ printf X; fflush(stdout); }
2068#else
2069# define TRACE(X)
2070#endif
2071int sqlite3_btree_trace=0; /* True to enable tracing */
2072
2073/*
drh3b7511c2001-05-26 13:15:44 +00002074** Allocate a new page from the database file.
2075**
drha34b6762004-05-07 13:30:42 +00002076** The new page is marked as dirty. (In other words, sqlite3pager_write()
drh3b7511c2001-05-26 13:15:44 +00002077** has already been called on the new page.) The new page has also
2078** been referenced and the calling routine is responsible for calling
drha34b6762004-05-07 13:30:42 +00002079** sqlite3pager_unref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00002080**
2081** SQLITE_OK is returned on success. Any other return value indicates
2082** an error. *ppPage and *pPgno are undefined in the event of an error.
drha34b6762004-05-07 13:30:42 +00002083** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00002084**
drh199e3cf2002-07-18 11:01:47 +00002085** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2086** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00002087** attempt to keep related pages close to each other in the database file,
2088** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00002089*/
drh199e3cf2002-07-18 11:01:47 +00002090static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drh3aac2dd2004-04-26 14:10:20 +00002091 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00002092 int rc;
drh3aac2dd2004-04-26 14:10:20 +00002093 int n; /* Number of pages on the freelist */
2094 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00002095
drh3aac2dd2004-04-26 14:10:20 +00002096 pPage1 = pBt->pPage1;
2097 n = get4byte(&pPage1->aData[36]);
2098 if( n>0 ){
drh91025292004-05-03 19:49:32 +00002099 /* There are pages on the freelist. Reuse one of those pages. */
drh3aac2dd2004-04-26 14:10:20 +00002100 MemPage *pTrunk;
drha34b6762004-05-07 13:30:42 +00002101 rc = sqlite3pager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00002102 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002103 put4byte(&pPage1->aData[36], n-1);
2104 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002105 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002106 rc = sqlite3pager_write(pTrunk->aData);
drh3b7511c2001-05-26 13:15:44 +00002107 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00002108 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002109 return rc;
2110 }
drh3aac2dd2004-04-26 14:10:20 +00002111 k = get4byte(&pTrunk->aData[4]);
2112 if( k==0 ){
2113 /* The trunk has no leaves. So extract the trunk page itself and
2114 ** use it as the newly allocated page */
drha34b6762004-05-07 13:30:42 +00002115 *pPgno = get4byte(&pPage1->aData[32]);
drh3aac2dd2004-04-26 14:10:20 +00002116 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
2117 *ppPage = pTrunk;
drh3a4c1412004-05-09 20:40:11 +00002118 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh30e58752002-03-02 20:41:57 +00002119 }else{
drh3aac2dd2004-04-26 14:10:20 +00002120 /* Extract a leaf from the trunk */
2121 int closest;
2122 unsigned char *aData = pTrunk->aData;
2123 if( nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00002124 int i, dist;
2125 closest = 0;
drh3aac2dd2004-04-26 14:10:20 +00002126 dist = get4byte(&aData[8]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002127 if( dist<0 ) dist = -dist;
drh3a4c1412004-05-09 20:40:11 +00002128 for(i=1; i<k; i++){
drh3aac2dd2004-04-26 14:10:20 +00002129 int d2 = get4byte(&aData[8+i*4]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002130 if( d2<0 ) d2 = -d2;
2131 if( d2<dist ) closest = i;
2132 }
2133 }else{
2134 closest = 0;
2135 }
drha34b6762004-05-07 13:30:42 +00002136 *pPgno = get4byte(&aData[8+closest*4]);
drh3a4c1412004-05-09 20:40:11 +00002137 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d: %d more free pages\n",
2138 *pPgno, closest+1, k, pTrunk->pgno, n-1));
drh9b171272004-05-08 02:03:22 +00002139 if( closest<k-1 ){
2140 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
2141 }
drh3a4c1412004-05-09 20:40:11 +00002142 put4byte(&aData[4], k-1);
drh3aac2dd2004-04-26 14:10:20 +00002143 rc = getPage(pBt, *pPgno, ppPage);
2144 releasePage(pTrunk);
drh30e58752002-03-02 20:41:57 +00002145 if( rc==SQLITE_OK ){
drh9b171272004-05-08 02:03:22 +00002146 sqlite3pager_dont_rollback((*ppPage)->aData);
drha34b6762004-05-07 13:30:42 +00002147 rc = sqlite3pager_write((*ppPage)->aData);
drh30e58752002-03-02 20:41:57 +00002148 }
2149 }
drh3b7511c2001-05-26 13:15:44 +00002150 }else{
drh3aac2dd2004-04-26 14:10:20 +00002151 /* There are no pages on the freelist, so create a new page at the
2152 ** end of the file */
drha34b6762004-05-07 13:30:42 +00002153 *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
drh3aac2dd2004-04-26 14:10:20 +00002154 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00002155 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002156 rc = sqlite3pager_write((*ppPage)->aData);
drh3a4c1412004-05-09 20:40:11 +00002157 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00002158 }
2159 return rc;
2160}
2161
2162/*
drh3aac2dd2004-04-26 14:10:20 +00002163** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00002164**
drha34b6762004-05-07 13:30:42 +00002165** sqlite3pager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00002166*/
drh3aac2dd2004-04-26 14:10:20 +00002167static int freePage(MemPage *pPage){
2168 Btree *pBt = pPage->pBt;
2169 MemPage *pPage1 = pBt->pPage1;
2170 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00002171
drh3aac2dd2004-04-26 14:10:20 +00002172 /* Prepare the page for freeing */
2173 assert( pPage->pgno>1 );
2174 pPage->isInit = 0;
2175 releasePage(pPage->pParent);
2176 pPage->pParent = 0;
2177
drha34b6762004-05-07 13:30:42 +00002178 /* Increment the free page count on pPage1 */
2179 rc = sqlite3pager_write(pPage1->aData);
drh3aac2dd2004-04-26 14:10:20 +00002180 if( rc ) return rc;
2181 n = get4byte(&pPage1->aData[36]);
2182 put4byte(&pPage1->aData[36], n+1);
2183
2184 if( n==0 ){
2185 /* This is the first free page */
drhda200cc2004-05-09 11:51:38 +00002186 rc = sqlite3pager_write(pPage->aData);
2187 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002188 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00002189 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002190 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002191 }else{
2192 /* Other free pages already exist. Retrive the first trunk page
2193 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00002194 MemPage *pTrunk;
2195 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002196 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002197 k = get4byte(&pTrunk->aData[4]);
2198 if( k==pBt->pageSize/4 - 8 ){
2199 /* The trunk is full. Turn the page being freed into a new
2200 ** trunk page with no leaves. */
drha34b6762004-05-07 13:30:42 +00002201 rc = sqlite3pager_write(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00002202 if( rc ) return rc;
2203 put4byte(pPage->aData, pTrunk->pgno);
2204 put4byte(&pPage->aData[4], 0);
2205 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002206 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
2207 pPage->pgno, pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002208 }else{
2209 /* Add the newly freed page as a leaf on the current trunk */
drha34b6762004-05-07 13:30:42 +00002210 rc = sqlite3pager_write(pTrunk->aData);
drh3aac2dd2004-04-26 14:10:20 +00002211 if( rc ) return rc;
2212 put4byte(&pTrunk->aData[4], k+1);
2213 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drha34b6762004-05-07 13:30:42 +00002214 sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002215 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002216 }
2217 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002218 }
drh3b7511c2001-05-26 13:15:44 +00002219 return rc;
2220}
2221
2222/*
drh3aac2dd2004-04-26 14:10:20 +00002223** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00002224*/
drh3aac2dd2004-04-26 14:10:20 +00002225static int clearCell(MemPage *pPage, unsigned char *pCell){
2226 Btree *pBt = pPage->pBt;
drha34b6762004-05-07 13:30:42 +00002227 int rc, n, nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002228 u64 nData, nKey;
2229 Pgno ovflPgno;
drh3b7511c2001-05-26 13:15:44 +00002230
drh3aac2dd2004-04-26 14:10:20 +00002231 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
drha34b6762004-05-07 13:30:42 +00002232 assert( (nData&0x000000007fffffff)==nData );
2233 nPayload = (int)nData;
drh3aac2dd2004-04-26 14:10:20 +00002234 if( !pPage->intKey ){
2235 nPayload += nKey;
drh5e2f8b92001-05-28 00:41:15 +00002236 }
drh3aac2dd2004-04-26 14:10:20 +00002237 if( nPayload<=pBt->maxLocal ){
drha34b6762004-05-07 13:30:42 +00002238 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00002239 }
2240 ovflPgno = get4byte(&pCell[n+pBt->maxLocal]);
2241 while( ovflPgno!=0 ){
2242 MemPage *pOvfl;
2243 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00002244 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002245 ovflPgno = get4byte(pOvfl->aData);
drha34b6762004-05-07 13:30:42 +00002246 rc = freePage(pOvfl);
drhbd03cae2001-06-02 02:40:57 +00002247 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002248 sqlite3pager_unref(pOvfl->aData);
drh3b7511c2001-05-26 13:15:44 +00002249 }
drh5e2f8b92001-05-28 00:41:15 +00002250 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00002251}
2252
2253/*
drh91025292004-05-03 19:49:32 +00002254** Create the byte sequence used to represent a cell on page pPage
2255** and write that byte sequence into pCell[]. Overflow pages are
2256** allocated and filled in as necessary. The calling procedure
2257** is responsible for making sure sufficient space has been allocated
2258** for pCell[].
2259**
2260** Note that pCell does not necessary need to point to the pPage->aData
2261** area. pCell might point to some temporary storage. The cell will
2262** be constructed in this temporary area then copied into pPage->aData
2263** later.
drh3b7511c2001-05-26 13:15:44 +00002264*/
2265static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00002266 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00002267 unsigned char *pCell, /* Complete text of the cell */
drh3aac2dd2004-04-26 14:10:20 +00002268 const void *pKey, u64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00002269 const void *pData,int nData, /* The data */
2270 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00002271){
drh3b7511c2001-05-26 13:15:44 +00002272 int nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002273 const void *pSrc;
drha34b6762004-05-07 13:30:42 +00002274 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00002275 int spaceLeft;
2276 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00002277 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00002278 unsigned char *pPrior;
2279 unsigned char *pPayload;
2280 Btree *pBt = pPage->pBt;
2281 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00002282 int nHeader;
drh3b7511c2001-05-26 13:15:44 +00002283
drh91025292004-05-03 19:49:32 +00002284 /* Fill in the header. */
2285 nHeader = 2;
2286 if( !pPage->leaf ){
2287 nHeader += 4;
2288 }
2289 if( !pPage->zeroData ){
2290 nHeader += putVarint(&pCell[nHeader], nData);
2291 }
2292 nHeader += putVarint(&pCell[nHeader], nKey);
2293
2294 /* Fill in the payload */
2295 if( pPage->zeroData ){
2296 nData = 0;
2297 }
drh3aac2dd2004-04-26 14:10:20 +00002298 nPayload = nData;
2299 if( pPage->intKey ){
2300 pSrc = pData;
2301 nSrc = nData;
drh91025292004-05-03 19:49:32 +00002302 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00002303 }else{
2304 nPayload += nKey;
2305 pSrc = pKey;
2306 nSrc = nKey;
2307 }
drh4b70f112004-05-02 21:12:19 +00002308 if( nPayload>pBt->maxLocal ){
2309 *pnSize = nHeader + pBt->maxLocal + 4;
2310 }else{
2311 *pnSize = nHeader + nPayload;
2312 }
drh3aac2dd2004-04-26 14:10:20 +00002313 spaceLeft = pBt->maxLocal;
2314 pPayload = &pCell[nHeader];
2315 pPrior = &pPayload[pBt->maxLocal];
drh3b7511c2001-05-26 13:15:44 +00002316
drh3b7511c2001-05-26 13:15:44 +00002317 while( nPayload>0 ){
2318 if( spaceLeft==0 ){
drh3aac2dd2004-04-26 14:10:20 +00002319 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
drh3b7511c2001-05-26 13:15:44 +00002320 if( rc ){
drh9b171272004-05-08 02:03:22 +00002321 releasePage(pToRelease);
drh3aac2dd2004-04-26 14:10:20 +00002322 clearCell(pPage, pCell);
drh3b7511c2001-05-26 13:15:44 +00002323 return rc;
2324 }
drh3aac2dd2004-04-26 14:10:20 +00002325 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00002326 releasePage(pToRelease);
2327 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00002328 pPrior = pOvfl->aData;
2329 put4byte(pPrior, 0);
2330 pPayload = &pOvfl->aData[4];
2331 spaceLeft = pBt->pageSize - 4;
drh3b7511c2001-05-26 13:15:44 +00002332 }
2333 n = nPayload;
2334 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00002335 if( n>nSrc ) n = nSrc;
2336 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00002337 nPayload -= n;
drhde647132004-05-07 17:57:49 +00002338 pPayload += n;
drh9b171272004-05-08 02:03:22 +00002339 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00002340 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00002341 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00002342 if( nSrc==0 ){
2343 nSrc = nData;
2344 pSrc = pData;
2345 }
drhdd793422001-06-28 01:54:48 +00002346 }
drh9b171272004-05-08 02:03:22 +00002347 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00002348 return SQLITE_OK;
2349}
2350
2351/*
drhbd03cae2001-06-02 02:40:57 +00002352** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00002353** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00002354** pointer in the third argument.
2355*/
drh4b70f112004-05-02 21:12:19 +00002356static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00002357 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00002358 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00002359
drhdd793422001-06-28 01:54:48 +00002360 if( pgno==0 ) return;
drh4b70f112004-05-02 21:12:19 +00002361 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00002362 aData = sqlite3pager_lookup(pBt->pPager, pgno);
drhda200cc2004-05-09 11:51:38 +00002363 if( aData ){
2364 pThis = (MemPage*)&aData[pBt->pageSize];
2365 if( pThis->isInit ){
2366 if( pThis->pParent!=pNewParent ){
2367 if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
2368 pThis->pParent = pNewParent;
2369 if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
2370 }
2371 pThis->idxParent = idx;
drhdd793422001-06-28 01:54:48 +00002372 }
drha34b6762004-05-07 13:30:42 +00002373 sqlite3pager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00002374 }
2375}
2376
2377/*
drh4b70f112004-05-02 21:12:19 +00002378** Change the pParent pointer of all children of pPage to point back
2379** to pPage.
2380**
drhbd03cae2001-06-02 02:40:57 +00002381** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00002382** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00002383**
2384** This routine gets called after you memcpy() one page into
2385** another.
2386*/
drh4b70f112004-05-02 21:12:19 +00002387static void reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002388 int i;
drh4b70f112004-05-02 21:12:19 +00002389 Btree *pBt;
2390
drha34b6762004-05-07 13:30:42 +00002391 if( pPage->leaf ) return;
drh4b70f112004-05-02 21:12:19 +00002392 pBt = pPage->pBt;
drhbd03cae2001-06-02 02:40:57 +00002393 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002394 reparentPage(pBt, get4byte(&pPage->aCell[i][2]), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002395 }
drh4b70f112004-05-02 21:12:19 +00002396 reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+6]), pPage, i);
drh428ae8c2003-01-04 16:48:09 +00002397 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002398}
2399
2400/*
2401** Remove the i-th cell from pPage. This routine effects pPage only.
2402** The cell content is not freed or deallocated. It is assumed that
2403** the cell content has been copied someplace else. This routine just
2404** removes the reference to the cell from pPage.
2405**
2406** "sz" must be the number of bytes in the cell.
2407**
drhda200cc2004-05-09 11:51:38 +00002408** Try to maintain the integrity of the linked list of cells. But if
2409** the cell being inserted does not fit on the page, this will not be
2410** possible. If the linked list is not maintained, then just update
2411** pPage->aCell[] and set the pPage->needRelink flag so that we will
2412** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002413*/
drh4b70f112004-05-02 21:12:19 +00002414static void dropCell(MemPage *pPage, int idx, int sz){
drhde647132004-05-07 17:57:49 +00002415 int j, pc;
drhda200cc2004-05-09 11:51:38 +00002416 u8 *data;
drh8c42ca92001-06-22 19:15:00 +00002417 assert( idx>=0 && idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002418 assert( sz==cellSize(pPage, pPage->aCell[idx]) );
drha34b6762004-05-07 13:30:42 +00002419 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002420 assert( pPage->aCell[idx]>=pPage->aData );
drh3a4c1412004-05-09 20:40:11 +00002421 assert( pPage->aCell[idx]<=&pPage->aData[pPage->pBt->pageSize-sz] );
drhda200cc2004-05-09 11:51:38 +00002422 data = pPage->aData;
2423 pc = Addr(pPage->aCell[idx]) - Addr(data);
drhde647132004-05-07 17:57:49 +00002424 assert( pc>pPage->hdrOffset && pc+sz<=pPage->pBt->pageSize );
2425 freeSpace(pPage, pc, sz);
drh7c717f72001-06-24 20:39:41 +00002426 for(j=idx; j<pPage->nCell-1; j++){
drh4b70f112004-05-02 21:12:19 +00002427 pPage->aCell[j] = pPage->aCell[j+1];
drh14acc042001-06-10 19:56:58 +00002428 }
2429 pPage->nCell--;
drhda200cc2004-05-09 11:51:38 +00002430 if( !pPage->isOverfull && !pPage->needRelink ){
2431 u8 *pPrev;
2432 if( idx==0 ){
2433 pPrev = &data[pPage->hdrOffset+3];
2434 }else{
2435 pPrev = pPage->aCell[idx-1];
2436 }
2437 if( idx<pPage->nCell ){
2438 pc = Addr(pPage->aCell[idx]) - Addr(data);
2439 }else{
2440 pc = 0;
2441 }
2442 put2byte(pPrev, pc);
2443 pageIntegrity(pPage);
2444 }else{
2445 pPage->needRelink = 1;
2446 }
drh428ae8c2003-01-04 16:48:09 +00002447 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002448}
2449
2450/*
2451** Insert a new cell on pPage at cell index "i". pCell points to the
2452** content of the cell.
2453**
2454** If the cell content will fit on the page, then put it there. If it
drh24cd67e2004-05-10 16:18:47 +00002455** will not fit and pTemp is not NULL, then make a copy of the content
2456** into pTemp, set pPage->aCell[i] point to pTemp, and set pPage->isOverfull.
2457** If the content will not fit and pTemp is NULL, then make pPage->aCell[i]
2458** point to pCell and set pPage->isOverfull.
drh14acc042001-06-10 19:56:58 +00002459**
drhda200cc2004-05-09 11:51:38 +00002460** Try to maintain the integrity of the linked list of cells. But if
2461** the cell being inserted does not fit on the page, this will not be
2462** possible. If the linked list is not maintained, then just update
2463** pPage->aCell[] and set the pPage->needRelink flag so that we will
2464** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002465*/
drh24cd67e2004-05-10 16:18:47 +00002466static void insertCell(
2467 MemPage *pPage, /* Page into which we are copying */
2468 int i, /* Which cell on pPage to insert after */
2469 u8 *pCell, /* Text of the new cell to insert */
2470 int sz, /* Bytes of data in pCell */
2471 u8 *pTemp /* Temp storage space for pCell, if needed */
2472){
drh14acc042001-06-10 19:56:58 +00002473 int idx, j;
2474 assert( i>=0 && i<=pPage->nCell );
drha34b6762004-05-07 13:30:42 +00002475 assert( sz==cellSize(pPage, pCell) );
2476 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002477 idx = pPage->needRelink ? 0 : allocateSpace(pPage, sz);
drh4b70f112004-05-02 21:12:19 +00002478 resizeCellArray(pPage, pPage->nCell+1);
drh14acc042001-06-10 19:56:58 +00002479 for(j=pPage->nCell; j>i; j--){
drh4b70f112004-05-02 21:12:19 +00002480 pPage->aCell[j] = pPage->aCell[j-1];
drh14acc042001-06-10 19:56:58 +00002481 }
2482 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002483 if( idx<=0 ){
2484 pPage->isOverfull = 1;
drh24cd67e2004-05-10 16:18:47 +00002485 if( pTemp ){
2486 memcpy(pTemp, pCell, sz);
2487 }else{
2488 pTemp = pCell;
2489 }
2490 pPage->aCell[i] = pTemp;
drh14acc042001-06-10 19:56:58 +00002491 }else{
drhda200cc2004-05-09 11:51:38 +00002492 u8 *data = pPage->aData;
2493 memcpy(&data[idx], pCell, sz);
2494 pPage->aCell[i] = &data[idx];
2495 }
2496 if( !pPage->isOverfull && !pPage->needRelink ){
2497 u8 *pPrev;
2498 int pc;
2499 if( i==0 ){
2500 pPrev = &pPage->aData[pPage->hdrOffset+3];
2501 }else{
2502 pPrev = pPage->aCell[i-1];
2503 }
2504 pc = get2byte(pPrev);
2505 put2byte(pPrev, idx);
2506 put2byte(pPage->aCell[i], pc);
2507 pageIntegrity(pPage);
2508 }else{
2509 pPage->needRelink = 1;
drh14acc042001-06-10 19:56:58 +00002510 }
drh428ae8c2003-01-04 16:48:09 +00002511 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002512}
2513
2514/*
2515** Rebuild the linked list of cells on a page so that the cells
drh4b70f112004-05-02 21:12:19 +00002516** occur in the order specified by the pPage->aCell[] array.
drh8c42ca92001-06-22 19:15:00 +00002517** Invoke this routine once to repair damage after one or more
2518** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002519*/
drh4b70f112004-05-02 21:12:19 +00002520static void relinkCellList(MemPage *pPage){
2521 int i, idxFrom;
drha34b6762004-05-07 13:30:42 +00002522 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002523 if( !pPage->needRelink ) return;
drh4b70f112004-05-02 21:12:19 +00002524 idxFrom = pPage->hdrOffset+3;
drh14acc042001-06-10 19:56:58 +00002525 for(i=0; i<pPage->nCell; i++){
drhde647132004-05-07 17:57:49 +00002526 int idx = Addr(pPage->aCell[i]) - Addr(pPage->aData);
drh4b70f112004-05-02 21:12:19 +00002527 assert( idx>pPage->hdrOffset && idx<pPage->pBt->pageSize );
2528 put2byte(&pPage->aData[idxFrom], idx);
2529 idxFrom = idx;
drh14acc042001-06-10 19:56:58 +00002530 }
drh4b70f112004-05-02 21:12:19 +00002531 put2byte(&pPage->aData[idxFrom], 0);
drhda200cc2004-05-09 11:51:38 +00002532 pPage->needRelink = 0;
drh14acc042001-06-10 19:56:58 +00002533}
2534
2535/*
drhc8629a12004-05-08 20:07:40 +00002536** GCC does not define the offsetof() macro so we'll have to do it
2537** ourselves.
2538*/
2539#ifndef offsetof
2540#define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
2541#endif
2542
2543/*
drh91025292004-05-03 19:49:32 +00002544** Move the content of the page at pFrom over to pTo. The pFrom->aCell[]
drh4b70f112004-05-02 21:12:19 +00002545** pointers that point into pFrom->aData[] must be adjusted to point
2546** into pTo->aData[] instead. But some pFrom->aCell[] entries might
2547** not point to pFrom->aData[]. Those are unchanged.
drh91025292004-05-03 19:49:32 +00002548**
2549** Over this operation completes, the meta data for pFrom is zeroed.
drh14acc042001-06-10 19:56:58 +00002550*/
drhda200cc2004-05-09 11:51:38 +00002551static void movePage(MemPage *pTo, MemPage *pFrom){
drh14acc042001-06-10 19:56:58 +00002552 uptr from, to;
2553 int i;
drh4b70f112004-05-02 21:12:19 +00002554 int pageSize;
2555 int ofst;
2556
2557 assert( pTo->hdrOffset==0 );
drh3a4c1412004-05-09 20:40:11 +00002558 assert( pFrom->isInit );
drh4b70f112004-05-02 21:12:19 +00002559 ofst = pFrom->hdrOffset;
drhc8629a12004-05-08 20:07:40 +00002560 pageSize = pFrom->pBt->pageSize;
drh91025292004-05-03 19:49:32 +00002561 sqliteFree(pTo->aCell);
drhc8629a12004-05-08 20:07:40 +00002562 memcpy(pTo->aData, &pFrom->aData[ofst], pageSize - ofst);
2563 memcpy(pTo, pFrom, offsetof(MemPage, aData));
2564 pFrom->isInit = 0;
2565 pFrom->aCell = 0;
drh4b70f112004-05-02 21:12:19 +00002566 assert( pTo->aData[5]<155 );
2567 pTo->aData[5] += ofst;
drh14acc042001-06-10 19:56:58 +00002568 pTo->isOverfull = pFrom->isOverfull;
drh4b70f112004-05-02 21:12:19 +00002569 to = Addr(pTo->aData);
drh91025292004-05-03 19:49:32 +00002570 from = Addr(&pFrom->aData[ofst]);
drh14acc042001-06-10 19:56:58 +00002571 for(i=0; i<pTo->nCell; i++){
drh91025292004-05-03 19:49:32 +00002572 uptr x = Addr(pTo->aCell[i]);
2573 if( x>from && x<from+pageSize-ofst ){
drh4b70f112004-05-02 21:12:19 +00002574 *((uptr*)&pTo->aCell[i]) = x + to - from;
drh14acc042001-06-10 19:56:58 +00002575 }
2576 }
drhbd03cae2001-06-02 02:40:57 +00002577}
2578
2579/*
drhc3b70572003-01-04 19:44:07 +00002580** The following parameters determine how many adjacent pages get involved
2581** in a balancing operation. NN is the number of neighbors on either side
2582** of the page that participate in the balancing operation. NB is the
2583** total number of pages that participate, including the target page and
2584** NN neighbors on either side.
2585**
2586** The minimum value of NN is 1 (of course). Increasing NN above 1
2587** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2588** in exchange for a larger degradation in INSERT and UPDATE performance.
2589** The value of NN appears to give the best results overall.
2590*/
2591#define NN 1 /* Number of neighbors on either side of pPage */
2592#define NB (NN*2+1) /* Total pages involved in the balance */
2593
2594/*
drh8b2f49b2001-06-08 00:21:52 +00002595** This routine redistributes Cells on pPage and up to two siblings
2596** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002597** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002598** though both siblings might come from one side if pPage is the first
2599** or last child of its parent. If pPage has fewer than two siblings
2600** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002601** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002602**
2603** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002604** one in an effort to keep pages between 66% and 100% full. The root page
2605** is special and is allowed to be less than 66% full. If pPage is
2606** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002607** or decreased by one, as necessary, to keep the root page from being
2608** overfull or empty.
2609**
drh4b70f112004-05-02 21:12:19 +00002610** This routine alwyas calls relinkCellList() on its input page regardless of
drh14acc042001-06-10 19:56:58 +00002611** whether or not it does any real balancing. Client routines will typically
2612** invoke insertCell() or dropCell() before calling this routine, so we
2613** need to call relinkCellList() to clean up the mess that those other
2614** routines left behind.
2615**
drh8b2f49b2001-06-08 00:21:52 +00002616** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00002617** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002618** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00002619** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00002620**
drh8c42ca92001-06-22 19:15:00 +00002621** In the course of balancing the siblings of pPage, the parent of pPage
2622** might become overfull or underfull. If that happens, then this routine
2623** is called recursively on the parent.
2624**
drh5e00f6c2001-09-13 13:46:56 +00002625** If this routine fails for any reason, it might leave the database
2626** in a corrupted state. So if this routine fails, the database should
2627** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002628*/
drh4b70f112004-05-02 21:12:19 +00002629static int balance(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00002630 MemPage *pParent; /* The parent of pPage */
drh4b70f112004-05-02 21:12:19 +00002631 Btree *pBt; /* The whole database */
drha34b6762004-05-07 13:30:42 +00002632 int nCell; /* Number of cells in aCell[] */
drh8b2f49b2001-06-08 00:21:52 +00002633 int nOld; /* Number of pages in apOld[] */
2634 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002635 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002636 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00002637 int idx; /* Index of pPage in pParent->aCell[] */
2638 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00002639 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00002640 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
2641 int usableSpace; /* Bytes in pPage beyond the header */
2642 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00002643 int subtotal; /* Subtotal of bytes in cells on one page */
drhda200cc2004-05-09 11:51:38 +00002644 MemPage *extraUnref = 0; /* Unref this page if not zero */
drhc3b70572003-01-04 19:44:07 +00002645 MemPage *apOld[NB]; /* pPage and up to two siblings */
2646 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00002647 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drhc3b70572003-01-04 19:44:07 +00002648 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2649 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2650 int idxDiv[NB]; /* Indices of divider cells in pParent */
drh4b70f112004-05-02 21:12:19 +00002651 u8 *apDiv[NB]; /* Divider cells in pParent */
2652 u8 aTemp[NB][MX_CELL_SIZE]; /* Temporary holding area for apDiv[] */
drh24cd67e2004-05-10 16:18:47 +00002653 u8 aInsBuf[NB][MX_CELL_SIZE];/* Space to hold dividers cells during insert */
drha34b6762004-05-07 13:30:42 +00002654 int cntNew[NB+1]; /* Index in aCell[] of cell after i-th page */
drhc3b70572003-01-04 19:44:07 +00002655 int szNew[NB+1]; /* Combined size of cells place on i-th page */
drh4b70f112004-05-02 21:12:19 +00002656 u8 *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
drhc3b70572003-01-04 19:44:07 +00002657 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh4b70f112004-05-02 21:12:19 +00002658 u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)]; /* Space for apCopy[] */
drh8b2f49b2001-06-08 00:21:52 +00002659
drh14acc042001-06-10 19:56:58 +00002660 /*
2661 ** Return without doing any work if pPage is neither overfull nor
2662 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002663 */
drh3a4c1412004-05-09 20:40:11 +00002664 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00002665 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002666 pBt = pPage->pBt;
2667 if( !pPage->isOverfull && pPage->nFree<pBt->pageSize/2 && pPage->nCell>=2){
2668 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002669 return SQLITE_OK;
2670 }
2671
2672 /*
drh4b70f112004-05-02 21:12:19 +00002673 ** Find the parent of the page to be balanced. If there is no parent,
2674 ** it means this page is the root page and special rules apply.
drh8b2f49b2001-06-08 00:21:52 +00002675 */
drh14acc042001-06-10 19:56:58 +00002676 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002677 if( pParent==0 ){
2678 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002679 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002680 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002681 if( pPage->nCell==0 ){
drh8856d6a2004-04-29 14:42:46 +00002682 if( pPage->leaf ){
2683 /* The table is completely empty */
2684 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002685 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
drh8856d6a2004-04-29 14:42:46 +00002686 }else{
2687 /* The root page is empty but has one child. Transfer the
2688 ** information from that one child into the root page if it
drh3a4c1412004-05-09 20:40:11 +00002689 ** will fit. This reduces the depth of the tree by one.
drh8856d6a2004-04-29 14:42:46 +00002690 **
2691 ** If the root page is page 1, it has less space available than
drh4b70f112004-05-02 21:12:19 +00002692 ** its child (due to the 100 byte header that occurs at the beginning
2693 ** of the database fle), so it might not be able to hold all of the
2694 ** information currently contained in the child. If this is the
2695 ** case, then do not do the transfer. Leave page 1 empty except
2696 ** for the right-pointer to the child page. The child page becomes
2697 ** the virtual root of the tree.
drh8b2f49b2001-06-08 00:21:52 +00002698 */
drha34b6762004-05-07 13:30:42 +00002699 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+6]);
2700 assert( pgnoChild>0 && pgnoChild<=sqlite3pager_pagecount(pBt->pPager) );
drh8856d6a2004-04-29 14:42:46 +00002701 rc = getPage(pBt, pgnoChild, &pChild);
drh8b2f49b2001-06-08 00:21:52 +00002702 if( rc ) return rc;
drh8856d6a2004-04-29 14:42:46 +00002703 if( pPage->pgno==1 ){
drh4b70f112004-05-02 21:12:19 +00002704 rc = initPage(pChild, pPage);
drh8856d6a2004-04-29 14:42:46 +00002705 if( rc ) return rc;
2706 if( pChild->nFree>=100 ){
drh4b70f112004-05-02 21:12:19 +00002707 /* The child information will fit on the root page, so do the
2708 ** copy */
2709 zeroPage(pPage, pChild->aData[0]);
2710 resizeCellArray(pPage, pChild->nCell);
2711 for(i=0; i<pChild->nCell; i++){
2712 insertCell(pPage, i, pChild->aCell[i],
drh24cd67e2004-05-10 16:18:47 +00002713 cellSize(pChild, pChild->aCell[i]), 0);
drh4b70f112004-05-02 21:12:19 +00002714 }
2715 freePage(pChild);
drhda200cc2004-05-09 11:51:38 +00002716 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
drh4b70f112004-05-02 21:12:19 +00002717 }else{
2718 /* The child has more information that will fit on the root.
2719 ** The tree is already balanced. Do nothing. */
drhda200cc2004-05-09 11:51:38 +00002720 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
drh8856d6a2004-04-29 14:42:46 +00002721 }
2722 }else{
drh3a4c1412004-05-09 20:40:11 +00002723 memcpy(pPage->aData, pChild->aData, pBt->pageSize);
drh8856d6a2004-04-29 14:42:46 +00002724 pPage->isInit = 0;
drh4b70f112004-05-02 21:12:19 +00002725 pPage->pParent = 0;
2726 rc = initPage(pPage, 0);
drh8856d6a2004-04-29 14:42:46 +00002727 assert( rc==SQLITE_OK );
drh4b70f112004-05-02 21:12:19 +00002728 freePage(pChild);
drh3a4c1412004-05-09 20:40:11 +00002729 TRACE(("BALANCE: transfer child %d into root %d\n",
2730 pChild->pgno, pPage->pgno));
drh5edc3122001-09-13 21:53:09 +00002731 }
drh4b70f112004-05-02 21:12:19 +00002732 reparentChildPages(pPage);
2733 releasePage(pChild);
drh8b2f49b2001-06-08 00:21:52 +00002734 }
2735 return SQLITE_OK;
2736 }
drh14acc042001-06-10 19:56:58 +00002737 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002738 /* It is OK for the root page to be less than half full.
2739 */
drha34b6762004-05-07 13:30:42 +00002740 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002741 TRACE(("BALANCE: root page %d is low - no changes\n", pPage->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002742 return SQLITE_OK;
2743 }
drh14acc042001-06-10 19:56:58 +00002744 /*
2745 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002746 ** When this happens, Create a new child page and copy the
2747 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002748 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002749 ** child. Then fall thru to the code below which will cause
2750 ** the overfull child page to be split.
2751 */
drh4b70f112004-05-02 21:12:19 +00002752 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
drh14acc042001-06-10 19:56:58 +00002753 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002754 assert( sqlite3pager_iswriteable(pChild->aData) );
drhda200cc2004-05-09 11:51:38 +00002755 movePage(pChild, pPage);
drhc8629a12004-05-08 20:07:40 +00002756 assert( pChild->aData[0]==pPage->aData[pPage->hdrOffset] );
drh14acc042001-06-10 19:56:58 +00002757 pChild->pParent = pPage;
drh457f5012004-05-09 01:35:05 +00002758 sqlite3pager_ref(pPage->aData);
drhda200cc2004-05-09 11:51:38 +00002759 pChild->idxParent = 0;
drh14acc042001-06-10 19:56:58 +00002760 pChild->isOverfull = 1;
drhc8629a12004-05-08 20:07:40 +00002761 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
drh4b70f112004-05-02 21:12:19 +00002762 put4byte(&pPage->aData[pPage->hdrOffset+6], pChild->pgno);
drh8b2f49b2001-06-08 00:21:52 +00002763 pParent = pPage;
2764 pPage = pChild;
drhda200cc2004-05-09 11:51:38 +00002765 extraUnref = pChild;
drh3a4c1412004-05-09 20:40:11 +00002766 TRACE(("BALANCE: copy root %d into %d and balance %d\n",
2767 pParent->pgno, pPage->pgno, pPage->pgno));
2768 }else{
2769 TRACE(("BALANCE: begin page %d child of %d\n",
2770 pPage->pgno, pParent->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002771 }
drha34b6762004-05-07 13:30:42 +00002772 rc = sqlite3pager_write(pParent->aData);
drh6019e162001-07-02 17:51:45 +00002773 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002774 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002775
drh8b2f49b2001-06-08 00:21:52 +00002776 /*
drh4b70f112004-05-02 21:12:19 +00002777 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00002778 ** to pPage. The "idx" variable is the index of that cell. If pPage
2779 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002780 */
drhbb49aba2003-01-04 18:53:27 +00002781 if( pParent->idxShift ){
drha34b6762004-05-07 13:30:42 +00002782 Pgno pgno;
drh4b70f112004-05-02 21:12:19 +00002783 pgno = pPage->pgno;
drha34b6762004-05-07 13:30:42 +00002784 assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00002785 for(idx=0; idx<pParent->nCell; idx++){
drha34b6762004-05-07 13:30:42 +00002786 if( get4byte(&pParent->aCell[idx][2])==pgno ){
drhbb49aba2003-01-04 18:53:27 +00002787 break;
2788 }
drh8b2f49b2001-06-08 00:21:52 +00002789 }
drh4b70f112004-05-02 21:12:19 +00002790 assert( idx<pParent->nCell
2791 || get4byte(&pParent->aData[pParent->hdrOffset+6])==pgno );
drhbb49aba2003-01-04 18:53:27 +00002792 }else{
2793 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002794 }
drh8b2f49b2001-06-08 00:21:52 +00002795
2796 /*
drh14acc042001-06-10 19:56:58 +00002797 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002798 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002799 */
drh14acc042001-06-10 19:56:58 +00002800 nOld = nNew = 0;
drha34b6762004-05-07 13:30:42 +00002801 sqlite3pager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00002802
2803 /*
drh4b70f112004-05-02 21:12:19 +00002804 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002805 ** the siblings. An attempt is made to find NN siblings on either
2806 ** side of pPage. More siblings are taken from one side, however, if
2807 ** pPage there are fewer than NN siblings on the other side. If pParent
2808 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002809 */
drhc3b70572003-01-04 19:44:07 +00002810 nxDiv = idx - NN;
2811 if( nxDiv + NB > pParent->nCell ){
2812 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002813 }
drhc3b70572003-01-04 19:44:07 +00002814 if( nxDiv<0 ){
2815 nxDiv = 0;
2816 }
drh8b2f49b2001-06-08 00:21:52 +00002817 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002818 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002819 if( k<pParent->nCell ){
2820 idxDiv[i] = k;
drh4b70f112004-05-02 21:12:19 +00002821 apDiv[i] = pParent->aCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002822 nDiv++;
drha34b6762004-05-07 13:30:42 +00002823 assert( !pParent->leaf );
2824 pgnoOld[i] = get4byte(&apDiv[i][2]);
drh14acc042001-06-10 19:56:58 +00002825 }else if( k==pParent->nCell ){
drh4b70f112004-05-02 21:12:19 +00002826 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+6]);
drh14acc042001-06-10 19:56:58 +00002827 }else{
2828 break;
drh8b2f49b2001-06-08 00:21:52 +00002829 }
drhde647132004-05-07 17:57:49 +00002830 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002831 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002832 apOld[i]->idxParent = k;
drh91025292004-05-03 19:49:32 +00002833 apCopy[i] = 0;
2834 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00002835 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002836 }
2837
2838 /*
drh14acc042001-06-10 19:56:58 +00002839 ** Make copies of the content of pPage and its siblings into aOld[].
2840 ** The rest of this function will use data from the copies rather
2841 ** that the original pages since the original pages will be in the
2842 ** process of being overwritten.
2843 */
2844 for(i=0; i<nOld; i++){
drhc8629a12004-05-08 20:07:40 +00002845 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
2846 p->aData = &((u8*)p)[-pBt->pageSize];
drhda200cc2004-05-09 11:51:38 +00002847 p->aCell = 0;
2848 p->hdrOffset = 0;
2849 movePage(p, apOld[i]);
drh14acc042001-06-10 19:56:58 +00002850 }
2851
2852 /*
2853 ** Load pointers to all cells on sibling pages and the divider cells
2854 ** into the local apCell[] array. Make copies of the divider cells
2855 ** into aTemp[] and remove the the divider Cells from pParent.
drh4b70f112004-05-02 21:12:19 +00002856 **
2857 ** If the siblings are on leaf pages, then the child pointers of the
2858 ** divider cells are stripped from the cells before they are copied
2859 ** into aTemp[]. In this wall, all cells in apCell[] are without
2860 ** child pointers. If siblings are not leaves, then all cell in
2861 ** apCell[] include child pointers. Either way, all cells in apCell[]
2862 ** are alike.
drh8b2f49b2001-06-08 00:21:52 +00002863 */
2864 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00002865 leafCorrection = pPage->leaf*4;
drh8b2f49b2001-06-08 00:21:52 +00002866 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002867 MemPage *pOld = apCopy[i];
drh8b2f49b2001-06-08 00:21:52 +00002868 for(j=0; j<pOld->nCell; j++){
drh4b70f112004-05-02 21:12:19 +00002869 apCell[nCell] = pOld->aCell[j];
2870 szCell[nCell] = cellSize(pOld, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002871 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002872 }
2873 if( i<nOld-1 ){
drhda200cc2004-05-09 11:51:38 +00002874 szCell[nCell] = cellSize(pParent, apDiv[i]);
2875 memcpy(aTemp[i], apDiv[i], szCell[nCell]);
drh4b70f112004-05-02 21:12:19 +00002876 apCell[nCell] = &aTemp[i][leafCorrection];
2877 dropCell(pParent, nxDiv, szCell[nCell]);
drhda200cc2004-05-09 11:51:38 +00002878 szCell[nCell] -= leafCorrection;
2879 assert( get4byte(&aTemp[i][2])==pgnoOld[i] );
drh4b70f112004-05-02 21:12:19 +00002880 if( !pOld->leaf ){
2881 assert( leafCorrection==0 );
2882 /* The right pointer of the child page pOld becomes the left
2883 ** pointer of the divider cell */
2884 memcpy(&apCell[nCell][2], &pOld->aData[pOld->hdrOffset+6], 4);
2885 }else{
2886 assert( leafCorrection==4 );
2887 }
drh14acc042001-06-10 19:56:58 +00002888 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002889 }
2890 }
2891
2892 /*
drh6019e162001-07-02 17:51:45 +00002893 ** Figure out the number of pages needed to hold all nCell cells.
2894 ** Store this number in "k". Also compute szNew[] which is the total
2895 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00002896 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00002897 ** cntNew[k] should equal nCell.
2898 **
2899 ** This little patch of code is critical for keeping the tree
2900 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00002901 */
drh4b70f112004-05-02 21:12:19 +00002902 usableSpace = pBt->pageSize - 10 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00002903 for(subtotal=k=i=0; i<nCell; i++){
2904 subtotal += szCell[i];
drh4b70f112004-05-02 21:12:19 +00002905 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00002906 szNew[k] = subtotal - szCell[i];
2907 cntNew[k] = i;
2908 subtotal = 0;
2909 k++;
2910 }
2911 }
2912 szNew[k] = subtotal;
2913 cntNew[k] = nCell;
2914 k++;
2915 for(i=k-1; i>0; i--){
drh4b70f112004-05-02 21:12:19 +00002916 while( szNew[i]<usableSpace/2 ){
drh6019e162001-07-02 17:51:45 +00002917 cntNew[i-1]--;
2918 assert( cntNew[i-1]>0 );
2919 szNew[i] += szCell[cntNew[i-1]];
2920 szNew[i-1] -= szCell[cntNew[i-1]-1];
2921 }
2922 }
2923 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00002924
2925 /*
drh6b308672002-07-08 02:16:37 +00002926 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00002927 */
drh4b70f112004-05-02 21:12:19 +00002928 assert( pPage->pgno>1 );
2929 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00002930 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00002931 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00002932 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00002933 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00002934 pgnoNew[i] = pgnoOld[i];
2935 apOld[i] = 0;
drhda200cc2004-05-09 11:51:38 +00002936 sqlite3pager_write(pNew->aData);
drh6b308672002-07-08 02:16:37 +00002937 }else{
drhda200cc2004-05-09 11:51:38 +00002938 rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00002939 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00002940 apNew[i] = pNew;
drh6b308672002-07-08 02:16:37 +00002941 }
drh14acc042001-06-10 19:56:58 +00002942 nNew++;
drhda200cc2004-05-09 11:51:38 +00002943 zeroPage(pNew, pageFlags);
drh8b2f49b2001-06-08 00:21:52 +00002944 }
2945
drh6b308672002-07-08 02:16:37 +00002946 /* Free any old pages that were not reused as new pages.
2947 */
2948 while( i<nOld ){
drh4b70f112004-05-02 21:12:19 +00002949 rc = freePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002950 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00002951 releasePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002952 apOld[i] = 0;
2953 i++;
2954 }
2955
drh8b2f49b2001-06-08 00:21:52 +00002956 /*
drhf9ffac92002-03-02 19:00:31 +00002957 ** Put the new pages in accending order. This helps to
2958 ** keep entries in the disk file in order so that a scan
2959 ** of the table is a linear scan through the file. That
2960 ** in turn helps the operating system to deliver pages
2961 ** from the disk more rapidly.
2962 **
2963 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00002964 ** n is never more than NB (a small constant), that should
2965 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00002966 **
drhc3b70572003-01-04 19:44:07 +00002967 ** When NB==3, this one optimization makes the database
2968 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00002969 */
2970 for(i=0; i<k-1; i++){
2971 int minV = pgnoNew[i];
2972 int minI = i;
2973 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00002974 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00002975 minI = j;
2976 minV = pgnoNew[j];
2977 }
2978 }
2979 if( minI>i ){
2980 int t;
2981 MemPage *pT;
2982 t = pgnoNew[i];
2983 pT = apNew[i];
2984 pgnoNew[i] = pgnoNew[minI];
2985 apNew[i] = apNew[minI];
2986 pgnoNew[minI] = t;
2987 apNew[minI] = pT;
2988 }
2989 }
drh24cd67e2004-05-10 16:18:47 +00002990 TRACE(("BALANCE: old: %d %d %d new: %d %d %d %d\n",
2991 pgnoOld[0],
2992 nOld>=2 ? pgnoOld[1] : 0,
2993 nOld>=3 ? pgnoOld[2] : 0,
2994 pgnoNew[0],
2995 nNew>=2 ? pgnoNew[1] : 0,
2996 nNew>=3 ? pgnoNew[2] : 0,
2997 nNew>=4 ? pgnoNew[3] : 0));
2998
drhf9ffac92002-03-02 19:00:31 +00002999
3000 /*
drh14acc042001-06-10 19:56:58 +00003001 ** Evenly distribute the data in apCell[] across the new pages.
3002 ** Insert divider cells into pParent as necessary.
3003 */
3004 j = 0;
3005 for(i=0; i<nNew; i++){
3006 MemPage *pNew = apNew[i];
drh4b70f112004-05-02 21:12:19 +00003007 assert( pNew->pgno==pgnoNew[i] );
3008 resizeCellArray(pNew, cntNew[i] - j);
drh6019e162001-07-02 17:51:45 +00003009 while( j<cntNew[i] ){
3010 assert( pNew->nFree>=szCell[j] );
drh24cd67e2004-05-10 16:18:47 +00003011 insertCell(pNew, pNew->nCell, apCell[j], szCell[j], 0);
drh14acc042001-06-10 19:56:58 +00003012 j++;
3013 }
drh6019e162001-07-02 17:51:45 +00003014 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00003015 assert( !pNew->isOverfull );
drh4b70f112004-05-02 21:12:19 +00003016 relinkCellList(pNew);
drh14acc042001-06-10 19:56:58 +00003017 if( i<nNew-1 && j<nCell ){
drh4b70f112004-05-02 21:12:19 +00003018 u8 *pCell = apCell[j];
drh24cd67e2004-05-10 16:18:47 +00003019 u8 *pTemp;
drh4b70f112004-05-02 21:12:19 +00003020 if( !pNew->leaf ){
drh24cd67e2004-05-10 16:18:47 +00003021 memcpy(&pNew->aData[6], pCell+2, 4);
3022 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00003023 }else{
3024 pCell -= 4;
drh24cd67e2004-05-10 16:18:47 +00003025 pTemp = aInsBuf[i];
drh4b70f112004-05-02 21:12:19 +00003026 }
drh24cd67e2004-05-10 16:18:47 +00003027 insertCell(pParent, nxDiv, pCell, szCell[j]+leafCorrection, pTemp);
drh4b70f112004-05-02 21:12:19 +00003028 put4byte(&pParent->aCell[nxDiv][2], pNew->pgno);
drh14acc042001-06-10 19:56:58 +00003029 j++;
3030 nxDiv++;
3031 }
3032 }
drh6019e162001-07-02 17:51:45 +00003033 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00003034 if( (pageFlags & PTF_LEAF)==0 ){
3035 memcpy(&apNew[nNew-1]->aData[6], &apCopy[nOld-1]->aData[6], 4);
drh14acc042001-06-10 19:56:58 +00003036 }
drh4b70f112004-05-02 21:12:19 +00003037 if( nxDiv==pParent->nCell ){
3038 /* Right-most sibling is the right-most child of pParent */
3039 put4byte(&pParent->aData[pParent->hdrOffset+6], pgnoNew[nNew-1]);
3040 }else{
3041 /* Right-most sibling is the left child of the first entry in pParent
3042 ** past the right-most divider entry */
drha34b6762004-05-07 13:30:42 +00003043 put4byte(&pParent->aCell[nxDiv][2], pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00003044 }
3045
3046 /*
3047 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00003048 */
3049 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00003050 reparentChildPages(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003051 }
drh4b70f112004-05-02 21:12:19 +00003052 reparentChildPages(pParent);
drh8b2f49b2001-06-08 00:21:52 +00003053
3054 /*
drh3a4c1412004-05-09 20:40:11 +00003055 ** Balance the parent page. Note that the current page (pPage) might
3056 ** have been added to the freelist is it might no longer be initialized.
3057 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00003058 */
drhda200cc2004-05-09 11:51:38 +00003059 assert( pParent->isInit );
drh3a4c1412004-05-09 20:40:11 +00003060 /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
3061 /* pageIntegrity(pPage); // No! pPage might have been added to freelist */
drh4b70f112004-05-02 21:12:19 +00003062 rc = balance(pParent);
drhda200cc2004-05-09 11:51:38 +00003063
drh8b2f49b2001-06-08 00:21:52 +00003064 /*
drh14acc042001-06-10 19:56:58 +00003065 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00003066 */
drh14acc042001-06-10 19:56:58 +00003067balance_cleanup:
drh8b2f49b2001-06-08 00:21:52 +00003068 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00003069 releasePage(apOld[i]);
3070 if( apCopy[i] ){
drh91025292004-05-03 19:49:32 +00003071 sqliteFree(apCopy[i]->aCell);
3072 }
drh8b2f49b2001-06-08 00:21:52 +00003073 }
drh14acc042001-06-10 19:56:58 +00003074 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00003075 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003076 }
drh91025292004-05-03 19:49:32 +00003077 releasePage(pParent);
drhda200cc2004-05-09 11:51:38 +00003078 releasePage(extraUnref);
drh3a4c1412004-05-09 20:40:11 +00003079 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
3080 pPage->pgno, nOld, nNew, nCell));
drh8b2f49b2001-06-08 00:21:52 +00003081 return rc;
3082}
3083
3084/*
drhf74b8d92002-09-01 23:20:45 +00003085** This routine checks all cursors that point to the same table
3086** as pCur points to. If any of those cursors were opened with
3087** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
3088** cursors point to the same table were opened with wrFlag==1
3089** then this routine returns SQLITE_OK.
3090**
3091** In addition to checking for read-locks (where a read-lock
3092** means a cursor opened with wrFlag==0) this routine also moves
3093** all cursors other than pCur so that they are pointing to the
3094** first Cell on root page. This is necessary because an insert
3095** or delete might change the number of cells on a page or delete
3096** a page entirely and we do not want to leave any cursors
3097** pointing to non-existant pages or cells.
3098*/
3099static int checkReadLocks(BtCursor *pCur){
3100 BtCursor *p;
3101 assert( pCur->wrFlag );
3102 for(p=pCur->pShared; p!=pCur; p=p->pShared){
3103 assert( p );
3104 assert( p->pgnoRoot==pCur->pgnoRoot );
drha34b6762004-05-07 13:30:42 +00003105 assert( p->pPage->pgno==sqlite3pager_pagenumber(p->pPage->aData) );
drhf74b8d92002-09-01 23:20:45 +00003106 if( p->wrFlag==0 ) return SQLITE_LOCKED;
drh91025292004-05-03 19:49:32 +00003107 if( p->pPage->pgno!=p->pgnoRoot ){
drhf74b8d92002-09-01 23:20:45 +00003108 moveToRoot(p);
3109 }
3110 }
3111 return SQLITE_OK;
3112}
3113
3114/*
drh3b7511c2001-05-26 13:15:44 +00003115** Insert a new record into the BTree. The key is given by (pKey,nKey)
3116** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00003117** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00003118** is left pointing at a random location.
3119**
3120** For an INTKEY table, only the nKey value of the key is used. pKey is
3121** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00003122*/
drh3aac2dd2004-04-26 14:10:20 +00003123int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00003124 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4b70f112004-05-02 21:12:19 +00003125 const void *pKey, u64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00003126 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00003127){
drh3b7511c2001-05-26 13:15:44 +00003128 int rc;
3129 int loc;
drh14acc042001-06-10 19:56:58 +00003130 int szNew;
drh3b7511c2001-05-26 13:15:44 +00003131 MemPage *pPage;
3132 Btree *pBt = pCur->pBt;
drha34b6762004-05-07 13:30:42 +00003133 unsigned char *oldCell;
3134 unsigned char newCell[MX_CELL_SIZE];
drh3b7511c2001-05-26 13:15:44 +00003135
drhc39e0002004-05-07 23:50:57 +00003136 if( pCur->status ){
3137 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003138 }
drhf74b8d92002-09-01 23:20:45 +00003139 if( !pBt->inTrans || nKey+nData==0 ){
3140 /* Must start a transaction before doing an insert */
3141 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003142 }
drhf74b8d92002-09-01 23:20:45 +00003143 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00003144 if( !pCur->wrFlag ){
3145 return SQLITE_PERM; /* Cursor not open for writing */
3146 }
drhf74b8d92002-09-01 23:20:45 +00003147 if( checkReadLocks(pCur) ){
3148 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3149 }
drh3aac2dd2004-04-26 14:10:20 +00003150 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00003151 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00003152 pPage = pCur->pPage;
drh3a4c1412004-05-09 20:40:11 +00003153 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
3154 pCur->pgnoRoot, nKey, nData, pPage->pgno,
3155 loc==0 ? "overwrite" : "new entry"));
drh7aa128d2002-06-21 13:09:16 +00003156 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003157 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003158 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003159 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
drh3b7511c2001-05-26 13:15:44 +00003160 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003161 assert( szNew==cellSize(pPage, newCell) );
drh3a4c1412004-05-09 20:40:11 +00003162 assert( szNew<=sizeof(newCell) );
drh3b7511c2001-05-26 13:15:44 +00003163 if( loc==0 ){
drha34b6762004-05-07 13:30:42 +00003164 int szOld;
3165 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00003166 oldCell = pPage->aCell[pCur->idx];
3167 if( !pPage->leaf ){
3168 memcpy(&newCell[2], &oldCell[2], 4);
3169 }
3170 szOld = cellSize(pPage, oldCell);
3171 rc = clearCell(pPage, oldCell);
drh5e2f8b92001-05-28 00:41:15 +00003172 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003173 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00003174 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00003175 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00003176 pCur->idx++;
3177 }else{
drh4b70f112004-05-02 21:12:19 +00003178 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00003179 }
drh24cd67e2004-05-10 16:18:47 +00003180 insertCell(pPage, pCur->idx, newCell, szNew, 0);
drh4b70f112004-05-02 21:12:19 +00003181 rc = balance(pPage);
drh23e11ca2004-05-04 17:27:28 +00003182 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
drh3fc190c2001-09-14 03:24:23 +00003183 /* fflush(stdout); */
drh4b70f112004-05-02 21:12:19 +00003184 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003185 return rc;
3186}
3187
3188/*
drh4b70f112004-05-02 21:12:19 +00003189** Delete the entry that the cursor is pointing to. The cursor
3190** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00003191*/
drh3aac2dd2004-04-26 14:10:20 +00003192int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00003193 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00003194 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00003195 int rc;
drh8c42ca92001-06-22 19:15:00 +00003196 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00003197 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00003198
drh7aa128d2002-06-21 13:09:16 +00003199 assert( pPage->isInit );
drhc39e0002004-05-07 23:50:57 +00003200 if( pCur->status ){
3201 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003202 }
drhf74b8d92002-09-01 23:20:45 +00003203 if( !pBt->inTrans ){
3204 /* Must start a transaction before doing a delete */
3205 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003206 }
drhf74b8d92002-09-01 23:20:45 +00003207 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00003208 if( pCur->idx >= pPage->nCell ){
3209 return SQLITE_ERROR; /* The cursor is not pointing to anything */
3210 }
drhecdc7532001-09-23 02:35:53 +00003211 if( !pCur->wrFlag ){
3212 return SQLITE_PERM; /* Did not open this cursor for writing */
3213 }
drhf74b8d92002-09-01 23:20:45 +00003214 if( checkReadLocks(pCur) ){
3215 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3216 }
drha34b6762004-05-07 13:30:42 +00003217 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003218 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003219 pCell = pPage->aCell[pCur->idx];
3220 if( !pPage->leaf ){
3221 pgnoChild = get4byte(&pCell[2]);
3222 }
3223 clearCell(pPage, pCell);
3224 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00003225 /*
drh5e00f6c2001-09-13 13:46:56 +00003226 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00003227 ** do something we will leave a hole on an internal page.
3228 ** We have to fill the hole by moving in a cell from a leaf. The
3229 ** next Cell after the one to be deleted is guaranteed to exist and
3230 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00003231 */
drh14acc042001-06-10 19:56:58 +00003232 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00003233 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00003234 int szNext;
drh8c1238a2003-01-02 14:43:55 +00003235 int notUsed;
drh24cd67e2004-05-10 16:18:47 +00003236 unsigned char tempCell[MX_CELL_SIZE];
drh14acc042001-06-10 19:56:58 +00003237 getTempCursor(pCur, &leafCur);
drh3aac2dd2004-04-26 14:10:20 +00003238 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00003239 if( rc!=SQLITE_OK ){
drh8a6ac0a2004-02-14 17:35:07 +00003240 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3241 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003242 }
drha34b6762004-05-07 13:30:42 +00003243 rc = sqlite3pager_write(leafCur.pPage->aData);
drh6019e162001-07-02 17:51:45 +00003244 if( rc ) return rc;
drh3a4c1412004-05-09 20:40:11 +00003245 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
3246 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003247 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3248 pNext = leafCur.pPage->aCell[leafCur.idx];
3249 szNext = cellSize(leafCur.pPage, pNext);
drh24cd67e2004-05-10 16:18:47 +00003250 assert( sizeof(tempCell)>=szNext+4 );
3251 insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell);
3252 put4byte(pPage->aCell[pCur->idx]+2, pgnoChild);
drh4b70f112004-05-02 21:12:19 +00003253 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003254 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003255 dropCell(leafCur.pPage, leafCur.idx, szNext);
3256 rc = balance(leafCur.pPage);
drh8c42ca92001-06-22 19:15:00 +00003257 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00003258 }else{
drh3a4c1412004-05-09 20:40:11 +00003259 TRACE(("DELETE: table=%d delete from leaf %d\n",
3260 pCur->pgnoRoot, pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003261 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3262 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003263 }
drh4b70f112004-05-02 21:12:19 +00003264 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003265 return rc;
drh3b7511c2001-05-26 13:15:44 +00003266}
drh8b2f49b2001-06-08 00:21:52 +00003267
3268/*
drhc6b52df2002-01-04 03:09:29 +00003269** Create a new BTree table. Write into *piTable the page
3270** number for the root page of the new table.
3271**
3272** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00003273** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00003274** are restricted to having a 4-byte integer key and arbitrary data and
3275** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00003276** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00003277*/
drh3aac2dd2004-04-26 14:10:20 +00003278int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
drh8b2f49b2001-06-08 00:21:52 +00003279 MemPage *pRoot;
3280 Pgno pgnoRoot;
3281 int rc;
3282 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003283 /* Must start a transaction first */
3284 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003285 }
drh5df72a52002-06-06 23:16:05 +00003286 if( pBt->readOnly ){
3287 return SQLITE_READONLY;
3288 }
drhda200cc2004-05-09 11:51:38 +00003289 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1);
drh8b2f49b2001-06-08 00:21:52 +00003290 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003291 assert( sqlite3pager_iswriteable(pRoot->aData) );
drhde647132004-05-07 17:57:49 +00003292 zeroPage(pRoot, flags | PTF_LEAF);
drha34b6762004-05-07 13:30:42 +00003293 sqlite3pager_unref(pRoot->aData);
drh8b2f49b2001-06-08 00:21:52 +00003294 *piTable = (int)pgnoRoot;
3295 return SQLITE_OK;
3296}
3297
3298/*
3299** Erase the given database page and all its children. Return
3300** the page to the freelist.
3301*/
drh4b70f112004-05-02 21:12:19 +00003302static int clearDatabasePage(
3303 Btree *pBt, /* The BTree that contains the table */
3304 Pgno pgno, /* Page number to clear */
3305 MemPage *pParent, /* Parent page. NULL for the root */
3306 int freePageFlag /* Deallocate page if true */
3307){
drh8b2f49b2001-06-08 00:21:52 +00003308 MemPage *pPage;
3309 int rc;
drh4b70f112004-05-02 21:12:19 +00003310 unsigned char *pCell;
3311 int i;
drh8b2f49b2001-06-08 00:21:52 +00003312
drhde647132004-05-07 17:57:49 +00003313 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
drh8b2f49b2001-06-08 00:21:52 +00003314 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003315 rc = sqlite3pager_write(pPage->aData);
drh6019e162001-07-02 17:51:45 +00003316 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003317 for(i=0; i<pPage->nCell; i++){
3318 pCell = pPage->aCell[i];
3319 if( !pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003320 rc = clearDatabasePage(pBt, get4byte(&pCell[2]), pPage->pParent, 1);
drh8b2f49b2001-06-08 00:21:52 +00003321 if( rc ) return rc;
3322 }
drh4b70f112004-05-02 21:12:19 +00003323 rc = clearCell(pPage, pCell);
drh8b2f49b2001-06-08 00:21:52 +00003324 if( rc ) return rc;
3325 }
drha34b6762004-05-07 13:30:42 +00003326 if( !pPage->leaf ){
3327 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[6]), pPage->pParent, 1);
drh2aa679f2001-06-25 02:11:07 +00003328 if( rc ) return rc;
3329 }
3330 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00003331 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003332 }else{
drh3a4c1412004-05-09 20:40:11 +00003333 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00003334 }
drh4b70f112004-05-02 21:12:19 +00003335 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003336 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003337}
3338
3339/*
3340** Delete all information from a single table in the database.
3341*/
drh3aac2dd2004-04-26 14:10:20 +00003342int sqlite3BtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003343 int rc;
drhf74b8d92002-09-01 23:20:45 +00003344 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003345 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003346 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003347 }
drhf74b8d92002-09-01 23:20:45 +00003348 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3349 if( pCur->pgnoRoot==(Pgno)iTable ){
3350 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3351 moveToRoot(pCur);
3352 }
drhecdc7532001-09-23 02:35:53 +00003353 }
drha34b6762004-05-07 13:30:42 +00003354 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
drh8b2f49b2001-06-08 00:21:52 +00003355 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003356 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00003357 }
drh8c42ca92001-06-22 19:15:00 +00003358 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003359}
3360
3361/*
3362** Erase all information in a table and add the root of the table to
3363** the freelist. Except, the root of the principle table (the one on
3364** page 2) is never added to the freelist.
3365*/
drh3aac2dd2004-04-26 14:10:20 +00003366int sqlite3BtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003367 int rc;
3368 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00003369 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003370 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003371 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003372 }
drhf74b8d92002-09-01 23:20:45 +00003373 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3374 if( pCur->pgnoRoot==(Pgno)iTable ){
3375 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
3376 }
drh5df72a52002-06-06 23:16:05 +00003377 }
drha34b6762004-05-07 13:30:42 +00003378 rc = getPage(pBt, (Pgno)iTable, &pPage);
drh2aa679f2001-06-25 02:11:07 +00003379 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003380 rc = sqlite3BtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00003381 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003382 if( iTable>1 ){
drha34b6762004-05-07 13:30:42 +00003383 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003384 }else{
drha34b6762004-05-07 13:30:42 +00003385 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
drh8b2f49b2001-06-08 00:21:52 +00003386 }
drh4b70f112004-05-02 21:12:19 +00003387 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00003388 return rc;
3389}
3390
drh001bbcb2003-03-19 03:14:00 +00003391
drh8b2f49b2001-06-08 00:21:52 +00003392/*
drh23e11ca2004-05-04 17:27:28 +00003393** Read the meta-information out of a database file. Meta[0]
3394** is the number of free pages currently in the database. Meta[1]
3395** through meta[15] are available for use by higher layers.
drh8b2f49b2001-06-08 00:21:52 +00003396*/
drh3aac2dd2004-04-26 14:10:20 +00003397int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00003398 int rc;
drh4b70f112004-05-02 21:12:19 +00003399 unsigned char *pP1;
drh8b2f49b2001-06-08 00:21:52 +00003400
drh23e11ca2004-05-04 17:27:28 +00003401 assert( idx>=0 && idx<=15 );
drha34b6762004-05-07 13:30:42 +00003402 rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00003403 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003404 *pMeta = get4byte(&pP1[36 + idx*4]);
drha34b6762004-05-07 13:30:42 +00003405 sqlite3pager_unref(pP1);
drh8b2f49b2001-06-08 00:21:52 +00003406 return SQLITE_OK;
3407}
3408
3409/*
drh23e11ca2004-05-04 17:27:28 +00003410** Write meta-information back into the database. Meta[0] is
3411** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00003412*/
drh3aac2dd2004-04-26 14:10:20 +00003413int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
drh4b70f112004-05-02 21:12:19 +00003414 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00003415 int rc;
drh23e11ca2004-05-04 17:27:28 +00003416 assert( idx>=1 && idx<=15 );
drh8b2f49b2001-06-08 00:21:52 +00003417 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003418 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003419 }
drhde647132004-05-07 17:57:49 +00003420 assert( pBt->pPage1!=0 );
3421 pP1 = pBt->pPage1->aData;
drha34b6762004-05-07 13:30:42 +00003422 rc = sqlite3pager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00003423 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003424 put4byte(&pP1[36 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00003425 return SQLITE_OK;
3426}
drh8c42ca92001-06-22 19:15:00 +00003427
drh5eddca62001-06-30 21:53:53 +00003428/******************************************************************************
3429** The complete implementation of the BTree subsystem is above this line.
3430** All the code the follows is for testing and troubleshooting the BTree
3431** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003432******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003433
drh8c42ca92001-06-22 19:15:00 +00003434/*
3435** Print a disassembly of the given page on standard output. This routine
3436** is used for debugging and testing only.
3437*/
drhaaab5722002-02-19 13:39:21 +00003438#ifdef SQLITE_TEST
drh23e11ca2004-05-04 17:27:28 +00003439int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003440 int rc;
3441 MemPage *pPage;
drhc8629a12004-05-08 20:07:40 +00003442 int i, j, c;
drh8c42ca92001-06-22 19:15:00 +00003443 int nFree;
3444 u16 idx;
drhab9f7f12004-05-08 10:56:11 +00003445 int hdr;
3446 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00003447 char range[20];
3448 unsigned char payload[20];
drhab9f7f12004-05-08 10:56:11 +00003449
drh4b70f112004-05-02 21:12:19 +00003450 rc = getPage(pBt, (Pgno)pgno, &pPage);
drh8c42ca92001-06-22 19:15:00 +00003451 if( rc ){
3452 return rc;
3453 }
drhab9f7f12004-05-08 10:56:11 +00003454 hdr = pPage->hdrOffset;
3455 data = pPage->aData;
drhc8629a12004-05-08 20:07:40 +00003456 c = data[hdr];
3457 pPage->intKey = (c & PTF_INTKEY)!=0;
3458 pPage->zeroData = (c & PTF_ZERODATA)!=0;
3459 pPage->leaf = (c & PTF_LEAF)!=0;
drhda200cc2004-05-09 11:51:38 +00003460 printf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno,
3461 data[hdr], data[hdr+5],
3462 (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
drh8c42ca92001-06-22 19:15:00 +00003463 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003464 assert( hdr == (pgno==1 ? 100 : 0) );
3465 idx = get2byte(&data[hdr+3]);
drh4b70f112004-05-02 21:12:19 +00003466 while( idx>0 && idx<=pBt->pageSize ){
3467 u64 nData, nKey;
3468 int nHeader;
3469 Pgno child;
drhab9f7f12004-05-08 10:56:11 +00003470 unsigned char *pCell = &data[idx];
drh4b70f112004-05-02 21:12:19 +00003471 int sz = cellSize(pPage, pCell);
drh8c42ca92001-06-22 19:15:00 +00003472 sprintf(range,"%d..%d", idx, idx+sz-1);
drh4b70f112004-05-02 21:12:19 +00003473 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3474 if( pPage->leaf ){
3475 child = 0;
3476 }else{
3477 child = get4byte(&pCell[2]);
3478 }
drha34b6762004-05-07 13:30:42 +00003479 sz = nData;
3480 if( !pPage->intKey ) sz += nKey;
drh8c42ca92001-06-22 19:15:00 +00003481 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
drha34b6762004-05-07 13:30:42 +00003482 memcpy(payload, &pCell[nHeader], sz);
drh8c42ca92001-06-22 19:15:00 +00003483 for(j=0; j<sz; j++){
3484 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3485 }
3486 payload[sz] = 0;
3487 printf(
drha34b6762004-05-07 13:30:42 +00003488 "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4lld payload=%s\n",
3489 i, range, child, nKey, nData, payload
drh8c42ca92001-06-22 19:15:00 +00003490 );
drh4b70f112004-05-02 21:12:19 +00003491 if( pPage->isInit && pPage->aCell[i]!=pCell ){
3492 printf("**** aCell[%d] does not match on prior entry ****\n", i);
drh2aa679f2001-06-25 02:11:07 +00003493 }
drh7c717f72001-06-24 20:39:41 +00003494 i++;
drh4b70f112004-05-02 21:12:19 +00003495 idx = get2byte(pCell);
drh8c42ca92001-06-22 19:15:00 +00003496 }
3497 if( idx!=0 ){
3498 printf("ERROR: next cell index out of range: %d\n", idx);
3499 }
drh4b70f112004-05-02 21:12:19 +00003500 if( !pPage->leaf ){
drh3644f082004-05-10 18:45:09 +00003501 printf("right_child: %d\n", get4byte(&data[hdr+6]));
drh4b70f112004-05-02 21:12:19 +00003502 }
drh8c42ca92001-06-22 19:15:00 +00003503 nFree = 0;
3504 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003505 idx = get2byte(&data[hdr+1]);
drhc39e0002004-05-07 23:50:57 +00003506 while( idx>0 && idx<pPage->pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003507 int sz = get2byte(&data[idx+2]);
drh4b70f112004-05-02 21:12:19 +00003508 sprintf(range,"%d..%d", idx, idx+sz-1);
3509 nFree += sz;
drh8c42ca92001-06-22 19:15:00 +00003510 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00003511 i, range, sz, nFree);
drhab9f7f12004-05-08 10:56:11 +00003512 idx = get2byte(&data[idx]);
drh2aa679f2001-06-25 02:11:07 +00003513 i++;
drh8c42ca92001-06-22 19:15:00 +00003514 }
3515 if( idx!=0 ){
3516 printf("ERROR: next freeblock index out of range: %d\n", idx);
3517 }
drha34b6762004-05-07 13:30:42 +00003518 if( recursive && !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003519 idx = get2byte(&data[hdr+3]);
drha34b6762004-05-07 13:30:42 +00003520 while( idx>0 && idx<pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003521 unsigned char *pCell = &data[idx];
drha34b6762004-05-07 13:30:42 +00003522 sqlite3BtreePageDump(pBt, get4byte(&pCell[2]), 1);
3523 idx = get2byte(pCell);
drh6019e162001-07-02 17:51:45 +00003524 }
drhab9f7f12004-05-08 10:56:11 +00003525 sqlite3BtreePageDump(pBt, get4byte(&data[hdr+6]), 1);
drh6019e162001-07-02 17:51:45 +00003526 }
drhab9f7f12004-05-08 10:56:11 +00003527 sqlite3pager_unref(data);
drh3644f082004-05-10 18:45:09 +00003528 fflush(stdout);
drh8c42ca92001-06-22 19:15:00 +00003529 return SQLITE_OK;
3530}
drhaaab5722002-02-19 13:39:21 +00003531#endif
drh8c42ca92001-06-22 19:15:00 +00003532
drhaaab5722002-02-19 13:39:21 +00003533#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003534/*
drha34b6762004-05-07 13:30:42 +00003535** Return the flag byte at the beginning of the page that the cursor
3536** is currently pointing to.
3537*/
3538int sqlite3BtreeFlags(BtCursor *pCur){
3539 return pCur->pPage->aData[pCur->pPage->hdrOffset];
3540}
3541#endif
3542
3543#ifdef SQLITE_TEST
3544/*
drh2aa679f2001-06-25 02:11:07 +00003545** Fill aResult[] with information about the entry and page that the
3546** cursor is pointing to.
3547**
3548** aResult[0] = The page number
3549** aResult[1] = The entry number
3550** aResult[2] = Total number of entries on this page
3551** aResult[3] = Size of this entry
3552** aResult[4] = Number of free bytes on this page
3553** aResult[5] = Number of free blocks on the page
3554** aResult[6] = Page number of the left child of this entry
3555** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003556**
3557** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003558*/
drhda200cc2004-05-09 11:51:38 +00003559int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003560 int cnt, idx;
3561 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00003562
3563 pageIntegrity(pPage);
drh4b70f112004-05-02 21:12:19 +00003564 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003565 aResult[0] = sqlite3pager_pagenumber(pPage->aData);
drh91025292004-05-03 19:49:32 +00003566 assert( aResult[0]==pPage->pgno );
drh8c42ca92001-06-22 19:15:00 +00003567 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003568 aResult[2] = pPage->nCell;
3569 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh4b70f112004-05-02 21:12:19 +00003570 aResult[3] = cellSize(pPage, pPage->aCell[pCur->idx]);
3571 aResult[6] = pPage->leaf ? 0 : get4byte(&pPage->aCell[pCur->idx][2]);
drh2aa679f2001-06-25 02:11:07 +00003572 }else{
3573 aResult[3] = 0;
3574 aResult[6] = 0;
3575 }
3576 aResult[4] = pPage->nFree;
3577 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00003578 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhc39e0002004-05-07 23:50:57 +00003579 while( idx>0 && idx<pPage->pBt->pageSize ){
drh2aa679f2001-06-25 02:11:07 +00003580 cnt++;
drh4b70f112004-05-02 21:12:19 +00003581 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003582 }
3583 aResult[5] = cnt;
drh4b70f112004-05-02 21:12:19 +00003584 aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh8c42ca92001-06-22 19:15:00 +00003585 return SQLITE_OK;
3586}
drhaaab5722002-02-19 13:39:21 +00003587#endif
drhdd793422001-06-28 01:54:48 +00003588
drhdd793422001-06-28 01:54:48 +00003589/*
drh5eddca62001-06-30 21:53:53 +00003590** Return the pager associated with a BTree. This routine is used for
3591** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003592*/
drh3aac2dd2004-04-26 14:10:20 +00003593Pager *sqlite3BtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003594 return pBt->pPager;
3595}
drh5eddca62001-06-30 21:53:53 +00003596
3597/*
3598** This structure is passed around through all the sanity checking routines
3599** in order to keep track of some global state information.
3600*/
drhaaab5722002-02-19 13:39:21 +00003601typedef struct IntegrityCk IntegrityCk;
3602struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003603 Btree *pBt; /* The tree being checked out */
3604 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3605 int nPage; /* Number of pages in the database */
3606 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00003607 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003608};
3609
3610/*
3611** Append a message to the error message string.
3612*/
drhaaab5722002-02-19 13:39:21 +00003613static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003614 if( pCheck->zErrMsg ){
3615 char *zOld = pCheck->zErrMsg;
3616 pCheck->zErrMsg = 0;
danielk19774adee202004-05-08 08:23:19 +00003617 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003618 sqliteFree(zOld);
3619 }else{
danielk19774adee202004-05-08 08:23:19 +00003620 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003621 }
3622}
3623
3624/*
3625** Add 1 to the reference count for page iPage. If this is the second
3626** reference to the page, add an error message to pCheck->zErrMsg.
3627** Return 1 if there are 2 ore more references to the page and 0 if
3628** if this is the first reference to the page.
3629**
3630** Also check that the page number is in bounds.
3631*/
drhaaab5722002-02-19 13:39:21 +00003632static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003633 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003634 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003635 char zBuf[100];
3636 sprintf(zBuf, "invalid page number %d", iPage);
3637 checkAppendMsg(pCheck, zContext, zBuf);
3638 return 1;
3639 }
3640 if( pCheck->anRef[iPage]==1 ){
3641 char zBuf[100];
3642 sprintf(zBuf, "2nd reference to page %d", iPage);
3643 checkAppendMsg(pCheck, zContext, zBuf);
3644 return 1;
3645 }
3646 return (pCheck->anRef[iPage]++)>1;
3647}
3648
3649/*
3650** Check the integrity of the freelist or of an overflow page list.
3651** Verify that the number of pages on the list is N.
3652*/
drh30e58752002-03-02 20:41:57 +00003653static void checkList(
3654 IntegrityCk *pCheck, /* Integrity checking context */
3655 int isFreeList, /* True for a freelist. False for overflow page list */
3656 int iPage, /* Page number for first page in the list */
3657 int N, /* Expected number of pages in the list */
3658 char *zContext /* Context for error messages */
3659){
3660 int i;
drh3a4c1412004-05-09 20:40:11 +00003661 int expected = N;
3662 int iFirst = iPage;
drh5eddca62001-06-30 21:53:53 +00003663 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003664 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00003665 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00003666 if( iPage<1 ){
drh3a4c1412004-05-09 20:40:11 +00003667 sprintf(zMsg, "%d of %d pages missing from overflow list starting at %d",
3668 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00003669 checkAppendMsg(pCheck, zContext, zMsg);
3670 break;
3671 }
3672 if( checkRef(pCheck, iPage, zContext) ) break;
drha34b6762004-05-07 13:30:42 +00003673 if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
drh5eddca62001-06-30 21:53:53 +00003674 sprintf(zMsg, "failed to get page %d", iPage);
3675 checkAppendMsg(pCheck, zContext, zMsg);
3676 break;
3677 }
drh30e58752002-03-02 20:41:57 +00003678 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00003679 int n = get4byte(&pOvfl[4]);
drh0d316a42002-08-11 20:10:47 +00003680 for(i=0; i<n; i++){
drh4b70f112004-05-02 21:12:19 +00003681 checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
drh30e58752002-03-02 20:41:57 +00003682 }
drh0d316a42002-08-11 20:10:47 +00003683 N -= n;
drh30e58752002-03-02 20:41:57 +00003684 }
drh4b70f112004-05-02 21:12:19 +00003685 iPage = get4byte(pOvfl);
drha34b6762004-05-07 13:30:42 +00003686 sqlite3pager_unref(pOvfl);
drh5eddca62001-06-30 21:53:53 +00003687 }
3688}
3689
3690/*
drh1bffb9c2002-02-03 17:37:36 +00003691** Return negative if zKey1<zKey2.
3692** Return zero if zKey1==zKey2.
3693** Return positive if zKey1>zKey2.
3694*/
3695static int keyCompare(
3696 const char *zKey1, int nKey1,
3697 const char *zKey2, int nKey2
3698){
3699 int min = nKey1>nKey2 ? nKey2 : nKey1;
3700 int c = memcmp(zKey1, zKey2, min);
3701 if( c==0 ){
3702 c = nKey1 - nKey2;
3703 }
3704 return c;
3705}
3706
3707/*
drh5eddca62001-06-30 21:53:53 +00003708** Do various sanity checks on a single page of a tree. Return
3709** the tree depth. Root pages return 0. Parents of root pages
3710** return 1, and so forth.
3711**
3712** These checks are done:
3713**
3714** 1. Make sure that cells and freeblocks do not overlap
3715** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00003716** NO 2. Make sure cell keys are in order.
3717** NO 3. Make sure no key is less than or equal to zLowerBound.
3718** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00003719** 5. Check the integrity of overflow pages.
3720** 6. Recursively call checkTreePage on all children.
3721** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003722** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003723** the root of the tree.
3724*/
3725static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003726 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003727 int iPage, /* Page number of the page to check */
3728 MemPage *pParent, /* Parent page */
3729 char *zParentContext, /* Parent context */
3730 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003731 int nLower, /* Number of characters in zLowerBound */
3732 char *zUpperBound, /* All keys should be less than this, if not NULL */
3733 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003734){
3735 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00003736 int i, rc, depth, d2, pgno, cnt;
3737 int hdr;
3738 u8 *data;
drh5eddca62001-06-30 21:53:53 +00003739 char *zKey1, *zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003740 int nKey1, nKey2;
drh5eddca62001-06-30 21:53:53 +00003741 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003742 Btree *pBt;
drhda200cc2004-05-09 11:51:38 +00003743 int maxLocal, pageSize;
drh5eddca62001-06-30 21:53:53 +00003744 char zMsg[100];
3745 char zContext[100];
drhda200cc2004-05-09 11:51:38 +00003746 char hit[MX_PAGE_SIZE];
drh5eddca62001-06-30 21:53:53 +00003747
3748 /* Check that the page exists
3749 */
drh0d316a42002-08-11 20:10:47 +00003750 cur.pBt = pBt = pCheck->pBt;
drhda200cc2004-05-09 11:51:38 +00003751 maxLocal = pBt->maxLocal;
3752 pageSize = pBt->pageSize;
drh5eddca62001-06-30 21:53:53 +00003753 if( iPage==0 ) return 0;
3754 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh4b70f112004-05-02 21:12:19 +00003755 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003756 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3757 checkAppendMsg(pCheck, zContext, zMsg);
3758 return 0;
3759 }
drh4b70f112004-05-02 21:12:19 +00003760 if( (rc = initPage(pPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003761 sprintf(zMsg, "initPage() returns error code %d", rc);
3762 checkAppendMsg(pCheck, zContext, zMsg);
drh91025292004-05-03 19:49:32 +00003763 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003764 return 0;
3765 }
3766
3767 /* Check out all the cells.
3768 */
3769 depth = 0;
drh5eddca62001-06-30 21:53:53 +00003770 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003771 for(i=0; i<pPage->nCell; i++){
drhda200cc2004-05-09 11:51:38 +00003772 u8 *pCell = pPage->aCell[i];
3773 u64 nKey, nData;
3774 int sz, nHeader;
drh5eddca62001-06-30 21:53:53 +00003775
3776 /* Check payload overflow pages
3777 */
drh3a4c1412004-05-09 20:40:11 +00003778 sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
drhda200cc2004-05-09 11:51:38 +00003779 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3780 sz = nData;
3781 if( !pPage->intKey ) sz += nKey;
3782 if( sz>maxLocal ){
3783 int nPage = (sz - maxLocal + pageSize - 5)/(pageSize - 4);
3784 checkList(pCheck, 0, get4byte(&pCell[nHeader+maxLocal]),nPage,zContext);
drh5eddca62001-06-30 21:53:53 +00003785 }
3786
3787 /* Check sanity of left child page.
3788 */
drhda200cc2004-05-09 11:51:38 +00003789 if( !pPage->leaf ){
3790 pgno = get4byte(&pCell[2]);
3791 d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
3792 if( i>0 && d2!=depth ){
3793 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3794 }
3795 depth = d2;
drh5eddca62001-06-30 21:53:53 +00003796 }
drh5eddca62001-06-30 21:53:53 +00003797 }
drhda200cc2004-05-09 11:51:38 +00003798 if( !pPage->leaf ){
3799 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
3800 sprintf(zContext, "On page %d at right child: ", iPage);
3801 checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
3802 }
drh5eddca62001-06-30 21:53:53 +00003803
3804 /* Check for complete coverage of the page
3805 */
drhda200cc2004-05-09 11:51:38 +00003806 memset(hit, 0, pageSize);
3807 memset(hit, 1, pPage->hdrOffset+10-4*(pPage->leaf));
3808 data = pPage->aData;
3809 hdr = pPage->hdrOffset;
3810 for(cnt=0, i=get2byte(&data[hdr+3]); i>0 && i<pageSize && cnt<10000; cnt++){
3811 int size = cellSize(pPage, &data[i]);
drh5eddca62001-06-30 21:53:53 +00003812 int j;
drhda200cc2004-05-09 11:51:38 +00003813 for(j=i+size-1; j>=i; j--) hit[j]++;
3814 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003815 }
drhda200cc2004-05-09 11:51:38 +00003816 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<pageSize && cnt<10000; cnt++){
3817 int size = get2byte(&data[i+2]);
drh5eddca62001-06-30 21:53:53 +00003818 int j;
drhda200cc2004-05-09 11:51:38 +00003819 for(j=i+size-1; j>=i; j--) hit[j]++;
3820 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003821 }
drhda200cc2004-05-09 11:51:38 +00003822 for(i=cnt=0; i<pageSize; i++){
drh5eddca62001-06-30 21:53:53 +00003823 if( hit[i]==0 ){
drhda200cc2004-05-09 11:51:38 +00003824 cnt++;
drh5eddca62001-06-30 21:53:53 +00003825 }else if( hit[i]>1 ){
3826 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3827 checkAppendMsg(pCheck, zMsg, 0);
3828 break;
3829 }
3830 }
drhda200cc2004-05-09 11:51:38 +00003831 if( cnt!=data[hdr+5] ){
3832 sprintf(zMsg, "Fragmented space is %d byte reported as %d on page %d",
3833 cnt, data[hdr+5], iPage);
3834 checkAppendMsg(pCheck, zMsg, 0);
3835 }
drh6019e162001-07-02 17:51:45 +00003836
drh4b70f112004-05-02 21:12:19 +00003837 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00003838 return depth+1;
drh5eddca62001-06-30 21:53:53 +00003839}
3840
3841/*
3842** This routine does a complete check of the given BTree file. aRoot[] is
3843** an array of pages numbers were each page number is the root page of
3844** a table. nRoot is the number of entries in aRoot.
3845**
3846** If everything checks out, this routine returns NULL. If something is
3847** amiss, an error message is written into memory obtained from malloc()
3848** and a pointer to that error message is returned. The calling function
3849** is responsible for freeing the error message when it is done.
3850*/
drh3aac2dd2004-04-26 14:10:20 +00003851char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003852 int i;
3853 int nRef;
drhaaab5722002-02-19 13:39:21 +00003854 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003855
drha34b6762004-05-07 13:30:42 +00003856 nRef = *sqlite3pager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003857 if( lockBtree(pBt)!=SQLITE_OK ){
3858 return sqliteStrDup("Unable to acquire a read lock on the database");
3859 }
drh5eddca62001-06-30 21:53:53 +00003860 sCheck.pBt = pBt;
3861 sCheck.pPager = pBt->pPager;
drha34b6762004-05-07 13:30:42 +00003862 sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003863 if( sCheck.nPage==0 ){
3864 unlockBtreeIfUnused(pBt);
3865 return 0;
3866 }
drh8c1238a2003-01-02 14:43:55 +00003867 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drhda200cc2004-05-09 11:51:38 +00003868 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh5eddca62001-06-30 21:53:53 +00003869 sCheck.zErrMsg = 0;
3870
3871 /* Check the integrity of the freelist
3872 */
drha34b6762004-05-07 13:30:42 +00003873 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
3874 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003875
3876 /* Check all the tables.
3877 */
3878 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00003879 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00003880 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00003881 }
3882
3883 /* Make sure every page in the file is referenced
3884 */
3885 for(i=1; i<=sCheck.nPage; i++){
3886 if( sCheck.anRef[i]==0 ){
3887 char zBuf[100];
3888 sprintf(zBuf, "Page %d is never used", i);
3889 checkAppendMsg(&sCheck, zBuf, 0);
3890 }
3891 }
3892
3893 /* Make sure this analysis did not leave any unref() pages
3894 */
drh5e00f6c2001-09-13 13:46:56 +00003895 unlockBtreeIfUnused(pBt);
drha34b6762004-05-07 13:30:42 +00003896 if( nRef != *sqlite3pager_stats(pBt->pPager) ){
drh5eddca62001-06-30 21:53:53 +00003897 char zBuf[100];
3898 sprintf(zBuf,
3899 "Outstanding page count goes from %d to %d during this analysis",
drha34b6762004-05-07 13:30:42 +00003900 nRef, *sqlite3pager_stats(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00003901 );
3902 checkAppendMsg(&sCheck, zBuf, 0);
3903 }
3904
3905 /* Clean up and report errors.
3906 */
3907 sqliteFree(sCheck.anRef);
3908 return sCheck.zErrMsg;
3909}
paulb95a8862003-04-01 21:16:41 +00003910
drh73509ee2003-04-06 20:44:45 +00003911/*
3912** Return the full pathname of the underlying database file.
3913*/
drh3aac2dd2004-04-26 14:10:20 +00003914const char *sqlite3BtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00003915 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00003916 return sqlite3pager_filename(pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00003917}
3918
3919/*
drhf7c57532003-04-25 13:22:51 +00003920** Copy the complete content of pBtFrom into pBtTo. A transaction
3921** must be active for both files.
3922**
3923** The size of file pBtFrom may be reduced by this operation.
3924** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00003925*/
drh3aac2dd2004-04-26 14:10:20 +00003926int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
drhf7c57532003-04-25 13:22:51 +00003927 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00003928 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00003929
3930 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
drhf7c57532003-04-25 13:22:51 +00003931 if( pBtTo->pCursor ) return SQLITE_BUSY;
drhc39e0002004-05-07 23:50:57 +00003932 memcpy(pBtTo->pPage1, pBtFrom->pPage1, pBtFrom->pageSize);
drha34b6762004-05-07 13:30:42 +00003933 rc = sqlite3pager_overwrite(pBtTo->pPager, 1, pBtFrom->pPage1);
3934 nToPage = sqlite3pager_pagecount(pBtTo->pPager);
3935 nPage = sqlite3pager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00003936 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00003937 void *pPage;
drha34b6762004-05-07 13:30:42 +00003938 rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
drhf7c57532003-04-25 13:22:51 +00003939 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003940 rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
drh2e6d11b2003-04-25 15:37:57 +00003941 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003942 sqlite3pager_unref(pPage);
drhf7c57532003-04-25 13:22:51 +00003943 }
drh2e6d11b2003-04-25 15:37:57 +00003944 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3945 void *pPage;
drha34b6762004-05-07 13:30:42 +00003946 rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
drh2e6d11b2003-04-25 15:37:57 +00003947 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003948 rc = sqlite3pager_write(pPage);
3949 sqlite3pager_unref(pPage);
3950 sqlite3pager_dont_write(pBtTo->pPager, i);
drh2e6d11b2003-04-25 15:37:57 +00003951 }
3952 if( !rc && nPage<nToPage ){
drha34b6762004-05-07 13:30:42 +00003953 rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
drh2e6d11b2003-04-25 15:37:57 +00003954 }
drhf7c57532003-04-25 13:22:51 +00003955 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003956 sqlite3BtreeRollback(pBtTo);
drhf7c57532003-04-25 13:22:51 +00003957 }
3958 return rc;
drh73509ee2003-04-06 20:44:45 +00003959}