blob: 0e65010165e97fa62acea7e22742b53af42b4935 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drh24cd67e2004-05-10 16:18:47 +000012** $Id: btree.c,v 1.122 2004/05/10 16:18:48 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
56** page contain a special header that describes the file. The format
57** of that header is as follows:
58**
59** OFFSET SIZE DESCRIPTION
drhde647132004-05-07 17:57:49 +000060** 0 16 Header string: "SQLite format 3\000"
drh9e572e62004-04-23 23:43:10 +000061** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
64** 20 2 Bytes of unused space at the end of each page
65** 22 2 Maximum allowed local payload per entry
66** 24 8 File change counter
67** 32 4 First freelist page
68** 36 4 Number of freelist pages in the file
69** 40 60 15 4-byte meta values passed to higher layers
70**
71** All of the integer values are big-endian (most significant byte first).
72** The file change counter is incremented every time the database is changed.
73** This allows other processes to know when the file has changed and thus
74** when they need to flush their cache.
75**
76** Each btree page begins with a header described below. Note that the
77** header for page one begins at byte 100. For all other btree pages, the
78** header begins on byte zero.
79**
80** OFFSET SIZE DESCRIPTION
81** 0 1 Flags. 01: leaf, 02: zerodata, 04: intkey, F8: type
82** 1 2 byte offset to the first freeblock
83** 3 2 byte offset to the first cell
84** 5 1 number of fragmented free bytes
85** 6 4 Right child (the Ptr(N+1) value). Omitted if leaf
86**
87** The flags define the format of this btree page. The leaf flag means that
88** this page has no children. The zerodata flag means that this page carries
89** only keys and no data. The intkey flag means that the key is a single
90** variable length integer at the beginning of the payload.
91**
92** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
93** byte are used. The integer consists of all bytes that have bit 8 set and
94** the first byte with bit 8 clear. Unlike fixed-length values, variable-
95** length integers are little-endian. Examples:
96**
97** 0x00 becomes 0x00000000
98** 0x1b becomes 0x0000001b
99** 0x9b 0x4a becomes 0x00000dca
100** 0x80 0x1b becomes 0x0000001b
101** 0xf8 0xac 0xb1 0x91 0x01 becomes 0x12345678
102** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
103**
104** Variable length integers are used for rowids and to hold the number of
105** bytes of key and data in a btree cell.
106**
107** Unused space within a btree page is collected into a linked list of
108** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
109** to the first freeblock is given in the header. Freeblocks occur in
110** increasing order. Because a freeblock is 4 bytes in size, the minimum
111** size allocation on a btree page is 4 bytes. Because a freeblock must be
112** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
113** exist on the freeblock chain. The total number of such fragmented bytes
114** is recorded in the page header at offset 5.
115**
116** SIZE DESCRIPTION
117** 2 Byte offset of the next freeblock
118** 2 Bytes in this freeblock
119**
120** Cells are of variable length. The first cell begins on the byte defined
121** in the page header. Cells do not necessarily occur in order - they can
122** skip around on the page.
123**
124** SIZE DESCRIPTION
125** 2 Byte offset of the next cell. 0 if this is the last cell
drh3aac2dd2004-04-26 14:10:20 +0000126** 4 Page number of the left child. Omitted if leaf flag is set.
127** var Number of bytes of data. Omitted if the zerodata flag is set.
128** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000129** * Payload
130** 4 First page of the overflow chain. Omitted if no overflow
131**
132** Overflow pages form a linked list. Each page except the last is completely
133** filled with data (pagesize - 4 bytes). The last page can have as little
134** as 1 byte of data.
135**
136** SIZE DESCRIPTION
137** 4 Page number of next overflow page
138** * Data
139**
140** Freelist pages come in two subtypes: trunk pages and leaf pages. The
141** file header points to first in a linked list of trunk page. Each trunk
142** page points to multiple leaf pages. The content of a leaf page is
143** unspecified. A trunk page looks like this:
144**
145** SIZE DESCRIPTION
146** 4 Page number of next trunk page
147** 4 Number of leaf pointers on this page
148** * zero or more pages numbers of leaves
drha059ad02001-04-17 20:09:11 +0000149*/
150#include "sqliteInt.h"
151#include "pager.h"
152#include "btree.h"
153#include <assert.h>
154
drh4b70f112004-05-02 21:12:19 +0000155
156/* Maximum page size. The upper bound on this value is 65536 (a limit
157** imposed by the 2-byte offset at the beginning of each cell.) The
158** maximum page size determines the amount of stack space allocated
159** by many of the routines in this module. On embedded architectures
160** or any machine where memory and especially stack memory is limited,
161** one may wish to chose a smaller value for the maximum page size.
162*/
163#ifndef MX_PAGE_SIZE
164# define MX_PAGE_SIZE 1024
165#endif
166
167/* Individual entries or "cells" are limited in size so that at least
168** this many cells will fit on one page. Changing this value will result
169** in an incompatible database.
170*/
171#define MN_CELLS_PER_PAGE 4
172
173/* The following value is the maximum cell size assuming a maximum page
174** size give above.
175*/
176#define MX_CELL_SIZE ((MX_PAGE_SIZE-10)/MN_CELLS_PER_PAGE)
177
178/* The maximum number of cells on a single page of the database. This
179** assumes a minimum cell size of 3 bytes. Such small cells will be
180** exceedingly rare, but they are possible.
181*/
182#define MX_CELL ((MX_PAGE_SIZE-10)/3)
183
paulb95a8862003-04-01 21:16:41 +0000184/* Forward declarations */
drh3aac2dd2004-04-26 14:10:20 +0000185typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000186
drh8c42ca92001-06-22 19:15:00 +0000187/*
drhbd03cae2001-06-02 02:40:57 +0000188** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000189** SQLite database in order to identify the file as a real database.
drhde647132004-05-07 17:57:49 +0000190** 123456789 123456 */
191static const char zMagicHeader[] = "SQLite format 3";
drh08ed44e2001-04-29 23:32:55 +0000192
193/*
drh4b70f112004-05-02 21:12:19 +0000194** Page type flags. An ORed combination of these flags appear as the
195** first byte of every BTree page.
drh8c42ca92001-06-22 19:15:00 +0000196*/
drhde647132004-05-07 17:57:49 +0000197#define PTF_INTKEY 0x01
drh9e572e62004-04-23 23:43:10 +0000198#define PTF_ZERODATA 0x02
drhde647132004-05-07 17:57:49 +0000199#define PTF_LEAF 0x04
drh4b70f112004-05-02 21:12:19 +0000200/* Idea for the future: PTF_LEAFDATA */
drh8c42ca92001-06-22 19:15:00 +0000201
202/*
drh9e572e62004-04-23 23:43:10 +0000203** As each page of the file is loaded into memory, an instance of the following
204** structure is appended and initialized to zero. This structure stores
205** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000206**
drh72f82862001-05-24 21:06:34 +0000207** The pParent field points back to the parent page. This allows us to
208** walk up the BTree from any leaf to the root. Care must be taken to
209** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000210** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000211*/
212struct MemPage {
drhde647132004-05-07 17:57:49 +0000213 u32 notUsed;
drh3aac2dd2004-04-26 14:10:20 +0000214 u8 isInit; /* True if previously initialized */
drh9e572e62004-04-23 23:43:10 +0000215 u8 idxShift; /* True if Cell indices have changed */
drh3aac2dd2004-04-26 14:10:20 +0000216 u8 isOverfull; /* Some aCell[] do not fit on page */
drh9e572e62004-04-23 23:43:10 +0000217 u8 intKey; /* True if intkey flag is set */
218 u8 leaf; /* True if leaf flag is set */
219 u8 zeroData; /* True if zero data flag is set */
220 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
drhda200cc2004-05-09 11:51:38 +0000221 u8 needRelink; /* True if need to run relinkCellList() */
drh3aac2dd2004-04-26 14:10:20 +0000222 int idxParent; /* Index in pParent->aCell[] of this node */
drh9e572e62004-04-23 23:43:10 +0000223 int nFree; /* Number of free bytes on the page */
drh306dc212001-05-21 13:45:10 +0000224 int nCell; /* Number of entries on this page */
drh4b70f112004-05-02 21:12:19 +0000225 int nCellAlloc; /* Number of slots allocated in aCell[] */
drh9e572e62004-04-23 23:43:10 +0000226 unsigned char **aCell; /* Pointer to start of each cell */
drhc8629a12004-05-08 20:07:40 +0000227 struct Btree *pBt; /* Pointer back to BTree structure */
228
229 unsigned char *aData; /* Pointer back to the start of the page */
230 Pgno pgno; /* Page number for this page */
231 MemPage *pParent; /* The parent of this page. NULL for root */
drh8c42ca92001-06-22 19:15:00 +0000232};
drh7e3b0a02001-04-28 16:52:40 +0000233
234/*
drh3b7511c2001-05-26 13:15:44 +0000235** The in-memory image of a disk page has the auxiliary information appended
236** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
237** that extra information.
238*/
drh3aac2dd2004-04-26 14:10:20 +0000239#define EXTRA_SIZE sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000240
241/*
drha059ad02001-04-17 20:09:11 +0000242** Everything we need to know about an open database
243*/
244struct Btree {
245 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000246 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000247 MemPage *pPage1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000248 u8 inTrans; /* True if a transaction is in progress */
drh3aac2dd2004-04-26 14:10:20 +0000249 u8 inStmt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000250 u8 readOnly; /* True if the underlying file is readonly */
drh9e572e62004-04-23 23:43:10 +0000251 int pageSize; /* Number of usable bytes on each page */
252 int maxLocal; /* Maximum local payload */
drha059ad02001-04-17 20:09:11 +0000253};
254typedef Btree Bt;
255
drh365d68f2001-05-11 11:02:46 +0000256/*
257** A cursor is a pointer to a particular entry in the BTree.
258** The entry is identified by its MemPage and the index in
drha34b6762004-05-07 13:30:42 +0000259** MemPage.aCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000260*/
drh72f82862001-05-24 21:06:34 +0000261struct BtCursor {
drh5e2f8b92001-05-28 00:41:15 +0000262 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000263 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000264 BtCursor *pShared; /* Loop of cursors with the same root page */
drh3aac2dd2004-04-26 14:10:20 +0000265 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
266 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000267 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000268 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000269 int idx; /* Index of the entry in pPage->aCell[] */
drhecdc7532001-09-23 02:35:53 +0000270 u8 wrFlag; /* True if writable */
drh23e11ca2004-05-04 17:27:28 +0000271 u8 iMatch; /* compare result from last sqlite3BtreeMoveto() */
drhc39e0002004-05-07 23:50:57 +0000272 u8 isValid; /* TRUE if points to a valid entry */
273 u8 status; /* Set to SQLITE_ABORT if cursors is invalidated */
drh365d68f2001-05-11 11:02:46 +0000274};
drh7e3b0a02001-04-28 16:52:40 +0000275
drha059ad02001-04-17 20:09:11 +0000276/*
drh3aac2dd2004-04-26 14:10:20 +0000277** Read or write a two-, four-, and eight-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000278*/
drh9e572e62004-04-23 23:43:10 +0000279static u32 get2byte(unsigned char *p){
280 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000281}
drh9e572e62004-04-23 23:43:10 +0000282static u32 get4byte(unsigned char *p){
283 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
284}
drh3aac2dd2004-04-26 14:10:20 +0000285static u64 get8byte(unsigned char *p){
drh9e572e62004-04-23 23:43:10 +0000286 u64 v = get4byte(p);
287 return (v<<32) | get4byte(&p[4]);
288}
289static void put2byte(unsigned char *p, u32 v){
290 p[0] = v>>8;
291 p[1] = v;
292}
293static void put4byte(unsigned char *p, u32 v){
294 p[0] = v>>24;
295 p[1] = v>>16;
296 p[2] = v>>8;
297 p[3] = v;
298}
299static void put8byte(unsigned char *p, u64 v){
300 put4byte(&p[4], v>>32);
301 put4byte(p, v);
302}
303
304/*
305** Read a variable-length integer. Store the result in *pResult.
306** Return the number of bytes in the integer.
307*/
308static unsigned int getVarint(unsigned char *p, u64 *pResult){
309 u64 x = p[0] & 0x7f;
310 int n = 0;
311 while( (p[n++]&0x80)!=0 ){
312 x |= (p[n]&0x7f)<<(n*7);
313 }
314 *pResult = x;
315 return n;
316}
317
318/*
319** Write a variable length integer with value v into p[]. Return
320** the number of bytes written.
321*/
322static unsigned int putVarint(unsigned char *p, u64 v){
323 int i = 0;
324 do{
drhde647132004-05-07 17:57:49 +0000325 p[i++] = (v & 0x7f) | 0x80;
drh9e572e62004-04-23 23:43:10 +0000326 v >>= 7;
327 }while( v!=0 );
drhde647132004-05-07 17:57:49 +0000328 p[i-1] &= 0x7f;
drh9e572e62004-04-23 23:43:10 +0000329 return i;
drh0d316a42002-08-11 20:10:47 +0000330}
331
332/*
drh3aac2dd2004-04-26 14:10:20 +0000333** Parse a cell header and fill in the CellInfo structure.
334*/
335static void parseCellHeader(
336 MemPage *pPage, /* Page containing the cell */
337 unsigned char *pCell, /* The cell */
338 u64 *pnData, /* Number of bytes of data in payload */
339 u64 *pnKey, /* Number of bytes of key, or key value for intKey */
340 int *pnHeader /* Size of header in bytes. Offset to payload */
341){
342 int n;
343 if( pPage->leaf ){
344 n = 2;
345 }else{
346 n = 6;
347 }
348 if( pPage->zeroData ){
349 *pnData = 0;
350 }else{
351 n += getVarint(&pCell[n], pnData);
352 }
drhde647132004-05-07 17:57:49 +0000353 n += getVarint(&pCell[n], pnKey);
drh3aac2dd2004-04-26 14:10:20 +0000354 *pnHeader = n;
355}
356
357/*
drh3b7511c2001-05-26 13:15:44 +0000358** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000359** database page. The number returned includes the Cell header,
360** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000361** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000362** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000363*/
drh9e572e62004-04-23 23:43:10 +0000364static int cellSize(MemPage *pPage, unsigned char *pCell){
drh3aac2dd2004-04-26 14:10:20 +0000365 int n;
drh9e572e62004-04-23 23:43:10 +0000366 u64 nData, nKey;
drh3aac2dd2004-04-26 14:10:20 +0000367 int nPayload, maxPayload;
368
369 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
370 nPayload = (int)nData;
371 if( !pPage->intKey ){
372 nPayload += (int)nKey;
drh3b7511c2001-05-26 13:15:44 +0000373 }
drh3aac2dd2004-04-26 14:10:20 +0000374 maxPayload = pPage->pBt->maxLocal;
drh9e572e62004-04-23 23:43:10 +0000375 if( nPayload>maxPayload ){
376 nPayload = maxPayload + 4;
377 }
378 return n + nPayload;
drh3b7511c2001-05-26 13:15:44 +0000379}
380
381/*
drhda200cc2004-05-09 11:51:38 +0000382** Do sanity checking on a page. Throw an exception if anything is
383** not right.
384**
385** This routine is used for internal error checking only. It is omitted
386** from most builds.
387*/
388#if defined(BTREE_DEBUG) && !defined(NDEBUG) && 0
389static void _pageIntegrity(MemPage *pPage){
390 int pageSize;
391 u8 *data;
392 int i, idx, c, pc, hdr, nFree;
393 u8 used[MX_PAGE_SIZE];
394
395 pageSize = pPage->pBt->pageSize;
396 assert( pPage->aData==&((unsigned char*)pPage)[-pageSize] );
397 hdr = pPage->hdrOffset;
398 assert( hdr==(pPage->pgno==1 ? 100 : 0) );
399 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
400 c = pPage->aData[hdr];
401 if( pPage->isInit ){
402 assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
403 assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
404 assert( pPage->intKey == ((c & PTF_INTKEY)!=0) );
405 }
406 data = pPage->aData;
407 memset(used, 0, pageSize);
408 for(i=0; i<hdr+10-pPage->leaf*4; i++) used[i] = 1;
409 nFree = 0;
410 pc = get2byte(&data[hdr+1]);
411 while( pc ){
412 int size;
413 assert( pc>0 && pc<pageSize-4 );
414 size = get2byte(&data[pc+2]);
415 assert( pc+size<=pageSize );
416 nFree += size;
417 for(i=pc; i<pc+size; i++){
418 assert( used[i]==0 );
419 used[i] = 1;
420 }
421 pc = get2byte(&data[pc]);
422 }
423 assert( pPage->isInit==0 || pPage->nFree==nFree+data[hdr+5] );
424 idx = 0;
425 pc = get2byte(&data[hdr+3]);
426 while( pc ){
427 int size;
428 assert( pPage->isInit==0 || idx<pPage->nCell );
429 assert( pc>0 && pc<pageSize-4 );
430 assert( pPage->isInit==0 || pPage->aCell[idx]==&data[pc] );
431 size = cellSize(pPage, &data[pc]);
432 assert( pc+size<=pageSize );
433 for(i=pc; i<pc+size; i++){
434 assert( used[i]==0 );
435 used[i] = 1;
436 }
437 pc = get2byte(&data[pc]);
438 idx++;
439 }
440 assert( idx==pPage->nCell );
441 nFree = 0;
442 for(i=0; i<pageSize; i++){
443 assert( used[i]<=1 );
444 if( used[i]==0 ) nFree++;
445 }
446 assert( nFree==data[hdr+5] );
447}
448#define pageIntegrity(X) _pageIntegrity(X)
449#else
450# define pageIntegrity(X)
451#endif
452
453/*
drh72f82862001-05-24 21:06:34 +0000454** Defragment the page given. All Cells are moved to the
455** beginning of the page and all free space is collected
456** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000457*/
drh9e572e62004-04-23 23:43:10 +0000458static void defragmentPage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000459 int pc, i, n, addr;
460 int start, hdr, size;
drh9e572e62004-04-23 23:43:10 +0000461 int leftover;
462 unsigned char *oldPage;
drh23e11ca2004-05-04 17:27:28 +0000463 unsigned char newPage[MX_PAGE_SIZE];
drh2af926b2001-05-15 00:39:25 +0000464
drha34b6762004-05-07 13:30:42 +0000465 assert( sqlite3pager_iswriteable(pPage->aData) );
drh9e572e62004-04-23 23:43:10 +0000466 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000467 assert( pPage->pBt->pageSize <= MX_PAGE_SIZE );
drhda200cc2004-05-09 11:51:38 +0000468 assert( !pPage->needRelink );
469 assert( !pPage->isOverfull );
drh9e572e62004-04-23 23:43:10 +0000470 oldPage = pPage->aData;
471 hdr = pPage->hdrOffset;
drh3aac2dd2004-04-26 14:10:20 +0000472 addr = 3+hdr;
drh9e572e62004-04-23 23:43:10 +0000473 n = 6+hdr;
474 if( !pPage->leaf ){
475 n += 4;
drh2af926b2001-05-15 00:39:25 +0000476 }
drhc39e0002004-05-07 23:50:57 +0000477 memcpy(&newPage[hdr], &oldPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000478 start = n;
drh3aac2dd2004-04-26 14:10:20 +0000479 pc = get2byte(&oldPage[addr]);
drh9e572e62004-04-23 23:43:10 +0000480 i = 0;
481 while( pc>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000482 assert( n<pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000483 size = cellSize(pPage, &oldPage[pc]);
484 memcpy(&newPage[n], &oldPage[pc], size);
drh3aac2dd2004-04-26 14:10:20 +0000485 put2byte(&newPage[addr],n);
drhda200cc2004-05-09 11:51:38 +0000486 assert( pPage->aCell[i]==&oldPage[pc] );
drhc39e0002004-05-07 23:50:57 +0000487 pPage->aCell[i++] = &oldPage[n];
drhda200cc2004-05-09 11:51:38 +0000488 addr = n;
drh9e572e62004-04-23 23:43:10 +0000489 n += size;
drh9e572e62004-04-23 23:43:10 +0000490 pc = get2byte(&oldPage[pc]);
drh2aa679f2001-06-25 02:11:07 +0000491 }
drhc39e0002004-05-07 23:50:57 +0000492 assert( i==pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +0000493 leftover = pPage->pBt->pageSize - n;
drh9e572e62004-04-23 23:43:10 +0000494 assert( leftover>=0 );
495 assert( pPage->nFree==leftover );
496 if( leftover<4 ){
497 oldPage[hdr+5] = leftover;
498 leftover = 0;
drh3aac2dd2004-04-26 14:10:20 +0000499 n = pPage->pBt->pageSize;
drh9e572e62004-04-23 23:43:10 +0000500 }
drhc39e0002004-05-07 23:50:57 +0000501 memcpy(&oldPage[hdr], &newPage[hdr], n-hdr);
drh9e572e62004-04-23 23:43:10 +0000502 if( leftover==0 ){
drhc39e0002004-05-07 23:50:57 +0000503 put2byte(&oldPage[hdr+1], 0);
drh9e572e62004-04-23 23:43:10 +0000504 }else if( leftover>=4 ){
drhc39e0002004-05-07 23:50:57 +0000505 put2byte(&oldPage[hdr+1], n);
drh9e572e62004-04-23 23:43:10 +0000506 put2byte(&oldPage[n], 0);
507 put2byte(&oldPage[n+2], leftover);
508 memset(&oldPage[n+4], 0, leftover-4);
509 }
drhc39e0002004-05-07 23:50:57 +0000510 oldPage[hdr+5] = 0;
drh365d68f2001-05-11 11:02:46 +0000511}
512
drha059ad02001-04-17 20:09:11 +0000513/*
drh9e572e62004-04-23 23:43:10 +0000514** Allocate nByte bytes of space on a page. If nByte is less than
515** 4 it is rounded up to 4.
drhbd03cae2001-06-02 02:40:57 +0000516**
drh9e572e62004-04-23 23:43:10 +0000517** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000518** the new allocation. Or return 0 if there is not enough free
519** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000520**
drh72f82862001-05-24 21:06:34 +0000521** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000522** nBytes of contiguous free space, then this routine automatically
523** calls defragementPage() to consolidate all free space before
524** allocating the new chunk.
drh9e572e62004-04-23 23:43:10 +0000525**
526** Algorithm: Carve a piece off of the first freeblock that is
527** nByte in size or that larger.
drh7e3b0a02001-04-28 16:52:40 +0000528*/
drh9e572e62004-04-23 23:43:10 +0000529static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000530 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000531 int size;
drh24cd67e2004-05-10 16:18:47 +0000532 int nFrag;
drh9e572e62004-04-23 23:43:10 +0000533 unsigned char *data;
drh44ce7e22003-06-17 02:57:17 +0000534#ifndef NDEBUG
535 int cnt = 0;
536#endif
drh72f82862001-05-24 21:06:34 +0000537
drh9e572e62004-04-23 23:43:10 +0000538 data = pPage->aData;
drha34b6762004-05-07 13:30:42 +0000539 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000540 assert( pPage->pBt );
541 if( nByte<4 ) nByte = 4;
drh14acc042001-06-10 19:56:58 +0000542 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
drh9e572e62004-04-23 23:43:10 +0000543 hdr = pPage->hdrOffset;
drh24cd67e2004-05-10 16:18:47 +0000544 nFrag = data[hdr+5];
545 if( nFrag>=60 || nFrag>pPage->nFree-nByte ){
drh9e572e62004-04-23 23:43:10 +0000546 defragmentPage(pPage);
drh2af926b2001-05-15 00:39:25 +0000547 }
drh3aac2dd2004-04-26 14:10:20 +0000548 addr = hdr+1;
549 pc = get2byte(&data[addr]);
550 assert( addr<pc );
drha34b6762004-05-07 13:30:42 +0000551 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000552 while( (size = get2byte(&data[pc+2]))<nByte ){
553 addr = pc;
554 pc = get2byte(&data[addr]);
drha34b6762004-05-07 13:30:42 +0000555 assert( pc<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000556 assert( pc>=addr+size+4 || pc==0 );
drh9e572e62004-04-23 23:43:10 +0000557 if( pc==0 ){
558 assert( (cnt++)==0 );
559 defragmentPage(pPage);
560 assert( data[hdr+5]==0 );
drh3aac2dd2004-04-26 14:10:20 +0000561 addr = pPage->hdrOffset+1;
562 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000563 }
564 }
565 assert( pc>0 && size>=nByte );
drha34b6762004-05-07 13:30:42 +0000566 assert( pc+size<=pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000567 if( size>nByte+4 ){
drhde647132004-05-07 17:57:49 +0000568 int newStart = pc+nByte;
569 put2byte(&data[addr], newStart);
570 put2byte(&data[newStart], get2byte(&data[pc]));
571 put2byte(&data[newStart+2], size-nByte);
drh2af926b2001-05-15 00:39:25 +0000572 }else{
drh3aac2dd2004-04-26 14:10:20 +0000573 put2byte(&data[addr], get2byte(&data[pc]));
drh9e572e62004-04-23 23:43:10 +0000574 data[hdr+5] += size-nByte;
drh2af926b2001-05-15 00:39:25 +0000575 }
576 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000577 assert( pPage->nFree>=0 );
578 return pc;
drh7e3b0a02001-04-28 16:52:40 +0000579}
580
581/*
drh9e572e62004-04-23 23:43:10 +0000582** Return a section of the pPage->aData to the freelist.
583** The first byte of the new free block is pPage->aDisk[start]
584** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000585**
586** Most of the effort here is involved in coalesing adjacent
587** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000588*/
drh9e572e62004-04-23 23:43:10 +0000589static void freeSpace(MemPage *pPage, int start, int size){
590 int end = start + size; /* End of the segment being freed */
drha34b6762004-05-07 13:30:42 +0000591 int addr, pbegin;
drh9e572e62004-04-23 23:43:10 +0000592#ifndef NDEBUG
593 int tsize = 0; /* Total size of all freeblocks */
594#endif
595 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000596
drh9e572e62004-04-23 23:43:10 +0000597 assert( pPage->pBt!=0 );
drha34b6762004-05-07 13:30:42 +0000598 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000599 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
600 assert( end<=pPage->pBt->pageSize );
601 if( size<4 ) size = 4;
602
603 /* Add the space back into the linked list of freeblocks */
drh3aac2dd2004-04-26 14:10:20 +0000604 addr = pPage->hdrOffset + 1;
605 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drh9e572e62004-04-23 23:43:10 +0000606 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000607 assert( pbegin>addr );
608 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000609 }
drh9e572e62004-04-23 23:43:10 +0000610 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000611 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000612 put2byte(&data[addr], start);
613 put2byte(&data[start], pbegin);
614 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000615 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000616
617 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000618 addr = pPage->hdrOffset + 1;
619 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000620 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000621 assert( pbegin>addr );
drh9e572e62004-04-23 23:43:10 +0000622 assert( pbegin<pPage->pBt->pageSize-4 );
623 pnext = get2byte(&data[pbegin]);
624 psize = get2byte(&data[pbegin+2]);
625 if( pbegin + psize + 3 >= pnext && pnext>0 ){
626 int frag = pnext - (pbegin+psize);
627 assert( frag<=data[pPage->hdrOffset+5] );
628 data[pPage->hdrOffset+5] -= frag;
629 put2byte(&data[pbegin], get2byte(&data[pnext]));
630 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
631 }else{
632 assert( (tsize += psize)>0 );
drh3aac2dd2004-04-26 14:10:20 +0000633 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000634 }
635 }
636 assert( tsize+data[pPage->hdrOffset+5]==pPage->nFree );
drh7e3b0a02001-04-28 16:52:40 +0000637}
638
drh9e572e62004-04-23 23:43:10 +0000639/*
drh4b70f112004-05-02 21:12:19 +0000640** Resize the aCell[] array of the given page so that it is able to
641** hold at least nNewSz entries.
642**
643** Return SQLITE_OK or SQLITE_NOMEM.
644*/
645static int resizeCellArray(MemPage *pPage, int nNewSz){
drha34b6762004-05-07 13:30:42 +0000646 if( pPage->nCellAlloc<nNewSz ){
drh4b70f112004-05-02 21:12:19 +0000647 pPage->aCell = sqliteRealloc(pPage->aCell, nNewSz*sizeof(pPage->aCell[0]) );
danielk197724b03fd2004-05-10 10:34:34 +0000648 if( sqlite3_malloc_failed ) return SQLITE_NOMEM;
drha34b6762004-05-07 13:30:42 +0000649 pPage->nCellAlloc = nNewSz;
drh4b70f112004-05-02 21:12:19 +0000650 }
651 return SQLITE_OK;
652}
653
654/*
drh7e3b0a02001-04-28 16:52:40 +0000655** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000656**
drhbd03cae2001-06-02 02:40:57 +0000657** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +0000658** is the parent of the page being initialized. The root of a
659** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000660**
drh72f82862001-05-24 21:06:34 +0000661** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000662** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000663** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
664** guarantee that the page is well-formed. It only shows that
665** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000666*/
drh9e572e62004-04-23 23:43:10 +0000667static int initPage(
drh3aac2dd2004-04-26 14:10:20 +0000668 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +0000669 MemPage *pParent /* The parent. Might be NULL */
670){
drh3aac2dd2004-04-26 14:10:20 +0000671 int c, pc, i, hdr;
drha34b6762004-05-07 13:30:42 +0000672 unsigned char *data;
673 int pageSize;
drh9e572e62004-04-23 23:43:10 +0000674 int sumCell = 0; /* Total size of all cells */
drh2af926b2001-05-15 00:39:25 +0000675
drh3aac2dd2004-04-26 14:10:20 +0000676 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000677 assert( pParent==0 || pParent->pBt==pPage->pBt );
drha34b6762004-05-07 13:30:42 +0000678 assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
drhde647132004-05-07 17:57:49 +0000679 assert( pPage->aData == &((unsigned char*)pPage)[-pPage->pBt->pageSize] );
drhda200cc2004-05-09 11:51:38 +0000680 assert( pPage->pParent==0 || pPage->pParent==pParent );
681 if( pPage->pParent==0 && pParent!=0 ){
682 pPage->pParent = pParent;
drha34b6762004-05-07 13:30:42 +0000683 sqlite3pager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +0000684 }
drhda200cc2004-05-09 11:51:38 +0000685 if( pPage->isInit ) return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +0000686 pPage->nCell = pPage->nCellAlloc = 0;
drhde647132004-05-07 17:57:49 +0000687 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
688 hdr = pPage->hdrOffset;
drha34b6762004-05-07 13:30:42 +0000689 data = pPage->aData;
690 c = data[hdr];
drh9e572e62004-04-23 23:43:10 +0000691 pPage->intKey = (c & PTF_INTKEY)!=0;
692 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh4b70f112004-05-02 21:12:19 +0000693 pPage->leaf = (c & PTF_LEAF)!=0;
drhc8629a12004-05-08 20:07:40 +0000694 pPage->isOverfull = 0;
drhda200cc2004-05-09 11:51:38 +0000695 pPage->needRelink = 0;
drhc8629a12004-05-08 20:07:40 +0000696 pPage->idxShift = 0;
drha34b6762004-05-07 13:30:42 +0000697 pageSize = pPage->pBt->pageSize;
drh2af926b2001-05-15 00:39:25 +0000698
drh9e572e62004-04-23 23:43:10 +0000699 /* Initialize the cell count and cell pointers */
700 pc = get2byte(&data[hdr+3]);
701 while( pc>0 ){
drha34b6762004-05-07 13:30:42 +0000702 if( pc>=pageSize ) return SQLITE_CORRUPT;
703 if( pPage->nCell>pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000704 pPage->nCell++;
705 pc = get2byte(&data[pc]);
706 }
drh4b70f112004-05-02 21:12:19 +0000707 if( resizeCellArray(pPage, pPage->nCell) ){
drh9e572e62004-04-23 23:43:10 +0000708 return SQLITE_NOMEM;
709 }
710 pc = get2byte(&data[hdr+3]);
711 for(i=0; pc>0; i++){
712 pPage->aCell[i] = &data[pc];
drh9e572e62004-04-23 23:43:10 +0000713 sumCell += cellSize(pPage, &data[pc]);
drhde647132004-05-07 17:57:49 +0000714 pc = get2byte(&data[pc]);
drh9e572e62004-04-23 23:43:10 +0000715 }
716
717 /* Compute the total free space on the page */
718 pPage->nFree = data[hdr+5];
719 pc = get2byte(&data[hdr+1]);
720 while( pc>0 ){
721 int next, size;
drha34b6762004-05-07 13:30:42 +0000722 if( pc>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000723 next = get2byte(&data[pc]);
724 size = get2byte(&data[pc+2]);
drh3aac2dd2004-04-26 14:10:20 +0000725 if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000726 pPage->nFree += size;
727 pc = next;
728 }
drha34b6762004-05-07 13:30:42 +0000729 if( pPage->nFree>=pageSize ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000730
731 /* Sanity check: Cells and freespace and header must sum to the size
732 ** a page. */
drha34b6762004-05-07 13:30:42 +0000733 if( sumCell+pPage->nFree+hdr+10-pPage->leaf*4 != pageSize ){
734 return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000735 }
736
drhde647132004-05-07 17:57:49 +0000737 pPage->isInit = 1;
drhda200cc2004-05-09 11:51:38 +0000738 pageIntegrity(pPage);
drh9e572e62004-04-23 23:43:10 +0000739 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000740}
741
742/*
drh8b2f49b2001-06-08 00:21:52 +0000743** Set up a raw page so that it looks like a database page holding
744** no entries.
drhbd03cae2001-06-02 02:40:57 +0000745*/
drh9e572e62004-04-23 23:43:10 +0000746static void zeroPage(MemPage *pPage, int flags){
747 unsigned char *data = pPage->aData;
748 Btree *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +0000749 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +0000750 int first;
751
drhda200cc2004-05-09 11:51:38 +0000752 assert( sqlite3pager_pagenumber(data)==pPage->pgno );
753 assert( &data[pBt->pageSize] == (unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000754 assert( sqlite3pager_iswriteable(data) );
drh9e572e62004-04-23 23:43:10 +0000755 memset(&data[hdr], 0, pBt->pageSize - hdr);
756 data[hdr] = flags;
drhde647132004-05-07 17:57:49 +0000757 first = hdr + 6 + 4*((flags&PTF_LEAF)==0);
drh9e572e62004-04-23 23:43:10 +0000758 put2byte(&data[hdr+1], first);
759 put2byte(&data[first+2], pBt->pageSize - first);
760 sqliteFree(pPage->aCell);
761 pPage->aCell = 0;
drh8c42ca92001-06-22 19:15:00 +0000762 pPage->nCell = 0;
drhde647132004-05-07 17:57:49 +0000763 pPage->nCellAlloc = 0;
drh9e572e62004-04-23 23:43:10 +0000764 pPage->nFree = pBt->pageSize - first;
765 pPage->intKey = (flags & PTF_INTKEY)!=0;
766 pPage->leaf = (flags & PTF_LEAF)!=0;
767 pPage->zeroData = (flags & PTF_ZERODATA)!=0;
768 pPage->hdrOffset = hdr;
drhda200cc2004-05-09 11:51:38 +0000769 pPage->isOverfull = 0;
770 pPage->needRelink = 0;
771 pPage->idxShift = 0;
772 pPage->isInit = 1;
773 pageIntegrity(pPage);
drhbd03cae2001-06-02 02:40:57 +0000774}
775
776/*
drh3aac2dd2004-04-26 14:10:20 +0000777** Get a page from the pager. Initialize the MemPage.pBt and
778** MemPage.aData elements if needed.
779*/
780static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
781 int rc;
782 unsigned char *aData;
783 MemPage *pPage;
drha34b6762004-05-07 13:30:42 +0000784 rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
drh3aac2dd2004-04-26 14:10:20 +0000785 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +0000786 pPage = (MemPage*)&aData[pBt->pageSize];
drh3aac2dd2004-04-26 14:10:20 +0000787 pPage->aData = aData;
788 pPage->pBt = pBt;
789 pPage->pgno = pgno;
drhde647132004-05-07 17:57:49 +0000790 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
drh3aac2dd2004-04-26 14:10:20 +0000791 *ppPage = pPage;
792 return SQLITE_OK;
793}
794
795/*
drhde647132004-05-07 17:57:49 +0000796** Get a page from the pager and initialize it. This routine
797** is just a convenience wrapper around separate calls to
798** getPage() and initPage().
799*/
800static int getAndInitPage(
801 Btree *pBt, /* The database file */
802 Pgno pgno, /* Number of the page to get */
803 MemPage **ppPage, /* Write the page pointer here */
804 MemPage *pParent /* Parent of the page */
805){
806 int rc;
807 rc = getPage(pBt, pgno, ppPage);
808 if( rc==SQLITE_OK ){
809 rc = initPage(*ppPage, pParent);
810 }
811 return rc;
812}
813
814/*
drh3aac2dd2004-04-26 14:10:20 +0000815** Release a MemPage. This should be called once for each prior
816** call to getPage.
817*/
drh4b70f112004-05-02 21:12:19 +0000818static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000819 if( pPage ){
820 assert( pPage->aData );
821 assert( pPage->pBt );
822 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
drha34b6762004-05-07 13:30:42 +0000823 sqlite3pager_unref(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +0000824 }
825}
826
827/*
drh72f82862001-05-24 21:06:34 +0000828** This routine is called when the reference count for a page
829** reaches zero. We need to unref the pParent pointer when that
830** happens.
831*/
832static void pageDestructor(void *pData){
drh9e572e62004-04-23 23:43:10 +0000833 MemPage *pPage = (MemPage*)&((char*)pData)[SQLITE_PAGE_SIZE];
drhda200cc2004-05-09 11:51:38 +0000834 assert( pPage->isInit==0 || pPage->needRelink==0 );
drh72f82862001-05-24 21:06:34 +0000835 if( pPage->pParent ){
836 MemPage *pParent = pPage->pParent;
837 pPage->pParent = 0;
drha34b6762004-05-07 13:30:42 +0000838 releasePage(pParent);
drh72f82862001-05-24 21:06:34 +0000839 }
drh9e572e62004-04-23 23:43:10 +0000840 sqliteFree(pPage->aCell);
841 pPage->aCell = 0;
drh3aac2dd2004-04-26 14:10:20 +0000842 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +0000843}
844
845/*
drh306dc212001-05-21 13:45:10 +0000846** Open a new database.
847**
848** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000849** for accessing the database. We do not open the database file
850** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000851**
852** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000853** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +0000854** database file will be deleted when sqlite3BtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000855*/
drh23e11ca2004-05-04 17:27:28 +0000856int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +0000857 const char *zFilename, /* Name of the file containing the BTree database */
858 Btree **ppBtree, /* Pointer to new Btree object written here */
859 int nCache, /* Number of cache pages */
860 int flags /* Options */
drh6019e162001-07-02 17:51:45 +0000861){
drha059ad02001-04-17 20:09:11 +0000862 Btree *pBt;
drha34b6762004-05-07 13:30:42 +0000863 int rc;
drha059ad02001-04-17 20:09:11 +0000864
drhd62d3d02003-01-24 12:14:20 +0000865 /*
866 ** The following asserts make sure that structures used by the btree are
867 ** the right size. This is to guard against size changes that result
868 ** when compiling on a different architecture.
869 */
drh9e572e62004-04-23 23:43:10 +0000870 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +0000871 assert( sizeof(u32)==4 );
872 assert( sizeof(u16)==2 );
873 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +0000874 assert( sizeof(ptr)==sizeof(char*) );
875 assert( sizeof(uptr)==sizeof(ptr) );
876
drha059ad02001-04-17 20:09:11 +0000877 pBt = sqliteMalloc( sizeof(*pBt) );
878 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000879 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000880 return SQLITE_NOMEM;
881 }
drh6019e162001-07-02 17:51:45 +0000882 if( nCache<10 ) nCache = 10;
drha34b6762004-05-07 13:30:42 +0000883 rc = sqlite3pager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
drh3aac2dd2004-04-26 14:10:20 +0000884 (flags & BTREE_OMIT_JOURNAL)==0);
drha059ad02001-04-17 20:09:11 +0000885 if( rc!=SQLITE_OK ){
drha34b6762004-05-07 13:30:42 +0000886 if( pBt->pPager ) sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000887 sqliteFree(pBt);
888 *ppBtree = 0;
889 return rc;
890 }
drha34b6762004-05-07 13:30:42 +0000891 sqlite3pager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000892 pBt->pCursor = 0;
drha34b6762004-05-07 13:30:42 +0000893 pBt->pPage1 = 0;
894 pBt->readOnly = sqlite3pager_isreadonly(pBt->pPager);
drh4b70f112004-05-02 21:12:19 +0000895 pBt->pageSize = SQLITE_PAGE_SIZE; /* FIX ME - read from header */
drh3a4c1412004-05-09 20:40:11 +0000896
897 /* maxLocal is the maximum amount of payload to store locally for
898 ** a cell. Make sure it is small enough so that at least MN_CELLS_PER_PAGE
899 ** will fit on one page. We assume a 10-byte page header. Besides
900 ** the payload, the cell must store:
901 ** 2-byte pointer to next cell
902 ** 4-byte child pointer
903 ** 9-byte nKey value
904 ** 4-byte nData value
905 ** 4-byte overflow page pointer
906 */
907 pBt->maxLocal = (pBt->pageSize-10)/MN_CELLS_PER_PAGE - 23;
908 assert( pBt->maxLocal + 23 <= MX_CELL_SIZE );
drha059ad02001-04-17 20:09:11 +0000909 *ppBtree = pBt;
910 return SQLITE_OK;
911}
912
913/*
914** Close an open database and invalidate all cursors.
915*/
drh3aac2dd2004-04-26 14:10:20 +0000916int sqlite3BtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000917 while( pBt->pCursor ){
drh3aac2dd2004-04-26 14:10:20 +0000918 sqlite3BtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000919 }
drha34b6762004-05-07 13:30:42 +0000920 sqlite3pager_close(pBt->pPager);
drha059ad02001-04-17 20:09:11 +0000921 sqliteFree(pBt);
922 return SQLITE_OK;
923}
924
925/*
drhda47d772002-12-02 04:25:19 +0000926** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000927**
928** The maximum number of cache pages is set to the absolute
929** value of mxPage. If mxPage is negative, the pager will
930** operate asynchronously - it will not stop to do fsync()s
931** to insure data is written to the disk surface before
932** continuing. Transactions still work if synchronous is off,
933** and the database cannot be corrupted if this program
934** crashes. But if the operating system crashes or there is
935** an abrupt power failure when synchronous is off, the database
936** could be left in an inconsistent and unrecoverable state.
937** Synchronous is on by default so database corruption is not
938** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000939*/
drh23e11ca2004-05-04 17:27:28 +0000940int sqlite3BtreeSetCacheSize(Btree *pBt, int mxPage){
drha34b6762004-05-07 13:30:42 +0000941 sqlite3pager_set_cachesize(pBt->pPager, mxPage);
drhf57b14a2001-09-14 18:54:08 +0000942 return SQLITE_OK;
943}
944
945/*
drh973b6e32003-02-12 14:09:42 +0000946** Change the way data is synced to disk in order to increase or decrease
947** how well the database resists damage due to OS crashes and power
948** failures. Level 1 is the same as asynchronous (no syncs() occur and
949** there is a high probability of damage) Level 2 is the default. There
950** is a very low but non-zero probability of damage. Level 3 reduces the
951** probability of damage to near zero but with a write performance reduction.
952*/
drh3aac2dd2004-04-26 14:10:20 +0000953int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
drha34b6762004-05-07 13:30:42 +0000954 sqlite3pager_set_safety_level(pBt->pPager, level);
drh973b6e32003-02-12 14:09:42 +0000955 return SQLITE_OK;
956}
957
958/*
drha34b6762004-05-07 13:30:42 +0000959** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +0000960** also acquire a readlock on that file.
961**
962** SQLITE_OK is returned on success. If the file is not a
963** well-formed database file, then SQLITE_CORRUPT is returned.
964** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
965** is returned if we run out of memory. SQLITE_PROTOCOL is returned
966** if there is a locking protocol violation.
967*/
968static int lockBtree(Btree *pBt){
969 int rc;
drh3aac2dd2004-04-26 14:10:20 +0000970 MemPage *pPage1;
drha34b6762004-05-07 13:30:42 +0000971 if( pBt->pPage1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000972 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +0000973 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +0000974
drh306dc212001-05-21 13:45:10 +0000975
976 /* Do some checking to help insure the file we opened really is
977 ** a valid database file.
978 */
drha34b6762004-05-07 13:30:42 +0000979 if( sqlite3pager_pagecount(pBt->pPager)>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000980 if( memcmp(pPage1->aData, zMagicHeader, 16)!=0 ){
drhc602f9a2004-02-12 19:01:04 +0000981 rc = SQLITE_NOTADB;
drh72f82862001-05-24 21:06:34 +0000982 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +0000983 }
drh9e572e62004-04-23 23:43:10 +0000984 /*** TBD: Other header checks such as page size ****/
drh306dc212001-05-21 13:45:10 +0000985 }
drh3aac2dd2004-04-26 14:10:20 +0000986 pBt->pPage1 = pPage1;
drh306dc212001-05-21 13:45:10 +0000987 return rc;
988
drh72f82862001-05-24 21:06:34 +0000989page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +0000990 releasePage(pPage1);
991 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +0000992 return rc;
drh306dc212001-05-21 13:45:10 +0000993}
994
995/*
drhb8ca3072001-12-05 00:21:20 +0000996** If there are no outstanding cursors and we are not in the middle
997** of a transaction but there is a read lock on the database, then
998** this routine unrefs the first page of the database file which
999** has the effect of releasing the read lock.
1000**
1001** If there are any outstanding cursors, this routine is a no-op.
1002**
1003** If there is a transaction in progress, this routine is a no-op.
1004*/
1005static void unlockBtreeIfUnused(Btree *pBt){
drh3aac2dd2004-04-26 14:10:20 +00001006 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->pPage1!=0 ){
1007 releasePage(pBt->pPage1);
1008 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +00001009 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001010 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001011 }
1012}
1013
1014/*
drh9e572e62004-04-23 23:43:10 +00001015** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001016** file.
drh8b2f49b2001-06-08 00:21:52 +00001017*/
1018static int newDatabase(Btree *pBt){
drh9e572e62004-04-23 23:43:10 +00001019 MemPage *pP1;
1020 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001021 int rc;
drhde647132004-05-07 17:57:49 +00001022 if( sqlite3pager_pagecount(pBt->pPager)>0 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00001023 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001024 assert( pP1!=0 );
1025 data = pP1->aData;
drha34b6762004-05-07 13:30:42 +00001026 rc = sqlite3pager_write(data);
drh8b2f49b2001-06-08 00:21:52 +00001027 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001028 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1029 assert( sizeof(zMagicHeader)==16 );
1030 put2byte(&data[16], SQLITE_PAGE_SIZE);
1031 data[18] = 1;
1032 data[19] = 1;
1033 put2byte(&data[22], (SQLITE_PAGE_SIZE-10)/4-12);
1034 zeroPage(pP1, PTF_INTKEY|PTF_LEAF);
drh8b2f49b2001-06-08 00:21:52 +00001035 return SQLITE_OK;
1036}
1037
1038/*
drh72f82862001-05-24 21:06:34 +00001039** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001040**
1041** A transaction must be started before attempting any changes
1042** to the database. None of the following routines will work
1043** unless a transaction is started first:
1044**
drh23e11ca2004-05-04 17:27:28 +00001045** sqlite3BtreeCreateTable()
1046** sqlite3BtreeCreateIndex()
1047** sqlite3BtreeClearTable()
1048** sqlite3BtreeDropTable()
1049** sqlite3BtreeInsert()
1050** sqlite3BtreeDelete()
1051** sqlite3BtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +00001052*/
drh3aac2dd2004-04-26 14:10:20 +00001053int sqlite3BtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001054 int rc;
1055 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +00001056 if( pBt->readOnly ) return SQLITE_READONLY;
drh3aac2dd2004-04-26 14:10:20 +00001057 if( pBt->pPage1==0 ){
drh7e3b0a02001-04-28 16:52:40 +00001058 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +00001059 if( rc!=SQLITE_OK ){
1060 return rc;
1061 }
drha059ad02001-04-17 20:09:11 +00001062 }
drha34b6762004-05-07 13:30:42 +00001063 rc = sqlite3pager_begin(pBt->pPage1->aData);
drhf74b8d92002-09-01 23:20:45 +00001064 if( rc==SQLITE_OK ){
1065 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +00001066 }
drhb8ca3072001-12-05 00:21:20 +00001067 if( rc==SQLITE_OK ){
1068 pBt->inTrans = 1;
drh3aac2dd2004-04-26 14:10:20 +00001069 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001070 }else{
1071 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001072 }
drhb8ca3072001-12-05 00:21:20 +00001073 return rc;
drha059ad02001-04-17 20:09:11 +00001074}
1075
1076/*
drh2aa679f2001-06-25 02:11:07 +00001077** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00001078**
1079** This will release the write lock on the database file. If there
1080** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001081*/
drh3aac2dd2004-04-26 14:10:20 +00001082int sqlite3BtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001083 int rc;
drha34b6762004-05-07 13:30:42 +00001084 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +00001085 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001086 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00001087 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001088 return rc;
1089}
1090
1091/*
drhc39e0002004-05-07 23:50:57 +00001092** Invalidate all cursors
1093*/
1094static void invalidateCursors(Btree *pBt){
1095 BtCursor *pCur;
1096 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1097 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001098 if( pPage /* && !pPage->isInit */ ){
1099 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001100 releasePage(pPage);
1101 pCur->pPage = 0;
1102 pCur->isValid = 0;
1103 pCur->status = SQLITE_ABORT;
1104 }
1105 }
1106}
1107
drhda200cc2004-05-09 11:51:38 +00001108#ifdef SQLITE_TEST
1109/*
1110** Print debugging information about all cursors to standard output.
1111*/
1112void sqlite3BtreeCursorList(Btree *pBt){
1113 BtCursor *pCur;
1114 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1115 MemPage *pPage = pCur->pPage;
1116 char *zMode = pCur->wrFlag ? "rw" : "ro";
1117 printf("CURSOR %08x rooted at %4d(%s) currently at %d.%d%s\n",
1118 (int)pCur, pCur->pgnoRoot, zMode,
1119 pPage ? pPage->pgno : 0, pCur->idx,
1120 pCur->isValid ? "" : " eof"
1121 );
1122 }
1123}
1124#endif
1125
drhc39e0002004-05-07 23:50:57 +00001126/*
drhecdc7532001-09-23 02:35:53 +00001127** Rollback the transaction in progress. All cursors will be
1128** invalided by this operation. Any attempt to use a cursor
1129** that was open at the beginning of this operation will result
1130** in an error.
drh5e00f6c2001-09-13 13:46:56 +00001131**
1132** This will release the write lock on the database file. If there
1133** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001134*/
drh3aac2dd2004-04-26 14:10:20 +00001135int sqlite3BtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001136 int rc;
drh24cd67e2004-05-10 16:18:47 +00001137 MemPage *pPage1;
drh7c717f72001-06-24 20:39:41 +00001138 if( pBt->inTrans==0 ) return SQLITE_OK;
1139 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001140 pBt->inStmt = 0;
drh24cd67e2004-05-10 16:18:47 +00001141 if( pBt->readOnly ){
1142 rc = SQLITE_OK;
1143 }else{
1144 rc = sqlite3pager_rollback(pBt->pPager);
1145 /* The rollback may have destroyed the pPage1->aData value. So
1146 ** call getPage() on page 1 again to make sure pPage1->aData is
1147 ** set correctly. */
1148 if( getPage(pBt, 1, &pPage1)==SQLITE_OK ){
1149 releasePage(pPage1);
1150 }
1151 }
drhc39e0002004-05-07 23:50:57 +00001152 invalidateCursors(pBt);
drh5e00f6c2001-09-13 13:46:56 +00001153 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001154 return rc;
1155}
1156
1157/*
drh663fc632002-02-02 18:49:19 +00001158** Set the checkpoint for the current transaction. The checkpoint serves
1159** as a sub-transaction that can be rolled back independently of the
1160** main transaction. You must start a transaction before starting a
1161** checkpoint. The checkpoint is ended automatically if the transaction
1162** commits or rolls back.
1163**
1164** Only one checkpoint may be active at a time. It is an error to try
1165** to start a new checkpoint if another checkpoint is already active.
1166*/
drh3aac2dd2004-04-26 14:10:20 +00001167int sqlite3BtreeBeginStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001168 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001169 if( !pBt->inTrans || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00001170 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00001171 }
drha34b6762004-05-07 13:30:42 +00001172 rc = pBt->readOnly ? SQLITE_OK : sqlite3pager_stmt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00001173 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00001174 return rc;
1175}
1176
1177
1178/*
1179** Commit a checkpoint to transaction currently in progress. If no
1180** checkpoint is active, this is a no-op.
1181*/
drh3aac2dd2004-04-26 14:10:20 +00001182int sqlite3BtreeCommitStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001183 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001184 if( pBt->inStmt && !pBt->readOnly ){
drha34b6762004-05-07 13:30:42 +00001185 rc = sqlite3pager_stmt_commit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00001186 }else{
1187 rc = SQLITE_OK;
1188 }
drh3aac2dd2004-04-26 14:10:20 +00001189 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001190 return rc;
1191}
1192
1193/*
1194** Rollback the checkpoint to the current transaction. If there
1195** is no active checkpoint or transaction, this routine is a no-op.
1196**
1197** All cursors will be invalided by this operation. Any attempt
1198** to use a cursor that was open at the beginning of this operation
1199** will result in an error.
1200*/
drh3aac2dd2004-04-26 14:10:20 +00001201int sqlite3BtreeRollbackStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001202 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001203 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drha34b6762004-05-07 13:30:42 +00001204 rc = sqlite3pager_stmt_rollback(pBt->pPager);
drhc39e0002004-05-07 23:50:57 +00001205 invalidateCursors(pBt);
drh3aac2dd2004-04-26 14:10:20 +00001206 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001207 return rc;
1208}
1209
1210/*
drh3aac2dd2004-04-26 14:10:20 +00001211** Default key comparison function to be used if no comparison function
1212** is specified on the sqlite3BtreeCursor() call.
1213*/
1214static int dfltCompare(
1215 void *NotUsed, /* User data is not used */
1216 int n1, const void *p1, /* First key to compare */
1217 int n2, const void *p2 /* Second key to compare */
1218){
1219 int c;
1220 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1221 if( c==0 ){
1222 c = n1 - n2;
1223 }
1224 return c;
1225}
1226
1227/*
drh8b2f49b2001-06-08 00:21:52 +00001228** Create a new cursor for the BTree whose root is on the page
1229** iTable. The act of acquiring a cursor gets a read lock on
1230** the database file.
drh1bee3d72001-10-15 00:44:35 +00001231**
1232** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001233** If wrFlag==1, then the cursor can be used for reading or for
1234** writing if other conditions for writing are also met. These
1235** are the conditions that must be met in order for writing to
1236** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001237**
drhf74b8d92002-09-01 23:20:45 +00001238** 1: The cursor must have been opened with wrFlag==1
1239**
1240** 2: No other cursors may be open with wrFlag==0 on the same table
1241**
1242** 3: The database must be writable (not on read-only media)
1243**
1244** 4: There must be an active transaction.
1245**
1246** Condition 2 warrants further discussion. If any cursor is opened
1247** on a table with wrFlag==0, that prevents all other cursors from
1248** writing to that table. This is a kind of "read-lock". When a cursor
1249** is opened with wrFlag==0 it is guaranteed that the table will not
1250** change as long as the cursor is open. This allows the cursor to
1251** do a sequential scan of the table without having to worry about
1252** entries being inserted or deleted during the scan. Cursors should
1253** be opened with wrFlag==0 only if this read-lock property is needed.
1254** That is to say, cursors should be opened with wrFlag==0 only if they
drh23e11ca2004-05-04 17:27:28 +00001255** intend to use the sqlite3BtreeNext() system call. All other cursors
drhf74b8d92002-09-01 23:20:45 +00001256** should be opened with wrFlag==1 even if they never really intend
1257** to write.
1258**
drh6446c4d2001-12-15 14:22:18 +00001259** No checking is done to make sure that page iTable really is the
1260** root page of a b-tree. If it is not, then the cursor acquired
1261** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00001262**
1263** The comparison function must be logically the same for every cursor
1264** on a particular table. Changing the comparison function will result
1265** in incorrect operations. If the comparison function is NULL, a
1266** default comparison function is used. The comparison function is
1267** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00001268*/
drh3aac2dd2004-04-26 14:10:20 +00001269int sqlite3BtreeCursor(
1270 Btree *pBt, /* The btree */
1271 int iTable, /* Root page of table to open */
1272 int wrFlag, /* 1 to write. 0 read-only */
1273 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1274 void *pArg, /* First arg to xCompare() */
1275 BtCursor **ppCur /* Write new cursor here */
1276){
drha059ad02001-04-17 20:09:11 +00001277 int rc;
drhf74b8d92002-09-01 23:20:45 +00001278 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001279
drha0c9a112004-03-10 13:42:37 +00001280 if( pBt->readOnly && wrFlag ){
1281 *ppCur = 0;
1282 return SQLITE_READONLY;
1283 }
drh4b70f112004-05-02 21:12:19 +00001284 if( pBt->pPage1==0 ){
drha059ad02001-04-17 20:09:11 +00001285 rc = lockBtree(pBt);
1286 if( rc!=SQLITE_OK ){
1287 *ppCur = 0;
1288 return rc;
1289 }
1290 }
1291 pCur = sqliteMalloc( sizeof(*pCur) );
1292 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001293 rc = SQLITE_NOMEM;
1294 goto create_cursor_exception;
1295 }
drh8b2f49b2001-06-08 00:21:52 +00001296 pCur->pgnoRoot = (Pgno)iTable;
drh24cd67e2004-05-10 16:18:47 +00001297 if( iTable==1 && sqlite3pager_pagecount(pBt->pPager)==0 ){
1298 rc = SQLITE_EMPTY;
1299 goto create_cursor_exception;
1300 }
drhde647132004-05-07 17:57:49 +00001301 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00001302 if( rc!=SQLITE_OK ){
1303 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001304 }
drh3aac2dd2004-04-26 14:10:20 +00001305 pCur->xCompare = xCmp ? xCmp : dfltCompare;
1306 pCur->pArg = pArg;
drh14acc042001-06-10 19:56:58 +00001307 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001308 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001309 pCur->idx = 0;
drha059ad02001-04-17 20:09:11 +00001310 pCur->pNext = pBt->pCursor;
1311 if( pCur->pNext ){
1312 pCur->pNext->pPrev = pCur;
1313 }
drh14acc042001-06-10 19:56:58 +00001314 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001315 pRing = pBt->pCursor;
1316 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1317 if( pRing ){
1318 pCur->pShared = pRing->pShared;
1319 pRing->pShared = pCur;
1320 }else{
1321 pCur->pShared = pCur;
1322 }
drha059ad02001-04-17 20:09:11 +00001323 pBt->pCursor = pCur;
drhc39e0002004-05-07 23:50:57 +00001324 pCur->isValid = 0;
1325 pCur->status = SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001326 *ppCur = pCur;
1327 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001328
1329create_cursor_exception:
1330 *ppCur = 0;
1331 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00001332 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001333 sqliteFree(pCur);
1334 }
drh5e00f6c2001-09-13 13:46:56 +00001335 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001336 return rc;
drha059ad02001-04-17 20:09:11 +00001337}
1338
1339/*
drh5e00f6c2001-09-13 13:46:56 +00001340** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001341** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001342*/
drh3aac2dd2004-04-26 14:10:20 +00001343int sqlite3BtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001344 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001345 if( pCur->pPrev ){
1346 pCur->pPrev->pNext = pCur->pNext;
1347 }else{
1348 pBt->pCursor = pCur->pNext;
1349 }
1350 if( pCur->pNext ){
1351 pCur->pNext->pPrev = pCur->pPrev;
1352 }
drh3aac2dd2004-04-26 14:10:20 +00001353 releasePage(pCur->pPage);
drhf74b8d92002-09-01 23:20:45 +00001354 if( pCur->pShared!=pCur ){
1355 BtCursor *pRing = pCur->pShared;
1356 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1357 pRing->pShared = pCur->pShared;
1358 }
drh5e00f6c2001-09-13 13:46:56 +00001359 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001360 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001361 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001362}
1363
drh7e3b0a02001-04-28 16:52:40 +00001364/*
drh5e2f8b92001-05-28 00:41:15 +00001365** Make a temporary cursor by filling in the fields of pTempCur.
1366** The temporary cursor is not on the cursor list for the Btree.
1367*/
drh14acc042001-06-10 19:56:58 +00001368static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001369 memcpy(pTempCur, pCur, sizeof(*pCur));
1370 pTempCur->pNext = 0;
1371 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001372 if( pTempCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001373 sqlite3pager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001374 }
drh5e2f8b92001-05-28 00:41:15 +00001375}
1376
1377/*
drhbd03cae2001-06-02 02:40:57 +00001378** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001379** function above.
1380*/
drh14acc042001-06-10 19:56:58 +00001381static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001382 if( pCur->pPage ){
drha34b6762004-05-07 13:30:42 +00001383 sqlite3pager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001384 }
drh5e2f8b92001-05-28 00:41:15 +00001385}
1386
1387/*
drh3aac2dd2004-04-26 14:10:20 +00001388** Set *pSize to the size of the buffer needed to hold the value of
1389** the key for the current entry. If the cursor is not pointing
1390** to a valid entry, *pSize is set to 0.
1391**
drh4b70f112004-05-02 21:12:19 +00001392** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00001393** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00001394*/
drh3aac2dd2004-04-26 14:10:20 +00001395int sqlite3BtreeKeySize(BtCursor *pCur, u64 *pSize){
drh2af926b2001-05-15 00:39:25 +00001396 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001397 unsigned char *cell;
drh2af926b2001-05-15 00:39:25 +00001398
drhc39e0002004-05-07 23:50:57 +00001399 if( !pCur->isValid ){
drh72f82862001-05-24 21:06:34 +00001400 *pSize = 0;
1401 }else{
drhc39e0002004-05-07 23:50:57 +00001402 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001403 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001404 assert( pPage!=0 );
1405 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1406 cell = pPage->aCell[pCur->idx];
drh3aac2dd2004-04-26 14:10:20 +00001407 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001408 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001409 cell += 4; /* Skip the child pointer */
1410 }
1411 if( !pPage->zeroData ){
drha34b6762004-05-07 13:30:42 +00001412 while( (0x80&*(cell++))!=0 ){} /* Skip the data size number */
drh3aac2dd2004-04-26 14:10:20 +00001413 }
drha34b6762004-05-07 13:30:42 +00001414 getVarint(cell, pSize);
drh72f82862001-05-24 21:06:34 +00001415 }
1416 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001417}
drh2af926b2001-05-15 00:39:25 +00001418
drh72f82862001-05-24 21:06:34 +00001419/*
1420** Read payload information from the entry that the pCur cursor is
1421** pointing to. Begin reading the payload at "offset" and read
1422** a total of "amt" bytes. Put the result in zBuf.
1423**
1424** This routine does not make a distinction between key and data.
1425** It just reads bytes from the payload area.
1426*/
drh3aac2dd2004-04-26 14:10:20 +00001427static int getPayload(
1428 BtCursor *pCur, /* Cursor pointing to entry to read from */
1429 int offset, /* Begin reading this far into payload */
1430 int amt, /* Read this many bytes */
1431 unsigned char *pBuf, /* Write the bytes into this buffer */
1432 int skipKey /* offset begins at data if this is true */
1433){
1434 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001435 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001436 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001437 MemPage *pPage;
1438 Btree *pBt;
1439 u64 nData, nKey;
1440 int maxLocal, ovflSize;
1441
drh72f82862001-05-24 21:06:34 +00001442 assert( pCur!=0 && pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001443 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001444 pBt = pCur->pBt;
1445 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001446 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00001447 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1448 aPayload = pPage->aCell[pCur->idx];
1449 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001450 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001451 aPayload += 4; /* Skip the child pointer */
1452 }
1453 if( pPage->zeroData ){
1454 nData = 0;
1455 }else{
1456 aPayload += getVarint(aPayload, &nData);
1457 }
drha34b6762004-05-07 13:30:42 +00001458 aPayload += getVarint(aPayload, &nKey);
drh3aac2dd2004-04-26 14:10:20 +00001459 if( pPage->intKey ){
1460 nKey = 0;
1461 }
1462 assert( offset>=0 );
1463 if( skipKey ){
1464 offset += nKey;
1465 }
1466 if( offset+amt > nKey+nData ){
drha34b6762004-05-07 13:30:42 +00001467 return SQLITE_ERROR;
drh3aac2dd2004-04-26 14:10:20 +00001468 }
drha34b6762004-05-07 13:30:42 +00001469 maxLocal = pBt->maxLocal;
drh3aac2dd2004-04-26 14:10:20 +00001470 if( offset<maxLocal ){
drh2af926b2001-05-15 00:39:25 +00001471 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001472 if( a+offset>maxLocal ){
1473 a = maxLocal - offset;
drh2af926b2001-05-15 00:39:25 +00001474 }
drha34b6762004-05-07 13:30:42 +00001475 memcpy(pBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001476 if( a==amt ){
1477 return SQLITE_OK;
1478 }
drh2aa679f2001-06-25 02:11:07 +00001479 offset = 0;
drha34b6762004-05-07 13:30:42 +00001480 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00001481 amt -= a;
drhdd793422001-06-28 01:54:48 +00001482 }else{
drh3aac2dd2004-04-26 14:10:20 +00001483 offset -= maxLocal;
drhbd03cae2001-06-02 02:40:57 +00001484 }
1485 if( amt>0 ){
drha34b6762004-05-07 13:30:42 +00001486 nextPage = get4byte(&aPayload[maxLocal]);
drh2af926b2001-05-15 00:39:25 +00001487 }
drh3aac2dd2004-04-26 14:10:20 +00001488 ovflSize = pBt->pageSize - 4;
drh2af926b2001-05-15 00:39:25 +00001489 while( amt>0 && nextPage ){
drha34b6762004-05-07 13:30:42 +00001490 rc = sqlite3pager_get(pBt->pPager, nextPage, (void**)&aPayload);
drh2af926b2001-05-15 00:39:25 +00001491 if( rc!=0 ){
1492 return rc;
1493 }
drha34b6762004-05-07 13:30:42 +00001494 nextPage = get4byte(aPayload);
drh3aac2dd2004-04-26 14:10:20 +00001495 if( offset<ovflSize ){
drh2af926b2001-05-15 00:39:25 +00001496 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001497 if( a + offset > ovflSize ){
1498 a = ovflSize - offset;
drh2af926b2001-05-15 00:39:25 +00001499 }
drh9b171272004-05-08 02:03:22 +00001500 memcpy(pBuf, &aPayload[offset+4], a);
drh2aa679f2001-06-25 02:11:07 +00001501 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001502 amt -= a;
drha34b6762004-05-07 13:30:42 +00001503 pBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001504 }else{
drh3aac2dd2004-04-26 14:10:20 +00001505 offset -= ovflSize;
drh2af926b2001-05-15 00:39:25 +00001506 }
drha34b6762004-05-07 13:30:42 +00001507 sqlite3pager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00001508 }
drha7fcb052001-12-14 15:09:55 +00001509 if( amt>0 ){
1510 return SQLITE_CORRUPT;
1511 }
1512 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001513}
1514
drh72f82862001-05-24 21:06:34 +00001515/*
drh3aac2dd2004-04-26 14:10:20 +00001516** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001517** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001518** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00001519**
drh3aac2dd2004-04-26 14:10:20 +00001520** Return SQLITE_OK on success or an error code if anything goes
1521** wrong. An error is returned if "offset+amt" is larger than
1522** the available payload.
drh72f82862001-05-24 21:06:34 +00001523*/
drha34b6762004-05-07 13:30:42 +00001524int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh8c1238a2003-01-02 14:43:55 +00001525 assert( amt>=0 );
1526 assert( offset>=0 );
drhc39e0002004-05-07 23:50:57 +00001527 if( pCur->isValid==0 ){
1528 return pCur->status;
drh3aac2dd2004-04-26 14:10:20 +00001529 }
drhc39e0002004-05-07 23:50:57 +00001530 assert( pCur->pPage!=0 );
1531 assert( pCur->pPage->intKey==0 );
1532 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001533 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1534}
1535
1536/*
1537** Return a pointer to the key of record that cursor pCur
1538** is point to if the entire key is in contiguous memory.
1539** If the key is split up among multiple tables, return 0.
1540** If pCur is not pointing to a valid entry return 0.
1541**
1542** The pointer returned is ephemeral. The key may move
1543** or be destroyed on the next call to any Btree routine.
1544**
1545** This routine is used to do quick key comparisons in the
1546** common case where the entire key fits in the payload area
drh4b70f112004-05-02 21:12:19 +00001547** of a cell and does not overflow onto secondary pages. If
1548** this routine returns 0 for a valid cursor, the caller will
1549** need to allocate a buffer big enough to hold the whole key
1550** then use sqlite3BtreeKey() to copy the key value into the
1551** buffer. That is substantially slower. But fortunately,
1552** most keys are small enough to fit in the payload area so
1553** the slower method is rarely needed.
drh3aac2dd2004-04-26 14:10:20 +00001554*/
1555void *sqlite3BtreeKeyFetch(BtCursor *pCur){
1556 unsigned char *aPayload;
1557 MemPage *pPage;
1558 Btree *pBt;
1559 u64 nData, nKey;
1560
drhc39e0002004-05-07 23:50:57 +00001561 assert( pCur!=0 );
1562 if( !pCur->isValid ){
1563 return 0;
1564 }
1565 assert( pCur->pPage!=0 );
1566 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001567 pBt = pCur->pBt;
1568 pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00001569 pageIntegrity(pPage);
drh3aac2dd2004-04-26 14:10:20 +00001570 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drhc39e0002004-05-07 23:50:57 +00001571 assert( pPage->intKey==0 );
drh3aac2dd2004-04-26 14:10:20 +00001572 aPayload = pPage->aCell[pCur->idx];
1573 aPayload += 2; /* Skip the next cell index */
drhde647132004-05-07 17:57:49 +00001574 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001575 aPayload += 4; /* Skip the child pointer */
1576 }
1577 if( !pPage->zeroData ){
1578 aPayload += getVarint(aPayload, &nData);
1579 }
drha34b6762004-05-07 13:30:42 +00001580 aPayload += getVarint(aPayload, &nKey);
drhc39e0002004-05-07 23:50:57 +00001581 if( nKey>pBt->maxLocal ){
drh5e00f6c2001-09-13 13:46:56 +00001582 return 0;
drh72f82862001-05-24 21:06:34 +00001583 }
drh3aac2dd2004-04-26 14:10:20 +00001584 return aPayload;
drh72f82862001-05-24 21:06:34 +00001585}
1586
drh3aac2dd2004-04-26 14:10:20 +00001587
drh72f82862001-05-24 21:06:34 +00001588/*
drhbd03cae2001-06-02 02:40:57 +00001589** Set *pSize to the number of bytes of data in the entry the
1590** cursor currently points to. Always return SQLITE_OK.
1591** Failure is not possible. If the cursor is not currently
1592** pointing to an entry (which can happen, for example, if
1593** the database is empty) then *pSize is set to 0.
drh72f82862001-05-24 21:06:34 +00001594*/
drh3aac2dd2004-04-26 14:10:20 +00001595int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drh72f82862001-05-24 21:06:34 +00001596 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00001597 unsigned char *cell;
1598 u64 size;
drh72f82862001-05-24 21:06:34 +00001599
drhc39e0002004-05-07 23:50:57 +00001600 if( !pCur->isValid ){
1601 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1602 }
drh72f82862001-05-24 21:06:34 +00001603 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001604 assert( pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001605 assert( pPage->isInit );
drhda200cc2004-05-09 11:51:38 +00001606 pageIntegrity(pPage);
drhc39e0002004-05-07 23:50:57 +00001607 if( pPage->zeroData ){
drh72f82862001-05-24 21:06:34 +00001608 *pSize = 0;
1609 }else{
drhc39e0002004-05-07 23:50:57 +00001610 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001611 cell = pPage->aCell[pCur->idx];
1612 cell += 2; /* Skip the offset to the next cell */
drhde647132004-05-07 17:57:49 +00001613 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00001614 cell += 4; /* Skip the child pointer */
1615 }
drha34b6762004-05-07 13:30:42 +00001616 getVarint(cell, &size);
drh3aac2dd2004-04-26 14:10:20 +00001617 assert( (size & 0x00000000ffffffff)==size );
drha34b6762004-05-07 13:30:42 +00001618 *pSize = (u32)size;
drh72f82862001-05-24 21:06:34 +00001619 }
1620 return SQLITE_OK;
1621}
1622
1623/*
drh3aac2dd2004-04-26 14:10:20 +00001624** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00001625** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00001626** begins at "offset".
1627**
1628** Return SQLITE_OK on success or an error code if anything goes
1629** wrong. An error is returned if "offset+amt" is larger than
1630** the available payload.
drh72f82862001-05-24 21:06:34 +00001631*/
drh3aac2dd2004-04-26 14:10:20 +00001632int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhc39e0002004-05-07 23:50:57 +00001633 if( !pCur->isValid ){
1634 return pCur->status ? pCur->status : SQLITE_INTERNAL;
1635 }
drh8c1238a2003-01-02 14:43:55 +00001636 assert( amt>=0 );
1637 assert( offset>=0 );
1638 assert( pCur->pPage!=0 );
drhc39e0002004-05-07 23:50:57 +00001639 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh3aac2dd2004-04-26 14:10:20 +00001640 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00001641}
1642
drh72f82862001-05-24 21:06:34 +00001643/*
drh8178a752003-01-05 21:41:40 +00001644** Move the cursor down to a new child page. The newPgno argument is the
1645** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001646*/
drh3aac2dd2004-04-26 14:10:20 +00001647static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00001648 int rc;
1649 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00001650 MemPage *pOldPage;
drh0d316a42002-08-11 20:10:47 +00001651 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001652
drhc39e0002004-05-07 23:50:57 +00001653 assert( pCur->isValid );
drhde647132004-05-07 17:57:49 +00001654 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001655 if( rc ) return rc;
drhda200cc2004-05-09 11:51:38 +00001656 pageIntegrity(pNewPage);
drh428ae8c2003-01-04 16:48:09 +00001657 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00001658 pOldPage = pCur->pPage;
1659 pOldPage->idxShift = 0;
1660 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00001661 pCur->pPage = pNewPage;
1662 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001663 if( pNewPage->nCell<1 ){
1664 return SQLITE_CORRUPT;
1665 }
drh72f82862001-05-24 21:06:34 +00001666 return SQLITE_OK;
1667}
1668
1669/*
drh8856d6a2004-04-29 14:42:46 +00001670** Return true if the page is the virtual root of its table.
1671**
1672** The virtual root page is the root page for most tables. But
1673** for the table rooted on page 1, sometime the real root page
1674** is empty except for the right-pointer. In such cases the
1675** virtual root page is the page that the right-pointer of page
1676** 1 is pointing to.
1677*/
1678static int isRootPage(MemPage *pPage){
1679 MemPage *pParent = pPage->pParent;
drhda200cc2004-05-09 11:51:38 +00001680 if( pParent==0 ) return 1;
1681 if( pParent->pgno>1 ) return 0;
1682 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
drh8856d6a2004-04-29 14:42:46 +00001683 return 0;
1684}
1685
1686/*
drh5e2f8b92001-05-28 00:41:15 +00001687** Move the cursor up to the parent page.
1688**
1689** pCur->idx is set to the cell index that contains the pointer
1690** to the page we are coming from. If we are coming from the
1691** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001692** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001693*/
drh8178a752003-01-05 21:41:40 +00001694static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001695 Pgno oldPgno;
1696 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001697 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001698 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001699
drhc39e0002004-05-07 23:50:57 +00001700 assert( pCur->isValid );
drh8178a752003-01-05 21:41:40 +00001701 pPage = pCur->pPage;
1702 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00001703 assert( !isRootPage(pPage) );
drhda200cc2004-05-09 11:51:38 +00001704 pageIntegrity(pPage);
drh8178a752003-01-05 21:41:40 +00001705 pParent = pPage->pParent;
1706 assert( pParent!=0 );
drhda200cc2004-05-09 11:51:38 +00001707 pageIntegrity(pParent);
drh8178a752003-01-05 21:41:40 +00001708 idxParent = pPage->idxParent;
drha34b6762004-05-07 13:30:42 +00001709 sqlite3pager_ref(pParent->aData);
drh3aac2dd2004-04-26 14:10:20 +00001710 oldPgno = pPage->pgno;
1711 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00001712 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001713 assert( pParent->idxShift==0 );
1714 if( pParent->idxShift==0 ){
1715 pCur->idx = idxParent;
1716#ifndef NDEBUG
1717 /* Verify that pCur->idx is the correct index to point back to the child
1718 ** page we just came from
1719 */
drh428ae8c2003-01-04 16:48:09 +00001720 if( pCur->idx<pParent->nCell ){
drha34b6762004-05-07 13:30:42 +00001721 assert( get4byte(&pParent->aCell[idxParent][2])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001722 }else{
drha34b6762004-05-07 13:30:42 +00001723 assert( get4byte(&pParent->aData[pParent->hdrOffset+6])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001724 }
1725#endif
1726 }else{
1727 /* The MemPage.idxShift flag indicates that cell indices might have
1728 ** changed since idxParent was set and hence idxParent might be out
1729 ** of date. So recompute the parent cell index by scanning all cells
1730 ** and locating the one that points to the child we just came from.
1731 */
1732 int i;
1733 pCur->idx = pParent->nCell;
drh428ae8c2003-01-04 16:48:09 +00001734 for(i=0; i<pParent->nCell; i++){
drh3aac2dd2004-04-26 14:10:20 +00001735 if( get4byte(&pParent->aCell[i][2])==oldPgno ){
drh428ae8c2003-01-04 16:48:09 +00001736 pCur->idx = i;
1737 break;
1738 }
drh72f82862001-05-24 21:06:34 +00001739 }
1740 }
1741}
1742
1743/*
1744** Move the cursor to the root page
1745*/
drh5e2f8b92001-05-28 00:41:15 +00001746static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00001747 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00001748 int rc;
drh0d316a42002-08-11 20:10:47 +00001749 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001750
drhde647132004-05-07 17:57:49 +00001751 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0);
drhc39e0002004-05-07 23:50:57 +00001752 if( rc ){
1753 pCur->isValid = 0;
1754 return rc;
1755 }
drh3aac2dd2004-04-26 14:10:20 +00001756 releasePage(pCur->pPage);
drhda200cc2004-05-09 11:51:38 +00001757 pageIntegrity(pRoot);
drh3aac2dd2004-04-26 14:10:20 +00001758 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00001759 pCur->idx = 0;
drh8856d6a2004-04-29 14:42:46 +00001760 if( pRoot->nCell==0 && !pRoot->leaf ){
1761 Pgno subpage;
1762 assert( pRoot->pgno==1 );
1763 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+6]);
1764 assert( subpage>0 );
drh4b70f112004-05-02 21:12:19 +00001765 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00001766 }
drhc39e0002004-05-07 23:50:57 +00001767 pCur->isValid = pCur->pPage->nCell>0;
drh8856d6a2004-04-29 14:42:46 +00001768 return rc;
drh72f82862001-05-24 21:06:34 +00001769}
drh2af926b2001-05-15 00:39:25 +00001770
drh5e2f8b92001-05-28 00:41:15 +00001771/*
1772** Move the cursor down to the left-most leaf entry beneath the
1773** entry to which it is currently pointing.
1774*/
1775static int moveToLeftmost(BtCursor *pCur){
1776 Pgno pgno;
1777 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001778 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00001779
drhc39e0002004-05-07 23:50:57 +00001780 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001781 while( !(pPage = pCur->pPage)->leaf ){
drha34b6762004-05-07 13:30:42 +00001782 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1783 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001784 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001785 if( rc ) return rc;
1786 }
1787 return SQLITE_OK;
1788}
1789
drh2dcc9aa2002-12-04 13:40:25 +00001790/*
1791** Move the cursor down to the right-most leaf entry beneath the
1792** page to which it is currently pointing. Notice the difference
1793** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1794** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1795** finds the right-most entry beneath the *page*.
1796*/
1797static int moveToRightmost(BtCursor *pCur){
1798 Pgno pgno;
1799 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001800 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001801
drhc39e0002004-05-07 23:50:57 +00001802 assert( pCur->isValid );
drh3aac2dd2004-04-26 14:10:20 +00001803 while( !(pPage = pCur->pPage)->leaf ){
1804 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
1805 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001806 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001807 if( rc ) return rc;
1808 }
drh3aac2dd2004-04-26 14:10:20 +00001809 pCur->idx = pPage->nCell - 1;
drh2dcc9aa2002-12-04 13:40:25 +00001810 return SQLITE_OK;
1811}
1812
drh5e00f6c2001-09-13 13:46:56 +00001813/* Move the cursor to the first entry in the table. Return SQLITE_OK
1814** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001815** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001816*/
drh3aac2dd2004-04-26 14:10:20 +00001817int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001818 int rc;
drhc39e0002004-05-07 23:50:57 +00001819 if( pCur->status ){
1820 return pCur->status;
1821 }
drh5e00f6c2001-09-13 13:46:56 +00001822 rc = moveToRoot(pCur);
1823 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001824 if( pCur->isValid==0 ){
1825 assert( pCur->pPage->nCell==0 );
drh5e00f6c2001-09-13 13:46:56 +00001826 *pRes = 1;
1827 return SQLITE_OK;
1828 }
drhc39e0002004-05-07 23:50:57 +00001829 assert( pCur->pPage->nCell>0 );
drh5e00f6c2001-09-13 13:46:56 +00001830 *pRes = 0;
1831 rc = moveToLeftmost(pCur);
1832 return rc;
1833}
drh5e2f8b92001-05-28 00:41:15 +00001834
drh9562b552002-02-19 15:00:07 +00001835/* Move the cursor to the last entry in the table. Return SQLITE_OK
1836** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001837** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001838*/
drh3aac2dd2004-04-26 14:10:20 +00001839int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001840 int rc;
drhc39e0002004-05-07 23:50:57 +00001841 if( pCur->status ){
1842 return pCur->status;
1843 }
drh9562b552002-02-19 15:00:07 +00001844 rc = moveToRoot(pCur);
1845 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001846 if( pCur->isValid==0 ){
1847 assert( pCur->pPage->nCell==0 );
drh9562b552002-02-19 15:00:07 +00001848 *pRes = 1;
1849 return SQLITE_OK;
1850 }
drhc39e0002004-05-07 23:50:57 +00001851 assert( pCur->isValid );
drh9562b552002-02-19 15:00:07 +00001852 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001853 rc = moveToRightmost(pCur);
drh9562b552002-02-19 15:00:07 +00001854 return rc;
1855}
1856
drh3aac2dd2004-04-26 14:10:20 +00001857/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00001858** Return a success code.
1859**
drh3aac2dd2004-04-26 14:10:20 +00001860** For INTKEY tables, only the nKey parameter is used. pKey is
1861** ignored. For other tables, nKey is the number of bytes of data
1862** in nKey. The comparison function specified when the cursor was
1863** created is used to compare keys.
1864**
drh5e2f8b92001-05-28 00:41:15 +00001865** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001866** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001867** were present. The cursor might point to an entry that comes
1868** before or after the key.
1869**
drhbd03cae2001-06-02 02:40:57 +00001870** The result of comparing the key with the entry to which the
1871** cursor is left pointing is stored in pCur->iMatch. The same
1872** value is also written to *pRes if pRes!=NULL. The meaning of
1873** this value is as follows:
1874**
1875** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001876** is smaller than pKey or if the table is empty
1877** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001878**
1879** *pRes==0 The cursor is left pointing at an entry that
1880** exactly matches pKey.
1881**
1882** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001883** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001884*/
drh3aac2dd2004-04-26 14:10:20 +00001885int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, u64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001886 int rc;
drhc39e0002004-05-07 23:50:57 +00001887
1888 if( pCur->status ){
1889 return pCur->status;
1890 }
drh5e2f8b92001-05-28 00:41:15 +00001891 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001892 if( rc ) return rc;
drhc39e0002004-05-07 23:50:57 +00001893 assert( pCur->pPage );
1894 assert( pCur->pPage->isInit );
1895 if( pCur->isValid==0 ){
1896 assert( pCur->pPage->nCell==0 );
1897 return SQLITE_OK;
1898 }
drh72f82862001-05-24 21:06:34 +00001899 for(;;){
1900 int lwr, upr;
1901 Pgno chldPg;
1902 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001903 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001904 lwr = 0;
1905 upr = pPage->nCell-1;
drhda200cc2004-05-09 11:51:38 +00001906 pageIntegrity(pPage);
drh72f82862001-05-24 21:06:34 +00001907 while( lwr<=upr ){
drh3aac2dd2004-04-26 14:10:20 +00001908 void *pCellKey;
1909 u64 nCellKey;
drh72f82862001-05-24 21:06:34 +00001910 pCur->idx = (lwr+upr)/2;
drhde647132004-05-07 17:57:49 +00001911 sqlite3BtreeKeySize(pCur, &nCellKey);
drh3aac2dd2004-04-26 14:10:20 +00001912 if( pPage->intKey ){
1913 if( nCellKey<nKey ){
1914 c = -1;
1915 }else if( nCellKey>nKey ){
1916 c = +1;
1917 }else{
1918 c = 0;
1919 }
1920 }else if( (pCellKey = sqlite3BtreeKeyFetch(pCur))!=0 ){
1921 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1922 }else{
1923 pCellKey = sqliteMalloc( nCellKey );
1924 if( pCellKey==0 ) return SQLITE_NOMEM;
1925 rc = sqlite3BtreeKey(pCur, 0, nCellKey, pCellKey);
1926 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1927 sqliteFree(pCellKey);
1928 if( rc ) return rc;
1929 }
drh72f82862001-05-24 21:06:34 +00001930 if( c==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001931 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001932 if( pRes ) *pRes = 0;
1933 return SQLITE_OK;
1934 }
1935 if( c<0 ){
1936 lwr = pCur->idx+1;
1937 }else{
1938 upr = pCur->idx-1;
1939 }
1940 }
1941 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00001942 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00001943 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00001944 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00001945 }else if( lwr>=pPage->nCell ){
1946 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh72f82862001-05-24 21:06:34 +00001947 }else{
drh3aac2dd2004-04-26 14:10:20 +00001948 chldPg = get4byte(&pPage->aCell[lwr][2]);
drh72f82862001-05-24 21:06:34 +00001949 }
1950 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001951 pCur->iMatch = c;
drhc39e0002004-05-07 23:50:57 +00001952 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001953 if( pRes ) *pRes = c;
1954 return SQLITE_OK;
1955 }
drh428ae8c2003-01-04 16:48:09 +00001956 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00001957 rc = moveToChild(pCur, chldPg);
drhc39e0002004-05-07 23:50:57 +00001958 if( rc ){
1959 return rc;
1960 }
drh72f82862001-05-24 21:06:34 +00001961 }
drhbd03cae2001-06-02 02:40:57 +00001962 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00001963}
1964
1965/*
drhc39e0002004-05-07 23:50:57 +00001966** Return TRUE if the cursor is not pointing at an entry of the table.
1967**
1968** TRUE will be returned after a call to sqlite3BtreeNext() moves
1969** past the last entry in the table or sqlite3BtreePrev() moves past
1970** the first entry. TRUE is also returned if the table is empty.
1971*/
1972int sqlite3BtreeEof(BtCursor *pCur){
1973 return pCur->isValid==0;
1974}
1975
1976/*
drhbd03cae2001-06-02 02:40:57 +00001977** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00001978** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00001979** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00001980** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00001981*/
drh3aac2dd2004-04-26 14:10:20 +00001982int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00001983 int rc;
drh8178a752003-01-05 21:41:40 +00001984 MemPage *pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001985 assert( pRes!=0 );
drhc39e0002004-05-07 23:50:57 +00001986 if( pCur->isValid==0 ){
drh8c1238a2003-01-02 14:43:55 +00001987 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00001988 return SQLITE_OK;
drhecdc7532001-09-23 02:35:53 +00001989 }
drh8178a752003-01-05 21:41:40 +00001990 assert( pPage->isInit );
drh8178a752003-01-05 21:41:40 +00001991 assert( pCur->idx<pPage->nCell );
drh72f82862001-05-24 21:06:34 +00001992 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00001993 if( pCur->idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00001994 if( !pPage->leaf ){
1995 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+6]));
drh5e2f8b92001-05-28 00:41:15 +00001996 if( rc ) return rc;
1997 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001998 *pRes = 0;
1999 return rc;
drh72f82862001-05-24 21:06:34 +00002000 }
drh5e2f8b92001-05-28 00:41:15 +00002001 do{
drh8856d6a2004-04-29 14:42:46 +00002002 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00002003 *pRes = 1;
drhc39e0002004-05-07 23:50:57 +00002004 pCur->isValid = 0;
drh5e2f8b92001-05-28 00:41:15 +00002005 return SQLITE_OK;
2006 }
drh8178a752003-01-05 21:41:40 +00002007 moveToParent(pCur);
2008 pPage = pCur->pPage;
2009 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00002010 *pRes = 0;
drh8178a752003-01-05 21:41:40 +00002011 return SQLITE_OK;
2012 }
2013 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00002014 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00002015 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00002016 }
drh5e2f8b92001-05-28 00:41:15 +00002017 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00002018 return rc;
drh72f82862001-05-24 21:06:34 +00002019}
2020
drh3b7511c2001-05-26 13:15:44 +00002021/*
drh2dcc9aa2002-12-04 13:40:25 +00002022** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00002023** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00002024** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00002025** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00002026*/
drh3aac2dd2004-04-26 14:10:20 +00002027int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00002028 int rc;
2029 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00002030 MemPage *pPage;
drhc39e0002004-05-07 23:50:57 +00002031 if( pCur->isValid==0 ){
2032 *pRes = 1;
2033 return SQLITE_OK;
2034 }
drh8178a752003-01-05 21:41:40 +00002035 pPage = pCur->pPage;
drh8178a752003-01-05 21:41:40 +00002036 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00002037 assert( pCur->idx>=0 );
drha34b6762004-05-07 13:30:42 +00002038 if( !pPage->leaf ){
drh3aac2dd2004-04-26 14:10:20 +00002039 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00002040 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00002041 if( rc ) return rc;
2042 rc = moveToRightmost(pCur);
2043 }else{
2044 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00002045 if( isRootPage(pPage) ){
drhc39e0002004-05-07 23:50:57 +00002046 pCur->isValid = 0;
2047 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00002048 return SQLITE_OK;
2049 }
drh8178a752003-01-05 21:41:40 +00002050 moveToParent(pCur);
2051 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00002052 }
2053 pCur->idx--;
2054 rc = SQLITE_OK;
2055 }
drh8178a752003-01-05 21:41:40 +00002056 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00002057 return rc;
2058}
2059
2060/*
drh3a4c1412004-05-09 20:40:11 +00002061** The TRACE macro will print high-level status information about the
2062** btree operation when the global variable sqlite3_btree_trace is
2063** enabled.
2064*/
2065#if SQLITE_TEST
2066# define TRACE(X) if( sqlite3_btree_trace ){ printf X; fflush(stdout); }
2067#else
2068# define TRACE(X)
2069#endif
2070int sqlite3_btree_trace=0; /* True to enable tracing */
2071
2072/*
drh3b7511c2001-05-26 13:15:44 +00002073** Allocate a new page from the database file.
2074**
drha34b6762004-05-07 13:30:42 +00002075** The new page is marked as dirty. (In other words, sqlite3pager_write()
drh3b7511c2001-05-26 13:15:44 +00002076** has already been called on the new page.) The new page has also
2077** been referenced and the calling routine is responsible for calling
drha34b6762004-05-07 13:30:42 +00002078** sqlite3pager_unref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00002079**
2080** SQLITE_OK is returned on success. Any other return value indicates
2081** an error. *ppPage and *pPgno are undefined in the event of an error.
drha34b6762004-05-07 13:30:42 +00002082** Do not invoke sqlite3pager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00002083**
drh199e3cf2002-07-18 11:01:47 +00002084** If the "nearby" parameter is not 0, then a (feeble) effort is made to
2085** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00002086** attempt to keep related pages close to each other in the database file,
2087** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00002088*/
drh199e3cf2002-07-18 11:01:47 +00002089static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drh3aac2dd2004-04-26 14:10:20 +00002090 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00002091 int rc;
drh3aac2dd2004-04-26 14:10:20 +00002092 int n; /* Number of pages on the freelist */
2093 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00002094
drh3aac2dd2004-04-26 14:10:20 +00002095 pPage1 = pBt->pPage1;
2096 n = get4byte(&pPage1->aData[36]);
2097 if( n>0 ){
drh91025292004-05-03 19:49:32 +00002098 /* There are pages on the freelist. Reuse one of those pages. */
drh3aac2dd2004-04-26 14:10:20 +00002099 MemPage *pTrunk;
drha34b6762004-05-07 13:30:42 +00002100 rc = sqlite3pager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00002101 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002102 put4byte(&pPage1->aData[36], n-1);
2103 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002104 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002105 rc = sqlite3pager_write(pTrunk->aData);
drh3b7511c2001-05-26 13:15:44 +00002106 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00002107 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002108 return rc;
2109 }
drh3aac2dd2004-04-26 14:10:20 +00002110 k = get4byte(&pTrunk->aData[4]);
2111 if( k==0 ){
2112 /* The trunk has no leaves. So extract the trunk page itself and
2113 ** use it as the newly allocated page */
drha34b6762004-05-07 13:30:42 +00002114 *pPgno = get4byte(&pPage1->aData[32]);
drh3aac2dd2004-04-26 14:10:20 +00002115 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
2116 *ppPage = pTrunk;
drh3a4c1412004-05-09 20:40:11 +00002117 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh30e58752002-03-02 20:41:57 +00002118 }else{
drh3aac2dd2004-04-26 14:10:20 +00002119 /* Extract a leaf from the trunk */
2120 int closest;
2121 unsigned char *aData = pTrunk->aData;
2122 if( nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00002123 int i, dist;
2124 closest = 0;
drh3aac2dd2004-04-26 14:10:20 +00002125 dist = get4byte(&aData[8]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002126 if( dist<0 ) dist = -dist;
drh3a4c1412004-05-09 20:40:11 +00002127 for(i=1; i<k; i++){
drh3aac2dd2004-04-26 14:10:20 +00002128 int d2 = get4byte(&aData[8+i*4]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002129 if( d2<0 ) d2 = -d2;
2130 if( d2<dist ) closest = i;
2131 }
2132 }else{
2133 closest = 0;
2134 }
drha34b6762004-05-07 13:30:42 +00002135 *pPgno = get4byte(&aData[8+closest*4]);
drh3a4c1412004-05-09 20:40:11 +00002136 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d: %d more free pages\n",
2137 *pPgno, closest+1, k, pTrunk->pgno, n-1));
drh9b171272004-05-08 02:03:22 +00002138 if( closest<k-1 ){
2139 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
2140 }
drh3a4c1412004-05-09 20:40:11 +00002141 put4byte(&aData[4], k-1);
drh3aac2dd2004-04-26 14:10:20 +00002142 rc = getPage(pBt, *pPgno, ppPage);
2143 releasePage(pTrunk);
drh30e58752002-03-02 20:41:57 +00002144 if( rc==SQLITE_OK ){
drh9b171272004-05-08 02:03:22 +00002145 sqlite3pager_dont_rollback((*ppPage)->aData);
drha34b6762004-05-07 13:30:42 +00002146 rc = sqlite3pager_write((*ppPage)->aData);
drh30e58752002-03-02 20:41:57 +00002147 }
2148 }
drh3b7511c2001-05-26 13:15:44 +00002149 }else{
drh3aac2dd2004-04-26 14:10:20 +00002150 /* There are no pages on the freelist, so create a new page at the
2151 ** end of the file */
drha34b6762004-05-07 13:30:42 +00002152 *pPgno = sqlite3pager_pagecount(pBt->pPager) + 1;
drh3aac2dd2004-04-26 14:10:20 +00002153 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00002154 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002155 rc = sqlite3pager_write((*ppPage)->aData);
drh3a4c1412004-05-09 20:40:11 +00002156 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00002157 }
2158 return rc;
2159}
2160
2161/*
drh3aac2dd2004-04-26 14:10:20 +00002162** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00002163**
drha34b6762004-05-07 13:30:42 +00002164** sqlite3pager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00002165*/
drh3aac2dd2004-04-26 14:10:20 +00002166static int freePage(MemPage *pPage){
2167 Btree *pBt = pPage->pBt;
2168 MemPage *pPage1 = pBt->pPage1;
2169 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00002170
drh3aac2dd2004-04-26 14:10:20 +00002171 /* Prepare the page for freeing */
2172 assert( pPage->pgno>1 );
2173 pPage->isInit = 0;
2174 releasePage(pPage->pParent);
2175 pPage->pParent = 0;
2176
drha34b6762004-05-07 13:30:42 +00002177 /* Increment the free page count on pPage1 */
2178 rc = sqlite3pager_write(pPage1->aData);
drh3aac2dd2004-04-26 14:10:20 +00002179 if( rc ) return rc;
2180 n = get4byte(&pPage1->aData[36]);
2181 put4byte(&pPage1->aData[36], n+1);
2182
2183 if( n==0 ){
2184 /* This is the first free page */
drhda200cc2004-05-09 11:51:38 +00002185 rc = sqlite3pager_write(pPage->aData);
2186 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002187 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00002188 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002189 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002190 }else{
2191 /* Other free pages already exist. Retrive the first trunk page
2192 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00002193 MemPage *pTrunk;
2194 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002195 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002196 k = get4byte(&pTrunk->aData[4]);
2197 if( k==pBt->pageSize/4 - 8 ){
2198 /* The trunk is full. Turn the page being freed into a new
2199 ** trunk page with no leaves. */
drha34b6762004-05-07 13:30:42 +00002200 rc = sqlite3pager_write(pPage->aData);
drh3aac2dd2004-04-26 14:10:20 +00002201 if( rc ) return rc;
2202 put4byte(pPage->aData, pTrunk->pgno);
2203 put4byte(&pPage->aData[4], 0);
2204 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002205 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
2206 pPage->pgno, pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002207 }else{
2208 /* Add the newly freed page as a leaf on the current trunk */
drha34b6762004-05-07 13:30:42 +00002209 rc = sqlite3pager_write(pTrunk->aData);
drh3aac2dd2004-04-26 14:10:20 +00002210 if( rc ) return rc;
2211 put4byte(&pTrunk->aData[4], k+1);
2212 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drha34b6762004-05-07 13:30:42 +00002213 sqlite3pager_dont_write(pBt->pPager, pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00002214 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00002215 }
2216 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002217 }
drh3b7511c2001-05-26 13:15:44 +00002218 return rc;
2219}
2220
2221/*
drh3aac2dd2004-04-26 14:10:20 +00002222** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00002223*/
drh3aac2dd2004-04-26 14:10:20 +00002224static int clearCell(MemPage *pPage, unsigned char *pCell){
2225 Btree *pBt = pPage->pBt;
drha34b6762004-05-07 13:30:42 +00002226 int rc, n, nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002227 u64 nData, nKey;
2228 Pgno ovflPgno;
drh3b7511c2001-05-26 13:15:44 +00002229
drh3aac2dd2004-04-26 14:10:20 +00002230 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
drha34b6762004-05-07 13:30:42 +00002231 assert( (nData&0x000000007fffffff)==nData );
2232 nPayload = (int)nData;
drh3aac2dd2004-04-26 14:10:20 +00002233 if( !pPage->intKey ){
2234 nPayload += nKey;
drh5e2f8b92001-05-28 00:41:15 +00002235 }
drh3aac2dd2004-04-26 14:10:20 +00002236 if( nPayload<=pBt->maxLocal ){
drha34b6762004-05-07 13:30:42 +00002237 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00002238 }
2239 ovflPgno = get4byte(&pCell[n+pBt->maxLocal]);
2240 while( ovflPgno!=0 ){
2241 MemPage *pOvfl;
2242 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00002243 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002244 ovflPgno = get4byte(pOvfl->aData);
drha34b6762004-05-07 13:30:42 +00002245 rc = freePage(pOvfl);
drhbd03cae2001-06-02 02:40:57 +00002246 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002247 sqlite3pager_unref(pOvfl->aData);
drh3b7511c2001-05-26 13:15:44 +00002248 }
drh5e2f8b92001-05-28 00:41:15 +00002249 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00002250}
2251
2252/*
drh91025292004-05-03 19:49:32 +00002253** Create the byte sequence used to represent a cell on page pPage
2254** and write that byte sequence into pCell[]. Overflow pages are
2255** allocated and filled in as necessary. The calling procedure
2256** is responsible for making sure sufficient space has been allocated
2257** for pCell[].
2258**
2259** Note that pCell does not necessary need to point to the pPage->aData
2260** area. pCell might point to some temporary storage. The cell will
2261** be constructed in this temporary area then copied into pPage->aData
2262** later.
drh3b7511c2001-05-26 13:15:44 +00002263*/
2264static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00002265 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00002266 unsigned char *pCell, /* Complete text of the cell */
drh3aac2dd2004-04-26 14:10:20 +00002267 const void *pKey, u64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00002268 const void *pData,int nData, /* The data */
2269 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00002270){
drh3b7511c2001-05-26 13:15:44 +00002271 int nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002272 const void *pSrc;
drha34b6762004-05-07 13:30:42 +00002273 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00002274 int spaceLeft;
2275 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00002276 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00002277 unsigned char *pPrior;
2278 unsigned char *pPayload;
2279 Btree *pBt = pPage->pBt;
2280 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00002281 int nHeader;
drh3b7511c2001-05-26 13:15:44 +00002282
drh91025292004-05-03 19:49:32 +00002283 /* Fill in the header. */
2284 nHeader = 2;
2285 if( !pPage->leaf ){
2286 nHeader += 4;
2287 }
2288 if( !pPage->zeroData ){
2289 nHeader += putVarint(&pCell[nHeader], nData);
2290 }
2291 nHeader += putVarint(&pCell[nHeader], nKey);
2292
2293 /* Fill in the payload */
2294 if( pPage->zeroData ){
2295 nData = 0;
2296 }
drh3aac2dd2004-04-26 14:10:20 +00002297 nPayload = nData;
2298 if( pPage->intKey ){
2299 pSrc = pData;
2300 nSrc = nData;
drh91025292004-05-03 19:49:32 +00002301 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00002302 }else{
2303 nPayload += nKey;
2304 pSrc = pKey;
2305 nSrc = nKey;
2306 }
drh4b70f112004-05-02 21:12:19 +00002307 if( nPayload>pBt->maxLocal ){
2308 *pnSize = nHeader + pBt->maxLocal + 4;
2309 }else{
2310 *pnSize = nHeader + nPayload;
2311 }
drh3aac2dd2004-04-26 14:10:20 +00002312 spaceLeft = pBt->maxLocal;
2313 pPayload = &pCell[nHeader];
2314 pPrior = &pPayload[pBt->maxLocal];
drh3b7511c2001-05-26 13:15:44 +00002315
drh3b7511c2001-05-26 13:15:44 +00002316 while( nPayload>0 ){
2317 if( spaceLeft==0 ){
drh3aac2dd2004-04-26 14:10:20 +00002318 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
drh3b7511c2001-05-26 13:15:44 +00002319 if( rc ){
drh9b171272004-05-08 02:03:22 +00002320 releasePage(pToRelease);
drh3aac2dd2004-04-26 14:10:20 +00002321 clearCell(pPage, pCell);
drh3b7511c2001-05-26 13:15:44 +00002322 return rc;
2323 }
drh3aac2dd2004-04-26 14:10:20 +00002324 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00002325 releasePage(pToRelease);
2326 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00002327 pPrior = pOvfl->aData;
2328 put4byte(pPrior, 0);
2329 pPayload = &pOvfl->aData[4];
2330 spaceLeft = pBt->pageSize - 4;
drh3b7511c2001-05-26 13:15:44 +00002331 }
2332 n = nPayload;
2333 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00002334 if( n>nSrc ) n = nSrc;
2335 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00002336 nPayload -= n;
drhde647132004-05-07 17:57:49 +00002337 pPayload += n;
drh9b171272004-05-08 02:03:22 +00002338 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00002339 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00002340 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00002341 if( nSrc==0 ){
2342 nSrc = nData;
2343 pSrc = pData;
2344 }
drhdd793422001-06-28 01:54:48 +00002345 }
drh9b171272004-05-08 02:03:22 +00002346 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00002347 return SQLITE_OK;
2348}
2349
2350/*
drhbd03cae2001-06-02 02:40:57 +00002351** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00002352** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00002353** pointer in the third argument.
2354*/
drh4b70f112004-05-02 21:12:19 +00002355static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00002356 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00002357 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00002358
drhdd793422001-06-28 01:54:48 +00002359 if( pgno==0 ) return;
drh4b70f112004-05-02 21:12:19 +00002360 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00002361 aData = sqlite3pager_lookup(pBt->pPager, pgno);
drhda200cc2004-05-09 11:51:38 +00002362 if( aData ){
2363 pThis = (MemPage*)&aData[pBt->pageSize];
2364 if( pThis->isInit ){
2365 if( pThis->pParent!=pNewParent ){
2366 if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
2367 pThis->pParent = pNewParent;
2368 if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
2369 }
2370 pThis->idxParent = idx;
drhdd793422001-06-28 01:54:48 +00002371 }
drha34b6762004-05-07 13:30:42 +00002372 sqlite3pager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00002373 }
2374}
2375
2376/*
drh4b70f112004-05-02 21:12:19 +00002377** Change the pParent pointer of all children of pPage to point back
2378** to pPage.
2379**
drhbd03cae2001-06-02 02:40:57 +00002380** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00002381** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00002382**
2383** This routine gets called after you memcpy() one page into
2384** another.
2385*/
drh4b70f112004-05-02 21:12:19 +00002386static void reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002387 int i;
drh4b70f112004-05-02 21:12:19 +00002388 Btree *pBt;
2389
drha34b6762004-05-07 13:30:42 +00002390 if( pPage->leaf ) return;
drh4b70f112004-05-02 21:12:19 +00002391 pBt = pPage->pBt;
drhbd03cae2001-06-02 02:40:57 +00002392 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002393 reparentPage(pBt, get4byte(&pPage->aCell[i][2]), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002394 }
drh4b70f112004-05-02 21:12:19 +00002395 reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+6]), pPage, i);
drh428ae8c2003-01-04 16:48:09 +00002396 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002397}
2398
2399/*
2400** Remove the i-th cell from pPage. This routine effects pPage only.
2401** The cell content is not freed or deallocated. It is assumed that
2402** the cell content has been copied someplace else. This routine just
2403** removes the reference to the cell from pPage.
2404**
2405** "sz" must be the number of bytes in the cell.
2406**
drhda200cc2004-05-09 11:51:38 +00002407** Try to maintain the integrity of the linked list of cells. But if
2408** the cell being inserted does not fit on the page, this will not be
2409** possible. If the linked list is not maintained, then just update
2410** pPage->aCell[] and set the pPage->needRelink flag so that we will
2411** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002412*/
drh4b70f112004-05-02 21:12:19 +00002413static void dropCell(MemPage *pPage, int idx, int sz){
drhde647132004-05-07 17:57:49 +00002414 int j, pc;
drhda200cc2004-05-09 11:51:38 +00002415 u8 *data;
drh8c42ca92001-06-22 19:15:00 +00002416 assert( idx>=0 && idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002417 assert( sz==cellSize(pPage, pPage->aCell[idx]) );
drha34b6762004-05-07 13:30:42 +00002418 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002419 assert( pPage->aCell[idx]>=pPage->aData );
drh3a4c1412004-05-09 20:40:11 +00002420 assert( pPage->aCell[idx]<=&pPage->aData[pPage->pBt->pageSize-sz] );
drhda200cc2004-05-09 11:51:38 +00002421 data = pPage->aData;
2422 pc = Addr(pPage->aCell[idx]) - Addr(data);
drhde647132004-05-07 17:57:49 +00002423 assert( pc>pPage->hdrOffset && pc+sz<=pPage->pBt->pageSize );
2424 freeSpace(pPage, pc, sz);
drh7c717f72001-06-24 20:39:41 +00002425 for(j=idx; j<pPage->nCell-1; j++){
drh4b70f112004-05-02 21:12:19 +00002426 pPage->aCell[j] = pPage->aCell[j+1];
drh14acc042001-06-10 19:56:58 +00002427 }
2428 pPage->nCell--;
drhda200cc2004-05-09 11:51:38 +00002429 if( !pPage->isOverfull && !pPage->needRelink ){
2430 u8 *pPrev;
2431 if( idx==0 ){
2432 pPrev = &data[pPage->hdrOffset+3];
2433 }else{
2434 pPrev = pPage->aCell[idx-1];
2435 }
2436 if( idx<pPage->nCell ){
2437 pc = Addr(pPage->aCell[idx]) - Addr(data);
2438 }else{
2439 pc = 0;
2440 }
2441 put2byte(pPrev, pc);
2442 pageIntegrity(pPage);
2443 }else{
2444 pPage->needRelink = 1;
2445 }
drh428ae8c2003-01-04 16:48:09 +00002446 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002447}
2448
2449/*
2450** Insert a new cell on pPage at cell index "i". pCell points to the
2451** content of the cell.
2452**
2453** If the cell content will fit on the page, then put it there. If it
drh24cd67e2004-05-10 16:18:47 +00002454** will not fit and pTemp is not NULL, then make a copy of the content
2455** into pTemp, set pPage->aCell[i] point to pTemp, and set pPage->isOverfull.
2456** If the content will not fit and pTemp is NULL, then make pPage->aCell[i]
2457** point to pCell and set pPage->isOverfull.
drh14acc042001-06-10 19:56:58 +00002458**
drhda200cc2004-05-09 11:51:38 +00002459** Try to maintain the integrity of the linked list of cells. But if
2460** the cell being inserted does not fit on the page, this will not be
2461** possible. If the linked list is not maintained, then just update
2462** pPage->aCell[] and set the pPage->needRelink flag so that we will
2463** know to rebuild the linked list later.
drh14acc042001-06-10 19:56:58 +00002464*/
drh24cd67e2004-05-10 16:18:47 +00002465static void insertCell(
2466 MemPage *pPage, /* Page into which we are copying */
2467 int i, /* Which cell on pPage to insert after */
2468 u8 *pCell, /* Text of the new cell to insert */
2469 int sz, /* Bytes of data in pCell */
2470 u8 *pTemp /* Temp storage space for pCell, if needed */
2471){
drh14acc042001-06-10 19:56:58 +00002472 int idx, j;
2473 assert( i>=0 && i<=pPage->nCell );
drha34b6762004-05-07 13:30:42 +00002474 assert( sz==cellSize(pPage, pCell) );
2475 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002476 idx = pPage->needRelink ? 0 : allocateSpace(pPage, sz);
drh4b70f112004-05-02 21:12:19 +00002477 resizeCellArray(pPage, pPage->nCell+1);
drh14acc042001-06-10 19:56:58 +00002478 for(j=pPage->nCell; j>i; j--){
drh4b70f112004-05-02 21:12:19 +00002479 pPage->aCell[j] = pPage->aCell[j-1];
drh14acc042001-06-10 19:56:58 +00002480 }
2481 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002482 if( idx<=0 ){
2483 pPage->isOverfull = 1;
drh24cd67e2004-05-10 16:18:47 +00002484 if( pTemp ){
2485 memcpy(pTemp, pCell, sz);
2486 }else{
2487 pTemp = pCell;
2488 }
2489 pPage->aCell[i] = pTemp;
drh14acc042001-06-10 19:56:58 +00002490 }else{
drhda200cc2004-05-09 11:51:38 +00002491 u8 *data = pPage->aData;
2492 memcpy(&data[idx], pCell, sz);
2493 pPage->aCell[i] = &data[idx];
2494 }
2495 if( !pPage->isOverfull && !pPage->needRelink ){
2496 u8 *pPrev;
2497 int pc;
2498 if( i==0 ){
2499 pPrev = &pPage->aData[pPage->hdrOffset+3];
2500 }else{
2501 pPrev = pPage->aCell[i-1];
2502 }
2503 pc = get2byte(pPrev);
2504 put2byte(pPrev, idx);
2505 put2byte(pPage->aCell[i], pc);
2506 pageIntegrity(pPage);
2507 }else{
2508 pPage->needRelink = 1;
drh14acc042001-06-10 19:56:58 +00002509 }
drh428ae8c2003-01-04 16:48:09 +00002510 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002511}
2512
2513/*
2514** Rebuild the linked list of cells on a page so that the cells
drh4b70f112004-05-02 21:12:19 +00002515** occur in the order specified by the pPage->aCell[] array.
drh8c42ca92001-06-22 19:15:00 +00002516** Invoke this routine once to repair damage after one or more
2517** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002518*/
drh4b70f112004-05-02 21:12:19 +00002519static void relinkCellList(MemPage *pPage){
2520 int i, idxFrom;
drha34b6762004-05-07 13:30:42 +00002521 assert( sqlite3pager_iswriteable(pPage->aData) );
drhda200cc2004-05-09 11:51:38 +00002522 if( !pPage->needRelink ) return;
drh4b70f112004-05-02 21:12:19 +00002523 idxFrom = pPage->hdrOffset+3;
drh14acc042001-06-10 19:56:58 +00002524 for(i=0; i<pPage->nCell; i++){
drhde647132004-05-07 17:57:49 +00002525 int idx = Addr(pPage->aCell[i]) - Addr(pPage->aData);
drh4b70f112004-05-02 21:12:19 +00002526 assert( idx>pPage->hdrOffset && idx<pPage->pBt->pageSize );
2527 put2byte(&pPage->aData[idxFrom], idx);
2528 idxFrom = idx;
drh14acc042001-06-10 19:56:58 +00002529 }
drh4b70f112004-05-02 21:12:19 +00002530 put2byte(&pPage->aData[idxFrom], 0);
drhda200cc2004-05-09 11:51:38 +00002531 pPage->needRelink = 0;
drh14acc042001-06-10 19:56:58 +00002532}
2533
2534/*
drhc8629a12004-05-08 20:07:40 +00002535** GCC does not define the offsetof() macro so we'll have to do it
2536** ourselves.
2537*/
2538#ifndef offsetof
2539#define offsetof(STRUCTURE,FIELD) ((int)((char*)&((STRUCTURE*)0)->FIELD))
2540#endif
2541
2542/*
drh91025292004-05-03 19:49:32 +00002543** Move the content of the page at pFrom over to pTo. The pFrom->aCell[]
drh4b70f112004-05-02 21:12:19 +00002544** pointers that point into pFrom->aData[] must be adjusted to point
2545** into pTo->aData[] instead. But some pFrom->aCell[] entries might
2546** not point to pFrom->aData[]. Those are unchanged.
drh91025292004-05-03 19:49:32 +00002547**
2548** Over this operation completes, the meta data for pFrom is zeroed.
drh14acc042001-06-10 19:56:58 +00002549*/
drhda200cc2004-05-09 11:51:38 +00002550static void movePage(MemPage *pTo, MemPage *pFrom){
drh14acc042001-06-10 19:56:58 +00002551 uptr from, to;
2552 int i;
drh4b70f112004-05-02 21:12:19 +00002553 int pageSize;
2554 int ofst;
2555
2556 assert( pTo->hdrOffset==0 );
drh3a4c1412004-05-09 20:40:11 +00002557 assert( pFrom->isInit );
drh4b70f112004-05-02 21:12:19 +00002558 ofst = pFrom->hdrOffset;
drhc8629a12004-05-08 20:07:40 +00002559 pageSize = pFrom->pBt->pageSize;
drh91025292004-05-03 19:49:32 +00002560 sqliteFree(pTo->aCell);
drhc8629a12004-05-08 20:07:40 +00002561 memcpy(pTo->aData, &pFrom->aData[ofst], pageSize - ofst);
2562 memcpy(pTo, pFrom, offsetof(MemPage, aData));
2563 pFrom->isInit = 0;
2564 pFrom->aCell = 0;
drh4b70f112004-05-02 21:12:19 +00002565 assert( pTo->aData[5]<155 );
2566 pTo->aData[5] += ofst;
drh14acc042001-06-10 19:56:58 +00002567 pTo->isOverfull = pFrom->isOverfull;
drh4b70f112004-05-02 21:12:19 +00002568 to = Addr(pTo->aData);
drh91025292004-05-03 19:49:32 +00002569 from = Addr(&pFrom->aData[ofst]);
drh14acc042001-06-10 19:56:58 +00002570 for(i=0; i<pTo->nCell; i++){
drh91025292004-05-03 19:49:32 +00002571 uptr x = Addr(pTo->aCell[i]);
2572 if( x>from && x<from+pageSize-ofst ){
drh4b70f112004-05-02 21:12:19 +00002573 *((uptr*)&pTo->aCell[i]) = x + to - from;
drh14acc042001-06-10 19:56:58 +00002574 }
2575 }
drhbd03cae2001-06-02 02:40:57 +00002576}
2577
2578/*
drhc3b70572003-01-04 19:44:07 +00002579** The following parameters determine how many adjacent pages get involved
2580** in a balancing operation. NN is the number of neighbors on either side
2581** of the page that participate in the balancing operation. NB is the
2582** total number of pages that participate, including the target page and
2583** NN neighbors on either side.
2584**
2585** The minimum value of NN is 1 (of course). Increasing NN above 1
2586** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2587** in exchange for a larger degradation in INSERT and UPDATE performance.
2588** The value of NN appears to give the best results overall.
2589*/
2590#define NN 1 /* Number of neighbors on either side of pPage */
2591#define NB (NN*2+1) /* Total pages involved in the balance */
2592
2593/*
drh8b2f49b2001-06-08 00:21:52 +00002594** This routine redistributes Cells on pPage and up to two siblings
2595** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002596** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002597** though both siblings might come from one side if pPage is the first
2598** or last child of its parent. If pPage has fewer than two siblings
2599** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002600** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002601**
2602** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002603** one in an effort to keep pages between 66% and 100% full. The root page
2604** is special and is allowed to be less than 66% full. If pPage is
2605** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002606** or decreased by one, as necessary, to keep the root page from being
2607** overfull or empty.
2608**
drh4b70f112004-05-02 21:12:19 +00002609** This routine alwyas calls relinkCellList() on its input page regardless of
drh14acc042001-06-10 19:56:58 +00002610** whether or not it does any real balancing. Client routines will typically
2611** invoke insertCell() or dropCell() before calling this routine, so we
2612** need to call relinkCellList() to clean up the mess that those other
2613** routines left behind.
2614**
drh8b2f49b2001-06-08 00:21:52 +00002615** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00002616** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002617** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00002618** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00002619**
drh8c42ca92001-06-22 19:15:00 +00002620** In the course of balancing the siblings of pPage, the parent of pPage
2621** might become overfull or underfull. If that happens, then this routine
2622** is called recursively on the parent.
2623**
drh5e00f6c2001-09-13 13:46:56 +00002624** If this routine fails for any reason, it might leave the database
2625** in a corrupted state. So if this routine fails, the database should
2626** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002627*/
drh4b70f112004-05-02 21:12:19 +00002628static int balance(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00002629 MemPage *pParent; /* The parent of pPage */
drh4b70f112004-05-02 21:12:19 +00002630 Btree *pBt; /* The whole database */
drha34b6762004-05-07 13:30:42 +00002631 int nCell; /* Number of cells in aCell[] */
drh8b2f49b2001-06-08 00:21:52 +00002632 int nOld; /* Number of pages in apOld[] */
2633 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002634 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002635 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00002636 int idx; /* Index of pPage in pParent->aCell[] */
2637 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00002638 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00002639 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
2640 int usableSpace; /* Bytes in pPage beyond the header */
2641 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00002642 int subtotal; /* Subtotal of bytes in cells on one page */
drhda200cc2004-05-09 11:51:38 +00002643 MemPage *extraUnref = 0; /* Unref this page if not zero */
drhc3b70572003-01-04 19:44:07 +00002644 MemPage *apOld[NB]; /* pPage and up to two siblings */
2645 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00002646 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drhc3b70572003-01-04 19:44:07 +00002647 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2648 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2649 int idxDiv[NB]; /* Indices of divider cells in pParent */
drh4b70f112004-05-02 21:12:19 +00002650 u8 *apDiv[NB]; /* Divider cells in pParent */
2651 u8 aTemp[NB][MX_CELL_SIZE]; /* Temporary holding area for apDiv[] */
drh24cd67e2004-05-10 16:18:47 +00002652 u8 aInsBuf[NB][MX_CELL_SIZE];/* Space to hold dividers cells during insert */
drha34b6762004-05-07 13:30:42 +00002653 int cntNew[NB+1]; /* Index in aCell[] of cell after i-th page */
drhc3b70572003-01-04 19:44:07 +00002654 int szNew[NB+1]; /* Combined size of cells place on i-th page */
drh4b70f112004-05-02 21:12:19 +00002655 u8 *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
drhc3b70572003-01-04 19:44:07 +00002656 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh4b70f112004-05-02 21:12:19 +00002657 u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)]; /* Space for apCopy[] */
drh8b2f49b2001-06-08 00:21:52 +00002658
drh14acc042001-06-10 19:56:58 +00002659 /*
2660 ** Return without doing any work if pPage is neither overfull nor
2661 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002662 */
drh3a4c1412004-05-09 20:40:11 +00002663 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00002664 assert( sqlite3pager_iswriteable(pPage->aData) );
drh4b70f112004-05-02 21:12:19 +00002665 pBt = pPage->pBt;
2666 if( !pPage->isOverfull && pPage->nFree<pBt->pageSize/2 && pPage->nCell>=2){
2667 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002668 return SQLITE_OK;
2669 }
2670
2671 /*
drh4b70f112004-05-02 21:12:19 +00002672 ** Find the parent of the page to be balanced. If there is no parent,
2673 ** it means this page is the root page and special rules apply.
drh8b2f49b2001-06-08 00:21:52 +00002674 */
drh14acc042001-06-10 19:56:58 +00002675 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002676 if( pParent==0 ){
2677 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002678 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002679 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002680 if( pPage->nCell==0 ){
drh8856d6a2004-04-29 14:42:46 +00002681 if( pPage->leaf ){
2682 /* The table is completely empty */
2683 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002684 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
drh8856d6a2004-04-29 14:42:46 +00002685 }else{
2686 /* The root page is empty but has one child. Transfer the
2687 ** information from that one child into the root page if it
drh3a4c1412004-05-09 20:40:11 +00002688 ** will fit. This reduces the depth of the tree by one.
drh8856d6a2004-04-29 14:42:46 +00002689 **
2690 ** If the root page is page 1, it has less space available than
drh4b70f112004-05-02 21:12:19 +00002691 ** its child (due to the 100 byte header that occurs at the beginning
2692 ** of the database fle), so it might not be able to hold all of the
2693 ** information currently contained in the child. If this is the
2694 ** case, then do not do the transfer. Leave page 1 empty except
2695 ** for the right-pointer to the child page. The child page becomes
2696 ** the virtual root of the tree.
drh8b2f49b2001-06-08 00:21:52 +00002697 */
drha34b6762004-05-07 13:30:42 +00002698 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+6]);
2699 assert( pgnoChild>0 && pgnoChild<=sqlite3pager_pagecount(pBt->pPager) );
drh8856d6a2004-04-29 14:42:46 +00002700 rc = getPage(pBt, pgnoChild, &pChild);
drh8b2f49b2001-06-08 00:21:52 +00002701 if( rc ) return rc;
drh8856d6a2004-04-29 14:42:46 +00002702 if( pPage->pgno==1 ){
drh4b70f112004-05-02 21:12:19 +00002703 rc = initPage(pChild, pPage);
drh8856d6a2004-04-29 14:42:46 +00002704 if( rc ) return rc;
2705 if( pChild->nFree>=100 ){
drh4b70f112004-05-02 21:12:19 +00002706 /* The child information will fit on the root page, so do the
2707 ** copy */
2708 zeroPage(pPage, pChild->aData[0]);
2709 resizeCellArray(pPage, pChild->nCell);
2710 for(i=0; i<pChild->nCell; i++){
2711 insertCell(pPage, i, pChild->aCell[i],
drh24cd67e2004-05-10 16:18:47 +00002712 cellSize(pChild, pChild->aCell[i]), 0);
drh4b70f112004-05-02 21:12:19 +00002713 }
2714 freePage(pChild);
drhda200cc2004-05-09 11:51:38 +00002715 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
drh4b70f112004-05-02 21:12:19 +00002716 }else{
2717 /* The child has more information that will fit on the root.
2718 ** The tree is already balanced. Do nothing. */
drhda200cc2004-05-09 11:51:38 +00002719 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
drh8856d6a2004-04-29 14:42:46 +00002720 }
2721 }else{
drh3a4c1412004-05-09 20:40:11 +00002722 memcpy(pPage->aData, pChild->aData, pBt->pageSize);
drh8856d6a2004-04-29 14:42:46 +00002723 pPage->isInit = 0;
drh4b70f112004-05-02 21:12:19 +00002724 pPage->pParent = 0;
2725 rc = initPage(pPage, 0);
drh8856d6a2004-04-29 14:42:46 +00002726 assert( rc==SQLITE_OK );
drh4b70f112004-05-02 21:12:19 +00002727 freePage(pChild);
drh3a4c1412004-05-09 20:40:11 +00002728 TRACE(("BALANCE: transfer child %d into root %d\n",
2729 pChild->pgno, pPage->pgno));
drh5edc3122001-09-13 21:53:09 +00002730 }
drh4b70f112004-05-02 21:12:19 +00002731 reparentChildPages(pPage);
2732 releasePage(pChild);
drh8b2f49b2001-06-08 00:21:52 +00002733 }
2734 return SQLITE_OK;
2735 }
drh14acc042001-06-10 19:56:58 +00002736 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002737 /* It is OK for the root page to be less than half full.
2738 */
drha34b6762004-05-07 13:30:42 +00002739 relinkCellList(pPage);
drh3a4c1412004-05-09 20:40:11 +00002740 TRACE(("BALANCE: root page %d is low - no changes\n", pPage->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002741 return SQLITE_OK;
2742 }
drh14acc042001-06-10 19:56:58 +00002743 /*
2744 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002745 ** When this happens, Create a new child page and copy the
2746 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002747 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002748 ** child. Then fall thru to the code below which will cause
2749 ** the overfull child page to be split.
2750 */
drh4b70f112004-05-02 21:12:19 +00002751 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
drh14acc042001-06-10 19:56:58 +00002752 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00002753 assert( sqlite3pager_iswriteable(pChild->aData) );
drhda200cc2004-05-09 11:51:38 +00002754 movePage(pChild, pPage);
drhc8629a12004-05-08 20:07:40 +00002755 assert( pChild->aData[0]==pPage->aData[pPage->hdrOffset] );
drh14acc042001-06-10 19:56:58 +00002756 pChild->pParent = pPage;
drh457f5012004-05-09 01:35:05 +00002757 sqlite3pager_ref(pPage->aData);
drhda200cc2004-05-09 11:51:38 +00002758 pChild->idxParent = 0;
drh14acc042001-06-10 19:56:58 +00002759 pChild->isOverfull = 1;
drhc8629a12004-05-08 20:07:40 +00002760 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
drh4b70f112004-05-02 21:12:19 +00002761 put4byte(&pPage->aData[pPage->hdrOffset+6], pChild->pgno);
drh8b2f49b2001-06-08 00:21:52 +00002762 pParent = pPage;
2763 pPage = pChild;
drhda200cc2004-05-09 11:51:38 +00002764 extraUnref = pChild;
drh3a4c1412004-05-09 20:40:11 +00002765 TRACE(("BALANCE: copy root %d into %d and balance %d\n",
2766 pParent->pgno, pPage->pgno, pPage->pgno));
2767 }else{
2768 TRACE(("BALANCE: begin page %d child of %d\n",
2769 pPage->pgno, pParent->pgno));
drh8b2f49b2001-06-08 00:21:52 +00002770 }
drha34b6762004-05-07 13:30:42 +00002771 rc = sqlite3pager_write(pParent->aData);
drh6019e162001-07-02 17:51:45 +00002772 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002773 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002774
drh8b2f49b2001-06-08 00:21:52 +00002775 /*
drh4b70f112004-05-02 21:12:19 +00002776 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00002777 ** to pPage. The "idx" variable is the index of that cell. If pPage
2778 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002779 */
drhbb49aba2003-01-04 18:53:27 +00002780 if( pParent->idxShift ){
drha34b6762004-05-07 13:30:42 +00002781 Pgno pgno;
drh4b70f112004-05-02 21:12:19 +00002782 pgno = pPage->pgno;
drha34b6762004-05-07 13:30:42 +00002783 assert( pgno==sqlite3pager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00002784 for(idx=0; idx<pParent->nCell; idx++){
drha34b6762004-05-07 13:30:42 +00002785 if( get4byte(&pParent->aCell[idx][2])==pgno ){
drhbb49aba2003-01-04 18:53:27 +00002786 break;
2787 }
drh8b2f49b2001-06-08 00:21:52 +00002788 }
drh4b70f112004-05-02 21:12:19 +00002789 assert( idx<pParent->nCell
2790 || get4byte(&pParent->aData[pParent->hdrOffset+6])==pgno );
drhbb49aba2003-01-04 18:53:27 +00002791 }else{
2792 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002793 }
drh8b2f49b2001-06-08 00:21:52 +00002794
2795 /*
drh14acc042001-06-10 19:56:58 +00002796 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002797 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002798 */
drh14acc042001-06-10 19:56:58 +00002799 nOld = nNew = 0;
drha34b6762004-05-07 13:30:42 +00002800 sqlite3pager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00002801
2802 /*
drh4b70f112004-05-02 21:12:19 +00002803 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002804 ** the siblings. An attempt is made to find NN siblings on either
2805 ** side of pPage. More siblings are taken from one side, however, if
2806 ** pPage there are fewer than NN siblings on the other side. If pParent
2807 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002808 */
drhc3b70572003-01-04 19:44:07 +00002809 nxDiv = idx - NN;
2810 if( nxDiv + NB > pParent->nCell ){
2811 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002812 }
drhc3b70572003-01-04 19:44:07 +00002813 if( nxDiv<0 ){
2814 nxDiv = 0;
2815 }
drh8b2f49b2001-06-08 00:21:52 +00002816 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002817 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002818 if( k<pParent->nCell ){
2819 idxDiv[i] = k;
drh4b70f112004-05-02 21:12:19 +00002820 apDiv[i] = pParent->aCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002821 nDiv++;
drha34b6762004-05-07 13:30:42 +00002822 assert( !pParent->leaf );
2823 pgnoOld[i] = get4byte(&apDiv[i][2]);
drh14acc042001-06-10 19:56:58 +00002824 }else if( k==pParent->nCell ){
drh4b70f112004-05-02 21:12:19 +00002825 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+6]);
drh14acc042001-06-10 19:56:58 +00002826 }else{
2827 break;
drh8b2f49b2001-06-08 00:21:52 +00002828 }
drhde647132004-05-07 17:57:49 +00002829 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002830 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002831 apOld[i]->idxParent = k;
drh91025292004-05-03 19:49:32 +00002832 apCopy[i] = 0;
2833 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00002834 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002835 }
2836
2837 /*
drh14acc042001-06-10 19:56:58 +00002838 ** Make copies of the content of pPage and its siblings into aOld[].
2839 ** The rest of this function will use data from the copies rather
2840 ** that the original pages since the original pages will be in the
2841 ** process of being overwritten.
2842 */
2843 for(i=0; i<nOld; i++){
drhc8629a12004-05-08 20:07:40 +00002844 MemPage *p = apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
2845 p->aData = &((u8*)p)[-pBt->pageSize];
drhda200cc2004-05-09 11:51:38 +00002846 p->aCell = 0;
2847 p->hdrOffset = 0;
2848 movePage(p, apOld[i]);
drh14acc042001-06-10 19:56:58 +00002849 }
2850
2851 /*
2852 ** Load pointers to all cells on sibling pages and the divider cells
2853 ** into the local apCell[] array. Make copies of the divider cells
2854 ** into aTemp[] and remove the the divider Cells from pParent.
drh4b70f112004-05-02 21:12:19 +00002855 **
2856 ** If the siblings are on leaf pages, then the child pointers of the
2857 ** divider cells are stripped from the cells before they are copied
2858 ** into aTemp[]. In this wall, all cells in apCell[] are without
2859 ** child pointers. If siblings are not leaves, then all cell in
2860 ** apCell[] include child pointers. Either way, all cells in apCell[]
2861 ** are alike.
drh8b2f49b2001-06-08 00:21:52 +00002862 */
2863 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00002864 leafCorrection = pPage->leaf*4;
drh8b2f49b2001-06-08 00:21:52 +00002865 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002866 MemPage *pOld = apCopy[i];
drh8b2f49b2001-06-08 00:21:52 +00002867 for(j=0; j<pOld->nCell; j++){
drh4b70f112004-05-02 21:12:19 +00002868 apCell[nCell] = pOld->aCell[j];
2869 szCell[nCell] = cellSize(pOld, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002870 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002871 }
2872 if( i<nOld-1 ){
drhda200cc2004-05-09 11:51:38 +00002873 szCell[nCell] = cellSize(pParent, apDiv[i]);
2874 memcpy(aTemp[i], apDiv[i], szCell[nCell]);
drh4b70f112004-05-02 21:12:19 +00002875 apCell[nCell] = &aTemp[i][leafCorrection];
2876 dropCell(pParent, nxDiv, szCell[nCell]);
drhda200cc2004-05-09 11:51:38 +00002877 szCell[nCell] -= leafCorrection;
2878 assert( get4byte(&aTemp[i][2])==pgnoOld[i] );
drh4b70f112004-05-02 21:12:19 +00002879 if( !pOld->leaf ){
2880 assert( leafCorrection==0 );
2881 /* The right pointer of the child page pOld becomes the left
2882 ** pointer of the divider cell */
2883 memcpy(&apCell[nCell][2], &pOld->aData[pOld->hdrOffset+6], 4);
2884 }else{
2885 assert( leafCorrection==4 );
2886 }
drh14acc042001-06-10 19:56:58 +00002887 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002888 }
2889 }
2890
2891 /*
drh6019e162001-07-02 17:51:45 +00002892 ** Figure out the number of pages needed to hold all nCell cells.
2893 ** Store this number in "k". Also compute szNew[] which is the total
2894 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00002895 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00002896 ** cntNew[k] should equal nCell.
2897 **
2898 ** This little patch of code is critical for keeping the tree
2899 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00002900 */
drh4b70f112004-05-02 21:12:19 +00002901 usableSpace = pBt->pageSize - 10 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00002902 for(subtotal=k=i=0; i<nCell; i++){
2903 subtotal += szCell[i];
drh4b70f112004-05-02 21:12:19 +00002904 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00002905 szNew[k] = subtotal - szCell[i];
2906 cntNew[k] = i;
2907 subtotal = 0;
2908 k++;
2909 }
2910 }
2911 szNew[k] = subtotal;
2912 cntNew[k] = nCell;
2913 k++;
2914 for(i=k-1; i>0; i--){
drh4b70f112004-05-02 21:12:19 +00002915 while( szNew[i]<usableSpace/2 ){
drh6019e162001-07-02 17:51:45 +00002916 cntNew[i-1]--;
2917 assert( cntNew[i-1]>0 );
2918 szNew[i] += szCell[cntNew[i-1]];
2919 szNew[i-1] -= szCell[cntNew[i-1]-1];
2920 }
2921 }
2922 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00002923
2924 /*
drh6b308672002-07-08 02:16:37 +00002925 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00002926 */
drh4b70f112004-05-02 21:12:19 +00002927 assert( pPage->pgno>1 );
2928 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00002929 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00002930 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00002931 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00002932 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00002933 pgnoNew[i] = pgnoOld[i];
2934 apOld[i] = 0;
drhda200cc2004-05-09 11:51:38 +00002935 sqlite3pager_write(pNew->aData);
drh6b308672002-07-08 02:16:37 +00002936 }else{
drhda200cc2004-05-09 11:51:38 +00002937 rc = allocatePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00002938 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00002939 apNew[i] = pNew;
drh6b308672002-07-08 02:16:37 +00002940 }
drh14acc042001-06-10 19:56:58 +00002941 nNew++;
drhda200cc2004-05-09 11:51:38 +00002942 zeroPage(pNew, pageFlags);
drh8b2f49b2001-06-08 00:21:52 +00002943 }
2944
drh6b308672002-07-08 02:16:37 +00002945 /* Free any old pages that were not reused as new pages.
2946 */
2947 while( i<nOld ){
drh4b70f112004-05-02 21:12:19 +00002948 rc = freePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002949 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00002950 releasePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002951 apOld[i] = 0;
2952 i++;
2953 }
2954
drh8b2f49b2001-06-08 00:21:52 +00002955 /*
drhf9ffac92002-03-02 19:00:31 +00002956 ** Put the new pages in accending order. This helps to
2957 ** keep entries in the disk file in order so that a scan
2958 ** of the table is a linear scan through the file. That
2959 ** in turn helps the operating system to deliver pages
2960 ** from the disk more rapidly.
2961 **
2962 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00002963 ** n is never more than NB (a small constant), that should
2964 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00002965 **
drhc3b70572003-01-04 19:44:07 +00002966 ** When NB==3, this one optimization makes the database
2967 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00002968 */
2969 for(i=0; i<k-1; i++){
2970 int minV = pgnoNew[i];
2971 int minI = i;
2972 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00002973 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00002974 minI = j;
2975 minV = pgnoNew[j];
2976 }
2977 }
2978 if( minI>i ){
2979 int t;
2980 MemPage *pT;
2981 t = pgnoNew[i];
2982 pT = apNew[i];
2983 pgnoNew[i] = pgnoNew[minI];
2984 apNew[i] = apNew[minI];
2985 pgnoNew[minI] = t;
2986 apNew[minI] = pT;
2987 }
2988 }
drh24cd67e2004-05-10 16:18:47 +00002989 TRACE(("BALANCE: old: %d %d %d new: %d %d %d %d\n",
2990 pgnoOld[0],
2991 nOld>=2 ? pgnoOld[1] : 0,
2992 nOld>=3 ? pgnoOld[2] : 0,
2993 pgnoNew[0],
2994 nNew>=2 ? pgnoNew[1] : 0,
2995 nNew>=3 ? pgnoNew[2] : 0,
2996 nNew>=4 ? pgnoNew[3] : 0));
2997
drhf9ffac92002-03-02 19:00:31 +00002998
2999 /*
drh14acc042001-06-10 19:56:58 +00003000 ** Evenly distribute the data in apCell[] across the new pages.
3001 ** Insert divider cells into pParent as necessary.
3002 */
3003 j = 0;
3004 for(i=0; i<nNew; i++){
3005 MemPage *pNew = apNew[i];
drh4b70f112004-05-02 21:12:19 +00003006 assert( pNew->pgno==pgnoNew[i] );
3007 resizeCellArray(pNew, cntNew[i] - j);
drh6019e162001-07-02 17:51:45 +00003008 while( j<cntNew[i] ){
3009 assert( pNew->nFree>=szCell[j] );
drh24cd67e2004-05-10 16:18:47 +00003010 insertCell(pNew, pNew->nCell, apCell[j], szCell[j], 0);
drh14acc042001-06-10 19:56:58 +00003011 j++;
3012 }
drh6019e162001-07-02 17:51:45 +00003013 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00003014 assert( !pNew->isOverfull );
drh4b70f112004-05-02 21:12:19 +00003015 relinkCellList(pNew);
drh14acc042001-06-10 19:56:58 +00003016 if( i<nNew-1 && j<nCell ){
drh4b70f112004-05-02 21:12:19 +00003017 u8 *pCell = apCell[j];
drh24cd67e2004-05-10 16:18:47 +00003018 u8 *pTemp;
drh4b70f112004-05-02 21:12:19 +00003019 if( !pNew->leaf ){
drh24cd67e2004-05-10 16:18:47 +00003020 memcpy(&pNew->aData[6], pCell+2, 4);
3021 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00003022 }else{
3023 pCell -= 4;
drh24cd67e2004-05-10 16:18:47 +00003024 pTemp = aInsBuf[i];
drh4b70f112004-05-02 21:12:19 +00003025 }
drh24cd67e2004-05-10 16:18:47 +00003026 insertCell(pParent, nxDiv, pCell, szCell[j]+leafCorrection, pTemp);
drh4b70f112004-05-02 21:12:19 +00003027 put4byte(&pParent->aCell[nxDiv][2], pNew->pgno);
drh14acc042001-06-10 19:56:58 +00003028 j++;
3029 nxDiv++;
3030 }
3031 }
drh6019e162001-07-02 17:51:45 +00003032 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00003033 if( (pageFlags & PTF_LEAF)==0 ){
3034 memcpy(&apNew[nNew-1]->aData[6], &apCopy[nOld-1]->aData[6], 4);
drh14acc042001-06-10 19:56:58 +00003035 }
drh4b70f112004-05-02 21:12:19 +00003036 if( nxDiv==pParent->nCell ){
3037 /* Right-most sibling is the right-most child of pParent */
3038 put4byte(&pParent->aData[pParent->hdrOffset+6], pgnoNew[nNew-1]);
3039 }else{
3040 /* Right-most sibling is the left child of the first entry in pParent
3041 ** past the right-most divider entry */
drha34b6762004-05-07 13:30:42 +00003042 put4byte(&pParent->aCell[nxDiv][2], pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00003043 }
3044
3045 /*
3046 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00003047 */
3048 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00003049 reparentChildPages(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003050 }
drh4b70f112004-05-02 21:12:19 +00003051 reparentChildPages(pParent);
drh8b2f49b2001-06-08 00:21:52 +00003052
3053 /*
drh3a4c1412004-05-09 20:40:11 +00003054 ** Balance the parent page. Note that the current page (pPage) might
3055 ** have been added to the freelist is it might no longer be initialized.
3056 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00003057 */
drhda200cc2004-05-09 11:51:38 +00003058 assert( pParent->isInit );
drh3a4c1412004-05-09 20:40:11 +00003059 /* assert( pPage->isInit ); // No! pPage might have been added to freelist */
3060 /* pageIntegrity(pPage); // No! pPage might have been added to freelist */
drh4b70f112004-05-02 21:12:19 +00003061 rc = balance(pParent);
drhda200cc2004-05-09 11:51:38 +00003062
drh8b2f49b2001-06-08 00:21:52 +00003063 /*
drh14acc042001-06-10 19:56:58 +00003064 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00003065 */
drh14acc042001-06-10 19:56:58 +00003066balance_cleanup:
drh8b2f49b2001-06-08 00:21:52 +00003067 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00003068 releasePage(apOld[i]);
3069 if( apCopy[i] ){
drh91025292004-05-03 19:49:32 +00003070 sqliteFree(apCopy[i]->aCell);
3071 }
drh8b2f49b2001-06-08 00:21:52 +00003072 }
drh14acc042001-06-10 19:56:58 +00003073 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00003074 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00003075 }
drh91025292004-05-03 19:49:32 +00003076 releasePage(pParent);
drhda200cc2004-05-09 11:51:38 +00003077 releasePage(extraUnref);
drh3a4c1412004-05-09 20:40:11 +00003078 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
3079 pPage->pgno, nOld, nNew, nCell));
drh8b2f49b2001-06-08 00:21:52 +00003080 return rc;
3081}
3082
3083/*
drhf74b8d92002-09-01 23:20:45 +00003084** This routine checks all cursors that point to the same table
3085** as pCur points to. If any of those cursors were opened with
3086** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
3087** cursors point to the same table were opened with wrFlag==1
3088** then this routine returns SQLITE_OK.
3089**
3090** In addition to checking for read-locks (where a read-lock
3091** means a cursor opened with wrFlag==0) this routine also moves
3092** all cursors other than pCur so that they are pointing to the
3093** first Cell on root page. This is necessary because an insert
3094** or delete might change the number of cells on a page or delete
3095** a page entirely and we do not want to leave any cursors
3096** pointing to non-existant pages or cells.
3097*/
3098static int checkReadLocks(BtCursor *pCur){
3099 BtCursor *p;
3100 assert( pCur->wrFlag );
3101 for(p=pCur->pShared; p!=pCur; p=p->pShared){
3102 assert( p );
3103 assert( p->pgnoRoot==pCur->pgnoRoot );
drha34b6762004-05-07 13:30:42 +00003104 assert( p->pPage->pgno==sqlite3pager_pagenumber(p->pPage->aData) );
drhf74b8d92002-09-01 23:20:45 +00003105 if( p->wrFlag==0 ) return SQLITE_LOCKED;
drh91025292004-05-03 19:49:32 +00003106 if( p->pPage->pgno!=p->pgnoRoot ){
drhf74b8d92002-09-01 23:20:45 +00003107 moveToRoot(p);
3108 }
3109 }
3110 return SQLITE_OK;
3111}
3112
3113/*
drh3b7511c2001-05-26 13:15:44 +00003114** Insert a new record into the BTree. The key is given by (pKey,nKey)
3115** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00003116** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00003117** is left pointing at a random location.
3118**
3119** For an INTKEY table, only the nKey value of the key is used. pKey is
3120** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00003121*/
drh3aac2dd2004-04-26 14:10:20 +00003122int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00003123 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4b70f112004-05-02 21:12:19 +00003124 const void *pKey, u64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00003125 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00003126){
drh3b7511c2001-05-26 13:15:44 +00003127 int rc;
3128 int loc;
drh14acc042001-06-10 19:56:58 +00003129 int szNew;
drh3b7511c2001-05-26 13:15:44 +00003130 MemPage *pPage;
3131 Btree *pBt = pCur->pBt;
drha34b6762004-05-07 13:30:42 +00003132 unsigned char *oldCell;
3133 unsigned char newCell[MX_CELL_SIZE];
drh3b7511c2001-05-26 13:15:44 +00003134
drhc39e0002004-05-07 23:50:57 +00003135 if( pCur->status ){
3136 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003137 }
drhf74b8d92002-09-01 23:20:45 +00003138 if( !pBt->inTrans || nKey+nData==0 ){
3139 /* Must start a transaction before doing an insert */
3140 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003141 }
drhf74b8d92002-09-01 23:20:45 +00003142 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00003143 if( !pCur->wrFlag ){
3144 return SQLITE_PERM; /* Cursor not open for writing */
3145 }
drhf74b8d92002-09-01 23:20:45 +00003146 if( checkReadLocks(pCur) ){
3147 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3148 }
drh3aac2dd2004-04-26 14:10:20 +00003149 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00003150 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00003151 pPage = pCur->pPage;
drh3a4c1412004-05-09 20:40:11 +00003152 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
3153 pCur->pgnoRoot, nKey, nData, pPage->pgno,
3154 loc==0 ? "overwrite" : "new entry"));
drh7aa128d2002-06-21 13:09:16 +00003155 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003156 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003157 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003158 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, &szNew);
drh3b7511c2001-05-26 13:15:44 +00003159 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003160 assert( szNew==cellSize(pPage, newCell) );
drh3a4c1412004-05-09 20:40:11 +00003161 assert( szNew<=sizeof(newCell) );
drh3b7511c2001-05-26 13:15:44 +00003162 if( loc==0 ){
drha34b6762004-05-07 13:30:42 +00003163 int szOld;
3164 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00003165 oldCell = pPage->aCell[pCur->idx];
3166 if( !pPage->leaf ){
3167 memcpy(&newCell[2], &oldCell[2], 4);
3168 }
3169 szOld = cellSize(pPage, oldCell);
3170 rc = clearCell(pPage, oldCell);
drh5e2f8b92001-05-28 00:41:15 +00003171 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003172 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00003173 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00003174 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00003175 pCur->idx++;
3176 }else{
drh4b70f112004-05-02 21:12:19 +00003177 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00003178 }
drh24cd67e2004-05-10 16:18:47 +00003179 insertCell(pPage, pCur->idx, newCell, szNew, 0);
drh4b70f112004-05-02 21:12:19 +00003180 rc = balance(pPage);
drh23e11ca2004-05-04 17:27:28 +00003181 /* sqlite3BtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
drh3fc190c2001-09-14 03:24:23 +00003182 /* fflush(stdout); */
drh4b70f112004-05-02 21:12:19 +00003183 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003184 return rc;
3185}
3186
3187/*
drh4b70f112004-05-02 21:12:19 +00003188** Delete the entry that the cursor is pointing to. The cursor
3189** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00003190*/
drh3aac2dd2004-04-26 14:10:20 +00003191int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00003192 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00003193 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00003194 int rc;
drh8c42ca92001-06-22 19:15:00 +00003195 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00003196 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00003197
drh7aa128d2002-06-21 13:09:16 +00003198 assert( pPage->isInit );
drhc39e0002004-05-07 23:50:57 +00003199 if( pCur->status ){
3200 return pCur->status; /* A rollback destroyed this cursor */
drhecdc7532001-09-23 02:35:53 +00003201 }
drhf74b8d92002-09-01 23:20:45 +00003202 if( !pBt->inTrans ){
3203 /* Must start a transaction before doing a delete */
3204 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003205 }
drhf74b8d92002-09-01 23:20:45 +00003206 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00003207 if( pCur->idx >= pPage->nCell ){
3208 return SQLITE_ERROR; /* The cursor is not pointing to anything */
3209 }
drhecdc7532001-09-23 02:35:53 +00003210 if( !pCur->wrFlag ){
3211 return SQLITE_PERM; /* Did not open this cursor for writing */
3212 }
drhf74b8d92002-09-01 23:20:45 +00003213 if( checkReadLocks(pCur) ){
3214 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
3215 }
drha34b6762004-05-07 13:30:42 +00003216 rc = sqlite3pager_write(pPage->aData);
drhbd03cae2001-06-02 02:40:57 +00003217 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003218 pCell = pPage->aCell[pCur->idx];
3219 if( !pPage->leaf ){
3220 pgnoChild = get4byte(&pCell[2]);
3221 }
3222 clearCell(pPage, pCell);
3223 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00003224 /*
drh5e00f6c2001-09-13 13:46:56 +00003225 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00003226 ** do something we will leave a hole on an internal page.
3227 ** We have to fill the hole by moving in a cell from a leaf. The
3228 ** next Cell after the one to be deleted is guaranteed to exist and
3229 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00003230 */
drh14acc042001-06-10 19:56:58 +00003231 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00003232 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00003233 int szNext;
drh8c1238a2003-01-02 14:43:55 +00003234 int notUsed;
drh24cd67e2004-05-10 16:18:47 +00003235 unsigned char tempCell[MX_CELL_SIZE];
drh14acc042001-06-10 19:56:58 +00003236 getTempCursor(pCur, &leafCur);
drh3aac2dd2004-04-26 14:10:20 +00003237 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00003238 if( rc!=SQLITE_OK ){
drh8a6ac0a2004-02-14 17:35:07 +00003239 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3240 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003241 }
drha34b6762004-05-07 13:30:42 +00003242 rc = sqlite3pager_write(leafCur.pPage->aData);
drh6019e162001-07-02 17:51:45 +00003243 if( rc ) return rc;
drh3a4c1412004-05-09 20:40:11 +00003244 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
3245 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003246 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3247 pNext = leafCur.pPage->aCell[leafCur.idx];
3248 szNext = cellSize(leafCur.pPage, pNext);
drh24cd67e2004-05-10 16:18:47 +00003249 assert( sizeof(tempCell)>=szNext+4 );
3250 insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell);
3251 put4byte(pPage->aCell[pCur->idx]+2, pgnoChild);
drh4b70f112004-05-02 21:12:19 +00003252 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003253 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003254 dropCell(leafCur.pPage, leafCur.idx, szNext);
3255 rc = balance(leafCur.pPage);
drh8c42ca92001-06-22 19:15:00 +00003256 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00003257 }else{
drh3a4c1412004-05-09 20:40:11 +00003258 TRACE(("DELETE: table=%d delete from leaf %d\n",
3259 pCur->pgnoRoot, pPage->pgno));
drh4b70f112004-05-02 21:12:19 +00003260 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3261 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003262 }
drh4b70f112004-05-02 21:12:19 +00003263 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003264 return rc;
drh3b7511c2001-05-26 13:15:44 +00003265}
drh8b2f49b2001-06-08 00:21:52 +00003266
3267/*
drhc6b52df2002-01-04 03:09:29 +00003268** Create a new BTree table. Write into *piTable the page
3269** number for the root page of the new table.
3270**
3271** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00003272** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00003273** are restricted to having a 4-byte integer key and arbitrary data and
3274** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00003275** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00003276*/
drh3aac2dd2004-04-26 14:10:20 +00003277int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
drh8b2f49b2001-06-08 00:21:52 +00003278 MemPage *pRoot;
3279 Pgno pgnoRoot;
3280 int rc;
3281 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003282 /* Must start a transaction first */
3283 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003284 }
drh5df72a52002-06-06 23:16:05 +00003285 if( pBt->readOnly ){
3286 return SQLITE_READONLY;
3287 }
drhda200cc2004-05-09 11:51:38 +00003288 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 1);
drh8b2f49b2001-06-08 00:21:52 +00003289 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003290 assert( sqlite3pager_iswriteable(pRoot->aData) );
drhde647132004-05-07 17:57:49 +00003291 zeroPage(pRoot, flags | PTF_LEAF);
drha34b6762004-05-07 13:30:42 +00003292 sqlite3pager_unref(pRoot->aData);
drh8b2f49b2001-06-08 00:21:52 +00003293 *piTable = (int)pgnoRoot;
3294 return SQLITE_OK;
3295}
3296
3297/*
3298** Erase the given database page and all its children. Return
3299** the page to the freelist.
3300*/
drh4b70f112004-05-02 21:12:19 +00003301static int clearDatabasePage(
3302 Btree *pBt, /* The BTree that contains the table */
3303 Pgno pgno, /* Page number to clear */
3304 MemPage *pParent, /* Parent page. NULL for the root */
3305 int freePageFlag /* Deallocate page if true */
3306){
drh8b2f49b2001-06-08 00:21:52 +00003307 MemPage *pPage;
3308 int rc;
drh4b70f112004-05-02 21:12:19 +00003309 unsigned char *pCell;
3310 int i;
drh8b2f49b2001-06-08 00:21:52 +00003311
drhde647132004-05-07 17:57:49 +00003312 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
drh8b2f49b2001-06-08 00:21:52 +00003313 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00003314 rc = sqlite3pager_write(pPage->aData);
drh6019e162001-07-02 17:51:45 +00003315 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003316 for(i=0; i<pPage->nCell; i++){
3317 pCell = pPage->aCell[i];
3318 if( !pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003319 rc = clearDatabasePage(pBt, get4byte(&pCell[2]), pPage->pParent, 1);
drh8b2f49b2001-06-08 00:21:52 +00003320 if( rc ) return rc;
3321 }
drh4b70f112004-05-02 21:12:19 +00003322 rc = clearCell(pPage, pCell);
drh8b2f49b2001-06-08 00:21:52 +00003323 if( rc ) return rc;
3324 }
drha34b6762004-05-07 13:30:42 +00003325 if( !pPage->leaf ){
3326 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[6]), pPage->pParent, 1);
drh2aa679f2001-06-25 02:11:07 +00003327 if( rc ) return rc;
3328 }
3329 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00003330 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003331 }else{
drh3a4c1412004-05-09 20:40:11 +00003332 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00003333 }
drh4b70f112004-05-02 21:12:19 +00003334 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003335 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003336}
3337
3338/*
3339** Delete all information from a single table in the database.
3340*/
drh3aac2dd2004-04-26 14:10:20 +00003341int sqlite3BtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003342 int rc;
drhf74b8d92002-09-01 23:20:45 +00003343 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003344 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003345 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003346 }
drhf74b8d92002-09-01 23:20:45 +00003347 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3348 if( pCur->pgnoRoot==(Pgno)iTable ){
3349 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3350 moveToRoot(pCur);
3351 }
drhecdc7532001-09-23 02:35:53 +00003352 }
drha34b6762004-05-07 13:30:42 +00003353 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
drh8b2f49b2001-06-08 00:21:52 +00003354 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003355 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00003356 }
drh8c42ca92001-06-22 19:15:00 +00003357 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003358}
3359
3360/*
3361** Erase all information in a table and add the root of the table to
3362** the freelist. Except, the root of the principle table (the one on
3363** page 2) is never added to the freelist.
3364*/
drh3aac2dd2004-04-26 14:10:20 +00003365int sqlite3BtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003366 int rc;
3367 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00003368 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003369 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003370 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003371 }
drhf74b8d92002-09-01 23:20:45 +00003372 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3373 if( pCur->pgnoRoot==(Pgno)iTable ){
3374 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
3375 }
drh5df72a52002-06-06 23:16:05 +00003376 }
drha34b6762004-05-07 13:30:42 +00003377 rc = getPage(pBt, (Pgno)iTable, &pPage);
drh2aa679f2001-06-25 02:11:07 +00003378 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003379 rc = sqlite3BtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00003380 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003381 if( iTable>1 ){
drha34b6762004-05-07 13:30:42 +00003382 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003383 }else{
drha34b6762004-05-07 13:30:42 +00003384 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
drh8b2f49b2001-06-08 00:21:52 +00003385 }
drh4b70f112004-05-02 21:12:19 +00003386 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00003387 return rc;
3388}
3389
drh001bbcb2003-03-19 03:14:00 +00003390
drh8b2f49b2001-06-08 00:21:52 +00003391/*
drh23e11ca2004-05-04 17:27:28 +00003392** Read the meta-information out of a database file. Meta[0]
3393** is the number of free pages currently in the database. Meta[1]
3394** through meta[15] are available for use by higher layers.
drh8b2f49b2001-06-08 00:21:52 +00003395*/
drh3aac2dd2004-04-26 14:10:20 +00003396int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00003397 int rc;
drh4b70f112004-05-02 21:12:19 +00003398 unsigned char *pP1;
drh8b2f49b2001-06-08 00:21:52 +00003399
drh23e11ca2004-05-04 17:27:28 +00003400 assert( idx>=0 && idx<=15 );
drha34b6762004-05-07 13:30:42 +00003401 rc = sqlite3pager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00003402 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003403 *pMeta = get4byte(&pP1[36 + idx*4]);
drha34b6762004-05-07 13:30:42 +00003404 sqlite3pager_unref(pP1);
drh8b2f49b2001-06-08 00:21:52 +00003405 return SQLITE_OK;
3406}
3407
3408/*
drh23e11ca2004-05-04 17:27:28 +00003409** Write meta-information back into the database. Meta[0] is
3410** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00003411*/
drh3aac2dd2004-04-26 14:10:20 +00003412int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
drh4b70f112004-05-02 21:12:19 +00003413 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00003414 int rc;
drh23e11ca2004-05-04 17:27:28 +00003415 assert( idx>=1 && idx<=15 );
drh8b2f49b2001-06-08 00:21:52 +00003416 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003417 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003418 }
drhde647132004-05-07 17:57:49 +00003419 assert( pBt->pPage1!=0 );
3420 pP1 = pBt->pPage1->aData;
drha34b6762004-05-07 13:30:42 +00003421 rc = sqlite3pager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00003422 if( rc ) return rc;
drh23e11ca2004-05-04 17:27:28 +00003423 put4byte(&pP1[36 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00003424 return SQLITE_OK;
3425}
drh8c42ca92001-06-22 19:15:00 +00003426
drh5eddca62001-06-30 21:53:53 +00003427/******************************************************************************
3428** The complete implementation of the BTree subsystem is above this line.
3429** All the code the follows is for testing and troubleshooting the BTree
3430** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003431******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003432
drh8c42ca92001-06-22 19:15:00 +00003433/*
3434** Print a disassembly of the given page on standard output. This routine
3435** is used for debugging and testing only.
3436*/
drhaaab5722002-02-19 13:39:21 +00003437#ifdef SQLITE_TEST
drh23e11ca2004-05-04 17:27:28 +00003438int sqlite3BtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003439 int rc;
3440 MemPage *pPage;
drhc8629a12004-05-08 20:07:40 +00003441 int i, j, c;
drh8c42ca92001-06-22 19:15:00 +00003442 int nFree;
3443 u16 idx;
drhab9f7f12004-05-08 10:56:11 +00003444 int hdr;
3445 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00003446 char range[20];
3447 unsigned char payload[20];
drhab9f7f12004-05-08 10:56:11 +00003448
drh4b70f112004-05-02 21:12:19 +00003449 rc = getPage(pBt, (Pgno)pgno, &pPage);
drh8c42ca92001-06-22 19:15:00 +00003450 if( rc ){
3451 return rc;
3452 }
drhab9f7f12004-05-08 10:56:11 +00003453 hdr = pPage->hdrOffset;
3454 data = pPage->aData;
drhc8629a12004-05-08 20:07:40 +00003455 c = data[hdr];
3456 pPage->intKey = (c & PTF_INTKEY)!=0;
3457 pPage->zeroData = (c & PTF_ZERODATA)!=0;
3458 pPage->leaf = (c & PTF_LEAF)!=0;
drhda200cc2004-05-09 11:51:38 +00003459 printf("PAGE %d: flags=0x%02x frag=%d parent=%d\n", pgno,
3460 data[hdr], data[hdr+5],
3461 (pPage->isInit && pPage->pParent) ? pPage->pParent->pgno : 0);
drh8c42ca92001-06-22 19:15:00 +00003462 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003463 assert( hdr == (pgno==1 ? 100 : 0) );
3464 idx = get2byte(&data[hdr+3]);
drh4b70f112004-05-02 21:12:19 +00003465 while( idx>0 && idx<=pBt->pageSize ){
3466 u64 nData, nKey;
3467 int nHeader;
3468 Pgno child;
drhab9f7f12004-05-08 10:56:11 +00003469 unsigned char *pCell = &data[idx];
drh4b70f112004-05-02 21:12:19 +00003470 int sz = cellSize(pPage, pCell);
drh8c42ca92001-06-22 19:15:00 +00003471 sprintf(range,"%d..%d", idx, idx+sz-1);
drh4b70f112004-05-02 21:12:19 +00003472 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3473 if( pPage->leaf ){
3474 child = 0;
3475 }else{
3476 child = get4byte(&pCell[2]);
3477 }
drha34b6762004-05-07 13:30:42 +00003478 sz = nData;
3479 if( !pPage->intKey ) sz += nKey;
drh8c42ca92001-06-22 19:15:00 +00003480 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
drha34b6762004-05-07 13:30:42 +00003481 memcpy(payload, &pCell[nHeader], sz);
drh8c42ca92001-06-22 19:15:00 +00003482 for(j=0; j<sz; j++){
3483 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3484 }
3485 payload[sz] = 0;
3486 printf(
drha34b6762004-05-07 13:30:42 +00003487 "cell %2d: i=%-10s chld=%-4d nk=%-4lld nd=%-4lld payload=%s\n",
3488 i, range, child, nKey, nData, payload
drh8c42ca92001-06-22 19:15:00 +00003489 );
drh4b70f112004-05-02 21:12:19 +00003490 if( pPage->isInit && pPage->aCell[i]!=pCell ){
3491 printf("**** aCell[%d] does not match on prior entry ****\n", i);
drh2aa679f2001-06-25 02:11:07 +00003492 }
drh7c717f72001-06-24 20:39:41 +00003493 i++;
drh4b70f112004-05-02 21:12:19 +00003494 idx = get2byte(pCell);
drh8c42ca92001-06-22 19:15:00 +00003495 }
3496 if( idx!=0 ){
3497 printf("ERROR: next cell index out of range: %d\n", idx);
3498 }
drh4b70f112004-05-02 21:12:19 +00003499 if( !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003500 printf("right_child: %d\n", get4byte(&data[6]));
drh4b70f112004-05-02 21:12:19 +00003501 }
drh8c42ca92001-06-22 19:15:00 +00003502 nFree = 0;
3503 i = 0;
drhab9f7f12004-05-08 10:56:11 +00003504 idx = get2byte(&data[hdr+1]);
drhc39e0002004-05-07 23:50:57 +00003505 while( idx>0 && idx<pPage->pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003506 int sz = get2byte(&data[idx+2]);
drh4b70f112004-05-02 21:12:19 +00003507 sprintf(range,"%d..%d", idx, idx+sz-1);
3508 nFree += sz;
drh8c42ca92001-06-22 19:15:00 +00003509 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00003510 i, range, sz, nFree);
drhab9f7f12004-05-08 10:56:11 +00003511 idx = get2byte(&data[idx]);
drh2aa679f2001-06-25 02:11:07 +00003512 i++;
drh8c42ca92001-06-22 19:15:00 +00003513 }
3514 if( idx!=0 ){
3515 printf("ERROR: next freeblock index out of range: %d\n", idx);
3516 }
drha34b6762004-05-07 13:30:42 +00003517 if( recursive && !pPage->leaf ){
drhab9f7f12004-05-08 10:56:11 +00003518 idx = get2byte(&data[hdr+3]);
drha34b6762004-05-07 13:30:42 +00003519 while( idx>0 && idx<pBt->pageSize ){
drhab9f7f12004-05-08 10:56:11 +00003520 unsigned char *pCell = &data[idx];
drha34b6762004-05-07 13:30:42 +00003521 sqlite3BtreePageDump(pBt, get4byte(&pCell[2]), 1);
3522 idx = get2byte(pCell);
drh6019e162001-07-02 17:51:45 +00003523 }
drhab9f7f12004-05-08 10:56:11 +00003524 sqlite3BtreePageDump(pBt, get4byte(&data[hdr+6]), 1);
drh6019e162001-07-02 17:51:45 +00003525 }
drhab9f7f12004-05-08 10:56:11 +00003526 sqlite3pager_unref(data);
drh8c42ca92001-06-22 19:15:00 +00003527 return SQLITE_OK;
3528}
drhaaab5722002-02-19 13:39:21 +00003529#endif
drh8c42ca92001-06-22 19:15:00 +00003530
drhaaab5722002-02-19 13:39:21 +00003531#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003532/*
drha34b6762004-05-07 13:30:42 +00003533** Return the flag byte at the beginning of the page that the cursor
3534** is currently pointing to.
3535*/
3536int sqlite3BtreeFlags(BtCursor *pCur){
3537 return pCur->pPage->aData[pCur->pPage->hdrOffset];
3538}
3539#endif
3540
3541#ifdef SQLITE_TEST
3542/*
drh2aa679f2001-06-25 02:11:07 +00003543** Fill aResult[] with information about the entry and page that the
3544** cursor is pointing to.
3545**
3546** aResult[0] = The page number
3547** aResult[1] = The entry number
3548** aResult[2] = Total number of entries on this page
3549** aResult[3] = Size of this entry
3550** aResult[4] = Number of free bytes on this page
3551** aResult[5] = Number of free blocks on the page
3552** aResult[6] = Page number of the left child of this entry
3553** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003554**
3555** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003556*/
drhda200cc2004-05-09 11:51:38 +00003557int sqlite3BtreeCursorInfo(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003558 int cnt, idx;
3559 MemPage *pPage = pCur->pPage;
drhda200cc2004-05-09 11:51:38 +00003560
3561 pageIntegrity(pPage);
drh4b70f112004-05-02 21:12:19 +00003562 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003563 aResult[0] = sqlite3pager_pagenumber(pPage->aData);
drh91025292004-05-03 19:49:32 +00003564 assert( aResult[0]==pPage->pgno );
drh8c42ca92001-06-22 19:15:00 +00003565 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003566 aResult[2] = pPage->nCell;
3567 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh4b70f112004-05-02 21:12:19 +00003568 aResult[3] = cellSize(pPage, pPage->aCell[pCur->idx]);
3569 aResult[6] = pPage->leaf ? 0 : get4byte(&pPage->aCell[pCur->idx][2]);
drh2aa679f2001-06-25 02:11:07 +00003570 }else{
3571 aResult[3] = 0;
3572 aResult[6] = 0;
3573 }
3574 aResult[4] = pPage->nFree;
3575 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00003576 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhc39e0002004-05-07 23:50:57 +00003577 while( idx>0 && idx<pPage->pBt->pageSize ){
drh2aa679f2001-06-25 02:11:07 +00003578 cnt++;
drh4b70f112004-05-02 21:12:19 +00003579 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003580 }
3581 aResult[5] = cnt;
drh4b70f112004-05-02 21:12:19 +00003582 aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh8c42ca92001-06-22 19:15:00 +00003583 return SQLITE_OK;
3584}
drhaaab5722002-02-19 13:39:21 +00003585#endif
drhdd793422001-06-28 01:54:48 +00003586
drhdd793422001-06-28 01:54:48 +00003587/*
drh5eddca62001-06-30 21:53:53 +00003588** Return the pager associated with a BTree. This routine is used for
3589** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003590*/
drh3aac2dd2004-04-26 14:10:20 +00003591Pager *sqlite3BtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003592 return pBt->pPager;
3593}
drh5eddca62001-06-30 21:53:53 +00003594
3595/*
3596** This structure is passed around through all the sanity checking routines
3597** in order to keep track of some global state information.
3598*/
drhaaab5722002-02-19 13:39:21 +00003599typedef struct IntegrityCk IntegrityCk;
3600struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003601 Btree *pBt; /* The tree being checked out */
3602 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3603 int nPage; /* Number of pages in the database */
3604 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00003605 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003606};
3607
3608/*
3609** Append a message to the error message string.
3610*/
drhaaab5722002-02-19 13:39:21 +00003611static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003612 if( pCheck->zErrMsg ){
3613 char *zOld = pCheck->zErrMsg;
3614 pCheck->zErrMsg = 0;
danielk19774adee202004-05-08 08:23:19 +00003615 sqlite3SetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003616 sqliteFree(zOld);
3617 }else{
danielk19774adee202004-05-08 08:23:19 +00003618 sqlite3SetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003619 }
3620}
3621
3622/*
3623** Add 1 to the reference count for page iPage. If this is the second
3624** reference to the page, add an error message to pCheck->zErrMsg.
3625** Return 1 if there are 2 ore more references to the page and 0 if
3626** if this is the first reference to the page.
3627**
3628** Also check that the page number is in bounds.
3629*/
drhaaab5722002-02-19 13:39:21 +00003630static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003631 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003632 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003633 char zBuf[100];
3634 sprintf(zBuf, "invalid page number %d", iPage);
3635 checkAppendMsg(pCheck, zContext, zBuf);
3636 return 1;
3637 }
3638 if( pCheck->anRef[iPage]==1 ){
3639 char zBuf[100];
3640 sprintf(zBuf, "2nd reference to page %d", iPage);
3641 checkAppendMsg(pCheck, zContext, zBuf);
3642 return 1;
3643 }
3644 return (pCheck->anRef[iPage]++)>1;
3645}
3646
3647/*
3648** Check the integrity of the freelist or of an overflow page list.
3649** Verify that the number of pages on the list is N.
3650*/
drh30e58752002-03-02 20:41:57 +00003651static void checkList(
3652 IntegrityCk *pCheck, /* Integrity checking context */
3653 int isFreeList, /* True for a freelist. False for overflow page list */
3654 int iPage, /* Page number for first page in the list */
3655 int N, /* Expected number of pages in the list */
3656 char *zContext /* Context for error messages */
3657){
3658 int i;
drh3a4c1412004-05-09 20:40:11 +00003659 int expected = N;
3660 int iFirst = iPage;
drh5eddca62001-06-30 21:53:53 +00003661 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003662 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00003663 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00003664 if( iPage<1 ){
drh3a4c1412004-05-09 20:40:11 +00003665 sprintf(zMsg, "%d of %d pages missing from overflow list starting at %d",
3666 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00003667 checkAppendMsg(pCheck, zContext, zMsg);
3668 break;
3669 }
3670 if( checkRef(pCheck, iPage, zContext) ) break;
drha34b6762004-05-07 13:30:42 +00003671 if( sqlite3pager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
drh5eddca62001-06-30 21:53:53 +00003672 sprintf(zMsg, "failed to get page %d", iPage);
3673 checkAppendMsg(pCheck, zContext, zMsg);
3674 break;
3675 }
drh30e58752002-03-02 20:41:57 +00003676 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00003677 int n = get4byte(&pOvfl[4]);
drh0d316a42002-08-11 20:10:47 +00003678 for(i=0; i<n; i++){
drh4b70f112004-05-02 21:12:19 +00003679 checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
drh30e58752002-03-02 20:41:57 +00003680 }
drh0d316a42002-08-11 20:10:47 +00003681 N -= n;
drh30e58752002-03-02 20:41:57 +00003682 }
drh4b70f112004-05-02 21:12:19 +00003683 iPage = get4byte(pOvfl);
drha34b6762004-05-07 13:30:42 +00003684 sqlite3pager_unref(pOvfl);
drh5eddca62001-06-30 21:53:53 +00003685 }
3686}
3687
3688/*
drh1bffb9c2002-02-03 17:37:36 +00003689** Return negative if zKey1<zKey2.
3690** Return zero if zKey1==zKey2.
3691** Return positive if zKey1>zKey2.
3692*/
3693static int keyCompare(
3694 const char *zKey1, int nKey1,
3695 const char *zKey2, int nKey2
3696){
3697 int min = nKey1>nKey2 ? nKey2 : nKey1;
3698 int c = memcmp(zKey1, zKey2, min);
3699 if( c==0 ){
3700 c = nKey1 - nKey2;
3701 }
3702 return c;
3703}
3704
3705/*
drh5eddca62001-06-30 21:53:53 +00003706** Do various sanity checks on a single page of a tree. Return
3707** the tree depth. Root pages return 0. Parents of root pages
3708** return 1, and so forth.
3709**
3710** These checks are done:
3711**
3712** 1. Make sure that cells and freeblocks do not overlap
3713** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00003714** NO 2. Make sure cell keys are in order.
3715** NO 3. Make sure no key is less than or equal to zLowerBound.
3716** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00003717** 5. Check the integrity of overflow pages.
3718** 6. Recursively call checkTreePage on all children.
3719** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003720** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003721** the root of the tree.
3722*/
3723static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003724 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003725 int iPage, /* Page number of the page to check */
3726 MemPage *pParent, /* Parent page */
3727 char *zParentContext, /* Parent context */
3728 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003729 int nLower, /* Number of characters in zLowerBound */
3730 char *zUpperBound, /* All keys should be less than this, if not NULL */
3731 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003732){
3733 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00003734 int i, rc, depth, d2, pgno, cnt;
3735 int hdr;
3736 u8 *data;
drh5eddca62001-06-30 21:53:53 +00003737 char *zKey1, *zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003738 int nKey1, nKey2;
drh5eddca62001-06-30 21:53:53 +00003739 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003740 Btree *pBt;
drhda200cc2004-05-09 11:51:38 +00003741 int maxLocal, pageSize;
drh5eddca62001-06-30 21:53:53 +00003742 char zMsg[100];
3743 char zContext[100];
drhda200cc2004-05-09 11:51:38 +00003744 char hit[MX_PAGE_SIZE];
drh5eddca62001-06-30 21:53:53 +00003745
3746 /* Check that the page exists
3747 */
drh0d316a42002-08-11 20:10:47 +00003748 cur.pBt = pBt = pCheck->pBt;
drhda200cc2004-05-09 11:51:38 +00003749 maxLocal = pBt->maxLocal;
3750 pageSize = pBt->pageSize;
drh5eddca62001-06-30 21:53:53 +00003751 if( iPage==0 ) return 0;
3752 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh4b70f112004-05-02 21:12:19 +00003753 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003754 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3755 checkAppendMsg(pCheck, zContext, zMsg);
3756 return 0;
3757 }
drh4b70f112004-05-02 21:12:19 +00003758 if( (rc = initPage(pPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003759 sprintf(zMsg, "initPage() returns error code %d", rc);
3760 checkAppendMsg(pCheck, zContext, zMsg);
drh91025292004-05-03 19:49:32 +00003761 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003762 return 0;
3763 }
3764
3765 /* Check out all the cells.
3766 */
3767 depth = 0;
drh5eddca62001-06-30 21:53:53 +00003768 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003769 for(i=0; i<pPage->nCell; i++){
drhda200cc2004-05-09 11:51:38 +00003770 u8 *pCell = pPage->aCell[i];
3771 u64 nKey, nData;
3772 int sz, nHeader;
drh5eddca62001-06-30 21:53:53 +00003773
3774 /* Check payload overflow pages
3775 */
drh3a4c1412004-05-09 20:40:11 +00003776 sprintf(zContext, "On tree page %d cell %d: ", iPage, i);
drhda200cc2004-05-09 11:51:38 +00003777 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3778 sz = nData;
3779 if( !pPage->intKey ) sz += nKey;
3780 if( sz>maxLocal ){
3781 int nPage = (sz - maxLocal + pageSize - 5)/(pageSize - 4);
3782 checkList(pCheck, 0, get4byte(&pCell[nHeader+maxLocal]),nPage,zContext);
drh5eddca62001-06-30 21:53:53 +00003783 }
3784
3785 /* Check sanity of left child page.
3786 */
drhda200cc2004-05-09 11:51:38 +00003787 if( !pPage->leaf ){
3788 pgno = get4byte(&pCell[2]);
3789 d2 = checkTreePage(pCheck,pgno,pPage,zContext,0,0,0,0);
3790 if( i>0 && d2!=depth ){
3791 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3792 }
3793 depth = d2;
drh5eddca62001-06-30 21:53:53 +00003794 }
drh5eddca62001-06-30 21:53:53 +00003795 }
drhda200cc2004-05-09 11:51:38 +00003796 if( !pPage->leaf ){
3797 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
3798 sprintf(zContext, "On page %d at right child: ", iPage);
3799 checkTreePage(pCheck, pgno, pPage, zContext,0,0,0,0);
3800 }
drh5eddca62001-06-30 21:53:53 +00003801
3802 /* Check for complete coverage of the page
3803 */
drhda200cc2004-05-09 11:51:38 +00003804 memset(hit, 0, pageSize);
3805 memset(hit, 1, pPage->hdrOffset+10-4*(pPage->leaf));
3806 data = pPage->aData;
3807 hdr = pPage->hdrOffset;
3808 for(cnt=0, i=get2byte(&data[hdr+3]); i>0 && i<pageSize && cnt<10000; cnt++){
3809 int size = cellSize(pPage, &data[i]);
drh5eddca62001-06-30 21:53:53 +00003810 int j;
drhda200cc2004-05-09 11:51:38 +00003811 for(j=i+size-1; j>=i; j--) hit[j]++;
3812 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003813 }
drhda200cc2004-05-09 11:51:38 +00003814 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<pageSize && cnt<10000; cnt++){
3815 int size = get2byte(&data[i+2]);
drh5eddca62001-06-30 21:53:53 +00003816 int j;
drhda200cc2004-05-09 11:51:38 +00003817 for(j=i+size-1; j>=i; j--) hit[j]++;
3818 i = get2byte(&data[i]);
drh5eddca62001-06-30 21:53:53 +00003819 }
drhda200cc2004-05-09 11:51:38 +00003820 for(i=cnt=0; i<pageSize; i++){
drh5eddca62001-06-30 21:53:53 +00003821 if( hit[i]==0 ){
drhda200cc2004-05-09 11:51:38 +00003822 cnt++;
drh5eddca62001-06-30 21:53:53 +00003823 }else if( hit[i]>1 ){
3824 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3825 checkAppendMsg(pCheck, zMsg, 0);
3826 break;
3827 }
3828 }
drhda200cc2004-05-09 11:51:38 +00003829 if( cnt!=data[hdr+5] ){
3830 sprintf(zMsg, "Fragmented space is %d byte reported as %d on page %d",
3831 cnt, data[hdr+5], iPage);
3832 checkAppendMsg(pCheck, zMsg, 0);
3833 }
drh6019e162001-07-02 17:51:45 +00003834
drh4b70f112004-05-02 21:12:19 +00003835 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00003836 return depth+1;
drh5eddca62001-06-30 21:53:53 +00003837}
3838
3839/*
3840** This routine does a complete check of the given BTree file. aRoot[] is
3841** an array of pages numbers were each page number is the root page of
3842** a table. nRoot is the number of entries in aRoot.
3843**
3844** If everything checks out, this routine returns NULL. If something is
3845** amiss, an error message is written into memory obtained from malloc()
3846** and a pointer to that error message is returned. The calling function
3847** is responsible for freeing the error message when it is done.
3848*/
drh3aac2dd2004-04-26 14:10:20 +00003849char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003850 int i;
3851 int nRef;
drhaaab5722002-02-19 13:39:21 +00003852 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003853
drha34b6762004-05-07 13:30:42 +00003854 nRef = *sqlite3pager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003855 if( lockBtree(pBt)!=SQLITE_OK ){
3856 return sqliteStrDup("Unable to acquire a read lock on the database");
3857 }
drh5eddca62001-06-30 21:53:53 +00003858 sCheck.pBt = pBt;
3859 sCheck.pPager = pBt->pPager;
drha34b6762004-05-07 13:30:42 +00003860 sCheck.nPage = sqlite3pager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003861 if( sCheck.nPage==0 ){
3862 unlockBtreeIfUnused(pBt);
3863 return 0;
3864 }
drh8c1238a2003-01-02 14:43:55 +00003865 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drhda200cc2004-05-09 11:51:38 +00003866 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh5eddca62001-06-30 21:53:53 +00003867 sCheck.zErrMsg = 0;
3868
3869 /* Check the integrity of the freelist
3870 */
drha34b6762004-05-07 13:30:42 +00003871 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
3872 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003873
3874 /* Check all the tables.
3875 */
3876 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00003877 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00003878 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00003879 }
3880
3881 /* Make sure every page in the file is referenced
3882 */
3883 for(i=1; i<=sCheck.nPage; i++){
3884 if( sCheck.anRef[i]==0 ){
3885 char zBuf[100];
3886 sprintf(zBuf, "Page %d is never used", i);
3887 checkAppendMsg(&sCheck, zBuf, 0);
3888 }
3889 }
3890
3891 /* Make sure this analysis did not leave any unref() pages
3892 */
drh5e00f6c2001-09-13 13:46:56 +00003893 unlockBtreeIfUnused(pBt);
drha34b6762004-05-07 13:30:42 +00003894 if( nRef != *sqlite3pager_stats(pBt->pPager) ){
drh5eddca62001-06-30 21:53:53 +00003895 char zBuf[100];
3896 sprintf(zBuf,
3897 "Outstanding page count goes from %d to %d during this analysis",
drha34b6762004-05-07 13:30:42 +00003898 nRef, *sqlite3pager_stats(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00003899 );
3900 checkAppendMsg(&sCheck, zBuf, 0);
3901 }
3902
3903 /* Clean up and report errors.
3904 */
3905 sqliteFree(sCheck.anRef);
3906 return sCheck.zErrMsg;
3907}
paulb95a8862003-04-01 21:16:41 +00003908
drh73509ee2003-04-06 20:44:45 +00003909/*
3910** Return the full pathname of the underlying database file.
3911*/
drh3aac2dd2004-04-26 14:10:20 +00003912const char *sqlite3BtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00003913 assert( pBt->pPager!=0 );
drha34b6762004-05-07 13:30:42 +00003914 return sqlite3pager_filename(pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00003915}
3916
3917/*
drhf7c57532003-04-25 13:22:51 +00003918** Copy the complete content of pBtFrom into pBtTo. A transaction
3919** must be active for both files.
3920**
3921** The size of file pBtFrom may be reduced by this operation.
3922** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00003923*/
drh3aac2dd2004-04-26 14:10:20 +00003924int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
drhf7c57532003-04-25 13:22:51 +00003925 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00003926 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00003927
3928 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
drhf7c57532003-04-25 13:22:51 +00003929 if( pBtTo->pCursor ) return SQLITE_BUSY;
drhc39e0002004-05-07 23:50:57 +00003930 memcpy(pBtTo->pPage1, pBtFrom->pPage1, pBtFrom->pageSize);
drha34b6762004-05-07 13:30:42 +00003931 rc = sqlite3pager_overwrite(pBtTo->pPager, 1, pBtFrom->pPage1);
3932 nToPage = sqlite3pager_pagecount(pBtTo->pPager);
3933 nPage = sqlite3pager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00003934 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00003935 void *pPage;
drha34b6762004-05-07 13:30:42 +00003936 rc = sqlite3pager_get(pBtFrom->pPager, i, &pPage);
drhf7c57532003-04-25 13:22:51 +00003937 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003938 rc = sqlite3pager_overwrite(pBtTo->pPager, i, pPage);
drh2e6d11b2003-04-25 15:37:57 +00003939 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003940 sqlite3pager_unref(pPage);
drhf7c57532003-04-25 13:22:51 +00003941 }
drh2e6d11b2003-04-25 15:37:57 +00003942 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3943 void *pPage;
drha34b6762004-05-07 13:30:42 +00003944 rc = sqlite3pager_get(pBtTo->pPager, i, &pPage);
drh2e6d11b2003-04-25 15:37:57 +00003945 if( rc ) break;
drha34b6762004-05-07 13:30:42 +00003946 rc = sqlite3pager_write(pPage);
3947 sqlite3pager_unref(pPage);
3948 sqlite3pager_dont_write(pBtTo->pPager, i);
drh2e6d11b2003-04-25 15:37:57 +00003949 }
3950 if( !rc && nPage<nToPage ){
drha34b6762004-05-07 13:30:42 +00003951 rc = sqlite3pager_truncate(pBtTo->pPager, nPage);
drh2e6d11b2003-04-25 15:37:57 +00003952 }
drhf7c57532003-04-25 13:22:51 +00003953 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003954 sqlite3BtreeRollback(pBtTo);
drhf7c57532003-04-25 13:22:51 +00003955 }
3956 return rc;
drh73509ee2003-04-06 20:44:45 +00003957}