blob: 252a433bd640664d103af1ff4eb59c306161a872 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drh4b70f112004-05-02 21:12:19 +000012** $Id: btree.c,v 1.107 2004/05/02 21:12:19 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
14** This file implements a external (disk-based) database using BTrees.
15** For a detailed discussion of BTrees, refer to
16**
17** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18** "Sorting And Searching", pages 473-480. Addison-Wesley
19** Publishing Company, Reading, Massachusetts.
20**
21** The basic idea is that each page of the file contains N database
22** entries and N+1 pointers to subpages.
23**
24** ----------------------------------------------------------------
25** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26** ----------------------------------------------------------------
27**
28** All of the keys on the page that Ptr(0) points to have values less
29** than Key(0). All of the keys on page Ptr(1) and its subpages have
30** values greater than Key(0) and less than Key(1). All of the keys
31** on Ptr(N+1) and its subpages have values greater than Key(N). And
32** so forth.
33**
drh5e00f6c2001-09-13 13:46:56 +000034** Finding a particular key requires reading O(log(M)) pages from the
35** disk where M is the number of entries in the tree.
drh8b2f49b2001-06-08 00:21:52 +000036**
37** In this implementation, a single file can hold one or more separate
38** BTrees. Each BTree is identified by the index of its root page. The
drh9e572e62004-04-23 23:43:10 +000039** key and data for any entry are combined to form the "payload". A
40** fixed amount of payload can be carried directly on the database
41** page. If the payload is larger than the preset amount then surplus
42** bytes are stored on overflow pages. The payload for an entry
43** and the preceding pointer are combined to form a "Cell". Each
44** page has a small header which contains the Ptr(N+1) pointer and other
45** information such as the size of key and data.
drh8b2f49b2001-06-08 00:21:52 +000046**
drh9e572e62004-04-23 23:43:10 +000047** FORMAT DETAILS
48**
49** The file is divided into pages. The first page is called page 1,
50** the second is page 2, and so forth. A page number of zero indicates
51** "no such page". The page size can be anything between 512 and 65536.
52** Each page can be either a btree page, a freelist page or an overflow
53** page.
54**
55** The first page is always a btree page. The first 100 bytes of the first
56** page contain a special header that describes the file. The format
57** of that header is as follows:
58**
59** OFFSET SIZE DESCRIPTION
60** 0 16 Header string: "SQLite version 3"
61** 16 2 Page size in bytes.
62** 18 1 File format write version
63** 19 1 File format read version
64** 20 2 Bytes of unused space at the end of each page
65** 22 2 Maximum allowed local payload per entry
66** 24 8 File change counter
67** 32 4 First freelist page
68** 36 4 Number of freelist pages in the file
69** 40 60 15 4-byte meta values passed to higher layers
70**
71** All of the integer values are big-endian (most significant byte first).
72** The file change counter is incremented every time the database is changed.
73** This allows other processes to know when the file has changed and thus
74** when they need to flush their cache.
75**
76** Each btree page begins with a header described below. Note that the
77** header for page one begins at byte 100. For all other btree pages, the
78** header begins on byte zero.
79**
80** OFFSET SIZE DESCRIPTION
81** 0 1 Flags. 01: leaf, 02: zerodata, 04: intkey, F8: type
82** 1 2 byte offset to the first freeblock
83** 3 2 byte offset to the first cell
84** 5 1 number of fragmented free bytes
85** 6 4 Right child (the Ptr(N+1) value). Omitted if leaf
86**
87** The flags define the format of this btree page. The leaf flag means that
88** this page has no children. The zerodata flag means that this page carries
89** only keys and no data. The intkey flag means that the key is a single
90** variable length integer at the beginning of the payload.
91**
92** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
93** byte are used. The integer consists of all bytes that have bit 8 set and
94** the first byte with bit 8 clear. Unlike fixed-length values, variable-
95** length integers are little-endian. Examples:
96**
97** 0x00 becomes 0x00000000
98** 0x1b becomes 0x0000001b
99** 0x9b 0x4a becomes 0x00000dca
100** 0x80 0x1b becomes 0x0000001b
101** 0xf8 0xac 0xb1 0x91 0x01 becomes 0x12345678
102** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
103**
104** Variable length integers are used for rowids and to hold the number of
105** bytes of key and data in a btree cell.
106**
107** Unused space within a btree page is collected into a linked list of
108** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
109** to the first freeblock is given in the header. Freeblocks occur in
110** increasing order. Because a freeblock is 4 bytes in size, the minimum
111** size allocation on a btree page is 4 bytes. Because a freeblock must be
112** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
113** exist on the freeblock chain. The total number of such fragmented bytes
114** is recorded in the page header at offset 5.
115**
116** SIZE DESCRIPTION
117** 2 Byte offset of the next freeblock
118** 2 Bytes in this freeblock
119**
120** Cells are of variable length. The first cell begins on the byte defined
121** in the page header. Cells do not necessarily occur in order - they can
122** skip around on the page.
123**
124** SIZE DESCRIPTION
125** 2 Byte offset of the next cell. 0 if this is the last cell
drh3aac2dd2004-04-26 14:10:20 +0000126** 4 Page number of the left child. Omitted if leaf flag is set.
127** var Number of bytes of data. Omitted if the zerodata flag is set.
128** var Number of bytes of key. Or the key itself if intkey flag is set.
drh9e572e62004-04-23 23:43:10 +0000129** * Payload
130** 4 First page of the overflow chain. Omitted if no overflow
131**
132** Overflow pages form a linked list. Each page except the last is completely
133** filled with data (pagesize - 4 bytes). The last page can have as little
134** as 1 byte of data.
135**
136** SIZE DESCRIPTION
137** 4 Page number of next overflow page
138** * Data
139**
140** Freelist pages come in two subtypes: trunk pages and leaf pages. The
141** file header points to first in a linked list of trunk page. Each trunk
142** page points to multiple leaf pages. The content of a leaf page is
143** unspecified. A trunk page looks like this:
144**
145** SIZE DESCRIPTION
146** 4 Page number of next trunk page
147** 4 Number of leaf pointers on this page
148** * zero or more pages numbers of leaves
drha059ad02001-04-17 20:09:11 +0000149*/
150#include "sqliteInt.h"
151#include "pager.h"
152#include "btree.h"
153#include <assert.h>
154
drh4b70f112004-05-02 21:12:19 +0000155
156/* Maximum page size. The upper bound on this value is 65536 (a limit
157** imposed by the 2-byte offset at the beginning of each cell.) The
158** maximum page size determines the amount of stack space allocated
159** by many of the routines in this module. On embedded architectures
160** or any machine where memory and especially stack memory is limited,
161** one may wish to chose a smaller value for the maximum page size.
162*/
163#ifndef MX_PAGE_SIZE
164# define MX_PAGE_SIZE 1024
165#endif
166
167/* Individual entries or "cells" are limited in size so that at least
168** this many cells will fit on one page. Changing this value will result
169** in an incompatible database.
170*/
171#define MN_CELLS_PER_PAGE 4
172
173/* The following value is the maximum cell size assuming a maximum page
174** size give above.
175*/
176#define MX_CELL_SIZE ((MX_PAGE_SIZE-10)/MN_CELLS_PER_PAGE)
177
178/* The maximum number of cells on a single page of the database. This
179** assumes a minimum cell size of 3 bytes. Such small cells will be
180** exceedingly rare, but they are possible.
181*/
182#define MX_CELL ((MX_PAGE_SIZE-10)/3)
183
paulb95a8862003-04-01 21:16:41 +0000184/* Forward declarations */
drh3aac2dd2004-04-26 14:10:20 +0000185typedef struct MemPage MemPage;
paulb95a8862003-04-01 21:16:41 +0000186
drh8c42ca92001-06-22 19:15:00 +0000187/*
drhbd03cae2001-06-02 02:40:57 +0000188** This is a magic string that appears at the beginning of every
drh8c42ca92001-06-22 19:15:00 +0000189** SQLite database in order to identify the file as a real database.
drh9e572e62004-04-23 23:43:10 +0000190** 0123456789 123456 */
191static const char zMagicHeader[] = "SQLite version 3";
drh08ed44e2001-04-29 23:32:55 +0000192
193/*
drh4b70f112004-05-02 21:12:19 +0000194** Page type flags. An ORed combination of these flags appear as the
195** first byte of every BTree page.
drh8c42ca92001-06-22 19:15:00 +0000196*/
drh9e572e62004-04-23 23:43:10 +0000197#define PTF_LEAF 0x01
198#define PTF_ZERODATA 0x02
199#define PTF_INTKEY 0x04
drh4b70f112004-05-02 21:12:19 +0000200/* Idea for the future: PTF_LEAFDATA */
drh8c42ca92001-06-22 19:15:00 +0000201
202/*
drh9e572e62004-04-23 23:43:10 +0000203** As each page of the file is loaded into memory, an instance of the following
204** structure is appended and initialized to zero. This structure stores
205** information about the page that is decoded from the raw file page.
drh14acc042001-06-10 19:56:58 +0000206**
drh72f82862001-05-24 21:06:34 +0000207** The pParent field points back to the parent page. This allows us to
208** walk up the BTree from any leaf to the root. Care must be taken to
209** unref() the parent page pointer when this page is no longer referenced.
drhbd03cae2001-06-02 02:40:57 +0000210** The pageDestructor() routine handles that chore.
drh7e3b0a02001-04-28 16:52:40 +0000211*/
212struct MemPage {
drh3aac2dd2004-04-26 14:10:20 +0000213 struct Btree *pBt; /* Pointer back to BTree structure */
drh9e572e62004-04-23 23:43:10 +0000214 unsigned char *aData; /* Pointer back to the start of the page */
drh3aac2dd2004-04-26 14:10:20 +0000215 u8 isInit; /* True if previously initialized */
drh9e572e62004-04-23 23:43:10 +0000216 u8 idxShift; /* True if Cell indices have changed */
drh3aac2dd2004-04-26 14:10:20 +0000217 u8 isOverfull; /* Some aCell[] do not fit on page */
drh9e572e62004-04-23 23:43:10 +0000218 u8 intKey; /* True if intkey flag is set */
219 u8 leaf; /* True if leaf flag is set */
220 u8 zeroData; /* True if zero data flag is set */
221 u8 hdrOffset; /* 100 for page 1. 0 otherwise */
222 Pgno pgno; /* Page number for this page */
drh72f82862001-05-24 21:06:34 +0000223 MemPage *pParent; /* The parent of this page. NULL for root */
drh3aac2dd2004-04-26 14:10:20 +0000224 int idxParent; /* Index in pParent->aCell[] of this node */
drh9e572e62004-04-23 23:43:10 +0000225 int nFree; /* Number of free bytes on the page */
drh306dc212001-05-21 13:45:10 +0000226 int nCell; /* Number of entries on this page */
drh4b70f112004-05-02 21:12:19 +0000227 int nCellAlloc; /* Number of slots allocated in aCell[] */
drh9e572e62004-04-23 23:43:10 +0000228 unsigned char **aCell; /* Pointer to start of each cell */
drh8c42ca92001-06-22 19:15:00 +0000229};
drh7e3b0a02001-04-28 16:52:40 +0000230
231/*
drh3b7511c2001-05-26 13:15:44 +0000232** The in-memory image of a disk page has the auxiliary information appended
233** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
234** that extra information.
235*/
drh3aac2dd2004-04-26 14:10:20 +0000236#define EXTRA_SIZE sizeof(MemPage)
drh3b7511c2001-05-26 13:15:44 +0000237
238/*
drha059ad02001-04-17 20:09:11 +0000239** Everything we need to know about an open database
240*/
241struct Btree {
242 Pager *pPager; /* The page cache */
drh306dc212001-05-21 13:45:10 +0000243 BtCursor *pCursor; /* A list of all open cursors */
drh3aac2dd2004-04-26 14:10:20 +0000244 MemPage *pPage1; /* First page of the database */
drh663fc632002-02-02 18:49:19 +0000245 u8 inTrans; /* True if a transaction is in progress */
drh3aac2dd2004-04-26 14:10:20 +0000246 u8 inStmt; /* True if there is a checkpoint on the transaction */
drh5df72a52002-06-06 23:16:05 +0000247 u8 readOnly; /* True if the underlying file is readonly */
drh9e572e62004-04-23 23:43:10 +0000248 int pageSize; /* Number of usable bytes on each page */
249 int maxLocal; /* Maximum local payload */
drha059ad02001-04-17 20:09:11 +0000250};
251typedef Btree Bt;
252
drh365d68f2001-05-11 11:02:46 +0000253/*
254** A cursor is a pointer to a particular entry in the BTree.
255** The entry is identified by its MemPage and the index in
drh5e2f8b92001-05-28 00:41:15 +0000256** MemPage.apCell[] of the entry.
drh365d68f2001-05-11 11:02:46 +0000257*/
drh72f82862001-05-24 21:06:34 +0000258struct BtCursor {
drh5e2f8b92001-05-28 00:41:15 +0000259 Btree *pBt; /* The Btree to which this cursor belongs */
drh14acc042001-06-10 19:56:58 +0000260 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
drhf74b8d92002-09-01 23:20:45 +0000261 BtCursor *pShared; /* Loop of cursors with the same root page */
drh3aac2dd2004-04-26 14:10:20 +0000262 int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
263 void *pArg; /* First arg to xCompare() */
drh8b2f49b2001-06-08 00:21:52 +0000264 Pgno pgnoRoot; /* The root page of this tree */
drh5e2f8b92001-05-28 00:41:15 +0000265 MemPage *pPage; /* Page that contains the entry */
drh3aac2dd2004-04-26 14:10:20 +0000266 int idx; /* Index of the entry in pPage->aCell[] */
drhecdc7532001-09-23 02:35:53 +0000267 u8 wrFlag; /* True if writable */
drh2dcc9aa2002-12-04 13:40:25 +0000268 u8 eSkip; /* Determines if next step operation is a no-op */
drh5e2f8b92001-05-28 00:41:15 +0000269 u8 iMatch; /* compare result from last sqliteBtreeMoveto() */
drh365d68f2001-05-11 11:02:46 +0000270};
drh7e3b0a02001-04-28 16:52:40 +0000271
drha059ad02001-04-17 20:09:11 +0000272/*
drh2dcc9aa2002-12-04 13:40:25 +0000273** Legal values for BtCursor.eSkip.
274*/
275#define SKIP_NONE 0 /* Always step the cursor */
276#define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */
277#define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */
278#define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */
279
280/*
drh3aac2dd2004-04-26 14:10:20 +0000281** Read or write a two-, four-, and eight-byte big-endian integer values.
drh0d316a42002-08-11 20:10:47 +0000282*/
drh9e572e62004-04-23 23:43:10 +0000283static u32 get2byte(unsigned char *p){
284 return (p[0]<<8) | p[1];
drh0d316a42002-08-11 20:10:47 +0000285}
drh9e572e62004-04-23 23:43:10 +0000286static u32 get4byte(unsigned char *p){
287 return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
288}
drh3aac2dd2004-04-26 14:10:20 +0000289static u64 get8byte(unsigned char *p){
drh9e572e62004-04-23 23:43:10 +0000290 u64 v = get4byte(p);
291 return (v<<32) | get4byte(&p[4]);
292}
293static void put2byte(unsigned char *p, u32 v){
294 p[0] = v>>8;
295 p[1] = v;
296}
297static void put4byte(unsigned char *p, u32 v){
298 p[0] = v>>24;
299 p[1] = v>>16;
300 p[2] = v>>8;
301 p[3] = v;
302}
303static void put8byte(unsigned char *p, u64 v){
304 put4byte(&p[4], v>>32);
305 put4byte(p, v);
306}
307
308/*
309** Read a variable-length integer. Store the result in *pResult.
310** Return the number of bytes in the integer.
311*/
312static unsigned int getVarint(unsigned char *p, u64 *pResult){
313 u64 x = p[0] & 0x7f;
314 int n = 0;
315 while( (p[n++]&0x80)!=0 ){
316 x |= (p[n]&0x7f)<<(n*7);
317 }
318 *pResult = x;
319 return n;
320}
321
322/*
323** Write a variable length integer with value v into p[]. Return
324** the number of bytes written.
325*/
326static unsigned int putVarint(unsigned char *p, u64 v){
327 int i = 0;
328 do{
329 p[i++] = v & 0x7f;
330 v >>= 7;
331 }while( v!=0 );
332 p[i-1] |= 0x80;
333 return i;
drh0d316a42002-08-11 20:10:47 +0000334}
335
336/*
drh3aac2dd2004-04-26 14:10:20 +0000337** Parse a cell header and fill in the CellInfo structure.
338*/
339static void parseCellHeader(
340 MemPage *pPage, /* Page containing the cell */
341 unsigned char *pCell, /* The cell */
342 u64 *pnData, /* Number of bytes of data in payload */
343 u64 *pnKey, /* Number of bytes of key, or key value for intKey */
344 int *pnHeader /* Size of header in bytes. Offset to payload */
345){
346 int n;
347 if( pPage->leaf ){
348 n = 2;
349 }else{
350 n = 6;
351 }
352 if( pPage->zeroData ){
353 *pnData = 0;
354 }else{
355 n += getVarint(&pCell[n], pnData);
356 }
357 n += getVarint(pCell, pnKey);
358 *pnHeader = n;
359}
360
361/*
drh3b7511c2001-05-26 13:15:44 +0000362** Compute the total number of bytes that a Cell needs on the main
drh5e2f8b92001-05-28 00:41:15 +0000363** database page. The number returned includes the Cell header,
364** local payload storage, and the pointer to overflow pages (if
drh8c42ca92001-06-22 19:15:00 +0000365** applicable). Additional space allocated on overflow pages
drhbd03cae2001-06-02 02:40:57 +0000366** is NOT included in the value returned from this routine.
drh3b7511c2001-05-26 13:15:44 +0000367*/
drh9e572e62004-04-23 23:43:10 +0000368static int cellSize(MemPage *pPage, unsigned char *pCell){
drh3aac2dd2004-04-26 14:10:20 +0000369 CellInfo info;
370 int n;
drh9e572e62004-04-23 23:43:10 +0000371 u64 nData, nKey;
drh3aac2dd2004-04-26 14:10:20 +0000372 int nPayload, maxPayload;
373
374 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
375 nPayload = (int)nData;
376 if( !pPage->intKey ){
377 nPayload += (int)nKey;
drh3b7511c2001-05-26 13:15:44 +0000378 }
drh3aac2dd2004-04-26 14:10:20 +0000379 maxPayload = pPage->pBt->maxLocal;
drh9e572e62004-04-23 23:43:10 +0000380 if( nPayload>maxPayload ){
381 nPayload = maxPayload + 4;
382 }
383 return n + nPayload;
drh3b7511c2001-05-26 13:15:44 +0000384}
385
386/*
drh72f82862001-05-24 21:06:34 +0000387** Defragment the page given. All Cells are moved to the
388** beginning of the page and all free space is collected
389** into one big FreeBlk at the end of the page.
drh365d68f2001-05-11 11:02:46 +0000390*/
drh9e572e62004-04-23 23:43:10 +0000391static void defragmentPage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000392 int pc, i, n, addr;
393 int start, hdr, size;
drh9e572e62004-04-23 23:43:10 +0000394 int leftover;
395 unsigned char *oldPage;
396 unsigned char newPage[SQLITE_PAGE_SIZE];
drh2af926b2001-05-15 00:39:25 +0000397
drh9e572e62004-04-23 23:43:10 +0000398 assert( sqlitepager_iswriteable(pPage->aData) );
399 assert( pPage->pBt!=0 );
400 assert( pPage->pageSize <= SQLITE_PAGE_SIZE );
401 oldPage = pPage->aData;
402 hdr = pPage->hdrOffset;
drh3aac2dd2004-04-26 14:10:20 +0000403 addr = 3+hdr;
drh9e572e62004-04-23 23:43:10 +0000404 n = 6+hdr;
405 if( !pPage->leaf ){
406 n += 4;
drh2af926b2001-05-15 00:39:25 +0000407 }
drh9e572e62004-04-23 23:43:10 +0000408 start = n;
drh3aac2dd2004-04-26 14:10:20 +0000409 pc = get2byte(&oldPage[addr]);
drh9e572e62004-04-23 23:43:10 +0000410 i = 0;
411 while( pc>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000412 assert( n<pPage->pBt->pageSize );
drh9e572e62004-04-23 23:43:10 +0000413 size = cellSize(pPage, &oldPage[pc]);
414 memcpy(&newPage[n], &oldPage[pc], size);
drh3aac2dd2004-04-26 14:10:20 +0000415 put2byte(&newPage[addr],n);
drh9e572e62004-04-23 23:43:10 +0000416 pPage->aCell[i] = &oldPage[n];
417 n += size;
drh3aac2dd2004-04-26 14:10:20 +0000418 addr = pc;
drh9e572e62004-04-23 23:43:10 +0000419 pc = get2byte(&oldPage[pc]);
drh2aa679f2001-06-25 02:11:07 +0000420 }
drh3aac2dd2004-04-26 14:10:20 +0000421 leftover = pPage->pBt->pageSize - n;
drh9e572e62004-04-23 23:43:10 +0000422 assert( leftover>=0 );
423 assert( pPage->nFree==leftover );
424 if( leftover<4 ){
425 oldPage[hdr+5] = leftover;
426 leftover = 0;
drh3aac2dd2004-04-26 14:10:20 +0000427 n = pPage->pBt->pageSize;
drh9e572e62004-04-23 23:43:10 +0000428 }
429 memcpy(&oldPage[start], &newPage[start], n-start);
430 if( leftover==0 ){
431 put2byte(&oldPage[hdr+3], 0);
432 }else if( leftover>=4 ){
433 put2byte(&oldPage[hdr+3], n);
434 put2byte(&oldPage[n], 0);
435 put2byte(&oldPage[n+2], leftover);
436 memset(&oldPage[n+4], 0, leftover-4);
437 }
drh365d68f2001-05-11 11:02:46 +0000438}
439
drha059ad02001-04-17 20:09:11 +0000440/*
drh9e572e62004-04-23 23:43:10 +0000441** Allocate nByte bytes of space on a page. If nByte is less than
442** 4 it is rounded up to 4.
drhbd03cae2001-06-02 02:40:57 +0000443**
drh9e572e62004-04-23 23:43:10 +0000444** Return the index into pPage->aData[] of the first byte of
drhbd03cae2001-06-02 02:40:57 +0000445** the new allocation. Or return 0 if there is not enough free
446** space on the page to satisfy the allocation request.
drh2af926b2001-05-15 00:39:25 +0000447**
drh72f82862001-05-24 21:06:34 +0000448** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000449** nBytes of contiguous free space, then this routine automatically
450** calls defragementPage() to consolidate all free space before
451** allocating the new chunk.
drh9e572e62004-04-23 23:43:10 +0000452**
453** Algorithm: Carve a piece off of the first freeblock that is
454** nByte in size or that larger.
drh7e3b0a02001-04-28 16:52:40 +0000455*/
drh9e572e62004-04-23 23:43:10 +0000456static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000457 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000458 int size;
459 unsigned char *data;
drh44ce7e22003-06-17 02:57:17 +0000460#ifndef NDEBUG
461 int cnt = 0;
462#endif
drh72f82862001-05-24 21:06:34 +0000463
drh9e572e62004-04-23 23:43:10 +0000464 data = pPage->aData;
drh4b70f112004-05-02 21:12:19 +0000465 assert( sqlitepager_iswriteable(data->aData) );
drh9e572e62004-04-23 23:43:10 +0000466 assert( pPage->pBt );
467 if( nByte<4 ) nByte = 4;
drh14acc042001-06-10 19:56:58 +0000468 if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
drh9e572e62004-04-23 23:43:10 +0000469 hdr = pPage->hdrOffset;
470 if( data[hdr+5]>=252 ){
471 defragmentPage(pPage);
drh2af926b2001-05-15 00:39:25 +0000472 }
drh3aac2dd2004-04-26 14:10:20 +0000473 addr = hdr+1;
474 pc = get2byte(&data[addr]);
475 assert( addr<pc );
drh9e572e62004-04-23 23:43:10 +0000476 assert( pc<=pPage->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000477 while( (size = get2byte(&data[pc+2]))<nByte ){
478 addr = pc;
479 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000480 assert( pc<=pPage->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000481 assert( pc>=addr+size+4 || pc==0 );
drh9e572e62004-04-23 23:43:10 +0000482 if( pc==0 ){
483 assert( (cnt++)==0 );
484 defragmentPage(pPage);
485 assert( data[hdr+5]==0 );
drh3aac2dd2004-04-26 14:10:20 +0000486 addr = pPage->hdrOffset+1;
487 pc = get2byte(&data[addr]);
drh9e572e62004-04-23 23:43:10 +0000488 }
489 }
490 assert( pc>0 && size>=nByte );
491 assert( pc+size<=pPage->pageSize );
492 if( size>nByte+4 ){
drh3aac2dd2004-04-26 14:10:20 +0000493 put2byte(&data[addr], pc+nByte);
drh9e572e62004-04-23 23:43:10 +0000494 put2byte(&data[pc+size], get2byte(&data[pc]));
495 put2byte(&data[pc+size+2], size-nByte);
drh2af926b2001-05-15 00:39:25 +0000496 }else{
drh3aac2dd2004-04-26 14:10:20 +0000497 put2byte(&data[addr], get2byte(&data[pc]));
drh9e572e62004-04-23 23:43:10 +0000498 data[hdr+5] += size-nByte;
drh2af926b2001-05-15 00:39:25 +0000499 }
500 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000501 assert( pPage->nFree>=0 );
502 return pc;
drh7e3b0a02001-04-28 16:52:40 +0000503}
504
505/*
drh9e572e62004-04-23 23:43:10 +0000506** Return a section of the pPage->aData to the freelist.
507** The first byte of the new free block is pPage->aDisk[start]
508** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000509**
510** Most of the effort here is involved in coalesing adjacent
511** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000512*/
drh9e572e62004-04-23 23:43:10 +0000513static void freeSpace(MemPage *pPage, int start, int size){
514 int end = start + size; /* End of the segment being freed */
drh3aac2dd2004-04-26 14:10:20 +0000515 int addr, pbegin, pend;
drh9e572e62004-04-23 23:43:10 +0000516#ifndef NDEBUG
517 int tsize = 0; /* Total size of all freeblocks */
518#endif
519 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000520
drh9e572e62004-04-23 23:43:10 +0000521 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000522 assert( sqlitepager_iswriteable(data->aData) );
drh9e572e62004-04-23 23:43:10 +0000523 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
524 assert( end<=pPage->pBt->pageSize );
525 if( size<4 ) size = 4;
526
527 /* Add the space back into the linked list of freeblocks */
drh3aac2dd2004-04-26 14:10:20 +0000528 addr = pPage->hdrOffset + 1;
529 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drh9e572e62004-04-23 23:43:10 +0000530 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000531 assert( pbegin>addr );
532 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000533 }
drh9e572e62004-04-23 23:43:10 +0000534 assert( pbegin<=pPage->pBt->pageSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000535 assert( pbegin>addr || pbegin==0 );
536 put2bytes(&data[addr], start);
drh9e572e62004-04-23 23:43:10 +0000537 put2bytes(&data[start], pbegin);
538 put2bytes(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000539 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000540
541 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000542 addr = pPage->hdrOffset + 1;
543 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000544 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000545 assert( pbegin>addr );
drh9e572e62004-04-23 23:43:10 +0000546 assert( pbegin<pPage->pBt->pageSize-4 );
547 pnext = get2byte(&data[pbegin]);
548 psize = get2byte(&data[pbegin+2]);
549 if( pbegin + psize + 3 >= pnext && pnext>0 ){
550 int frag = pnext - (pbegin+psize);
551 assert( frag<=data[pPage->hdrOffset+5] );
552 data[pPage->hdrOffset+5] -= frag;
553 put2byte(&data[pbegin], get2byte(&data[pnext]));
554 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
555 }else{
556 assert( (tsize += psize)>0 );
drh3aac2dd2004-04-26 14:10:20 +0000557 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000558 }
559 }
560 assert( tsize+data[pPage->hdrOffset+5]==pPage->nFree );
drh7e3b0a02001-04-28 16:52:40 +0000561}
562
drh3aac2dd2004-04-26 14:10:20 +0000563#if 0
drh7e3b0a02001-04-28 16:52:40 +0000564/*
drh9e572e62004-04-23 23:43:10 +0000565** The following is the default comparison function for (non-integer)
566** keys in the btrees. This function returns negative, zero, or
567** positive if the first key is less than, equal to, or greater than
568** the second.
569**
drh4b70f112004-05-02 21:12:19 +0000570** The key consists of multiple fields. Each field begins with a variable
571** length integer which determines the field type and the number of bytes
572** of key data to follow for that field.
573**
574** initial varint bytes to follow type
575** -------------- --------------- ---------------
576** 0 0 NULL
577** 1 1 signed integer
578** 2 2 signed integer
579** 3 4 signed integer
580** 4 8 signed integer
581** 5 8 IEEE float
582** 6..12 reserved for expansion
583** N>=12 and even (N-12)/2 BLOB
584** N>=13 and odd (N-13)/2 text
585**
586** For a particular database, text is always either UTF-8, UTF-16BE, or
587** UTF-16LE. Which of these three formats to use is determined by one
588** of the meta values in the file header.
589**
drh9e572e62004-04-23 23:43:10 +0000590*/
591static int keyComp(
592 void *userData,
593 int nKey1, const unsigned char *aKey1,
594 int nKey2, const unsigned char *aKey2,
595){
596 KeyClass *pKeyClass = (KeyClass*)userData;
597 i1 = i2 = 0;
598 for(i1=i2=0; pKeyClass!=0; pKeyClass=pKeyClass->pNext){
599 if( varint32(aKey1, &i1, nKey1, &n1) ) goto bad_key;
600 if( varint32(aKey2, &i2, nKey2, &n2) ) goto bad_key;
601 if( n1==0 ){
602 if( n2>0 ) return -1;
603 /* both values are NULL. consider them equal for sorting purposes. */
604 }else if( n2==0 ){
605 /* right value is NULL but the left value is not. right comes first */
606 return +1;
607 }else if( n1<=5 ){
608 if( n2>5 ) return -1;
609 /* both values are numbers. sort them numerically */
610 ...
611 }else if( n2<=5 ){
612 /* right value is numeric and left is TEXT or BLOB. right comes first */
613 return +1;
614 }else if( n1<12 || n2<12 ){
615 /* bad coding for either the left or the right value */
616 goto bad_key;
617 }else if( (n1&0x01)==0 ){
618 if( n2&0x01)!=0 ) return -1;
619 /* both values are BLOB. use memcmp() */
620 n1 = (n1-12)/2;
621 n2 = (n2-12)/2;
622 if( i1+n1>nKey1 || i2+n2>nKey2 ) goto bad_key;
623 c = memcmp(&aKey1[i1], &aKey2[i2], n1<n2 ? n1 : n2);
624 if( c!=0 ){
625 return c | 1;
626 }
627 if( n1!=n2 ){
628 return (n1-n2) | 1;
629 }
630 i1 += n1;
631 i2 += n2;
632 }else if( n2&0x01)!=0 ){
633 /* right value if BLOB and left is TEXT. BLOB comes first */
634 return +1;
635 }else{
636 /* both values are TEXT. use the supplied comparison function */
637 n1 = (n1-13)/2;
638 n2 = (n2-13)/2;
639 if( i1+n1>nKey1 || i2+n2>nKey2 ) goto bad_key;
640 c = pKeyClass->xCompare(pKeyClass->pUser, n1, &aKey1[i1], n2, &aKey2[i2]);
641 if( c!=0 ){
642 return c | 1;
643 }
644 i1 += n1;
645 i2 += n2;
646 }
647 }
648 return 0;
649
650bad_key:
651 return 1;
652}
drh3aac2dd2004-04-26 14:10:20 +0000653#endif
drh9e572e62004-04-23 23:43:10 +0000654
655/*
drh4b70f112004-05-02 21:12:19 +0000656** Resize the aCell[] array of the given page so that it is able to
657** hold at least nNewSz entries.
658**
659** Return SQLITE_OK or SQLITE_NOMEM.
660*/
661static int resizeCellArray(MemPage *pPage, int nNewSz){
662 if( pPage->nCellAlloc<nNewSize ){
663 pPage->aCell = sqliteRealloc(pPage->aCell, nNewSz*sizeof(pPage->aCell[0]) );
664 if( sqlite_malloc_failed ) return SQLITE_NOMEM;
665 pPage->nCellAlloc = nNewSize;
666 }
667 return SQLITE_OK;
668}
669
670/*
drh7e3b0a02001-04-28 16:52:40 +0000671** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000672**
drhbd03cae2001-06-02 02:40:57 +0000673** The pParent parameter must be a pointer to the MemPage which
drh9e572e62004-04-23 23:43:10 +0000674** is the parent of the page being initialized. The root of a
675** BTree has no parent and so for that page, pParent==NULL.
drh5e2f8b92001-05-28 00:41:15 +0000676**
drh72f82862001-05-24 21:06:34 +0000677** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000678** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000679** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
680** guarantee that the page is well-formed. It only shows that
681** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000682*/
drh9e572e62004-04-23 23:43:10 +0000683static int initPage(
drh3aac2dd2004-04-26 14:10:20 +0000684 MemPage *pPage, /* The page to be initialized */
drh9e572e62004-04-23 23:43:10 +0000685 MemPage *pParent /* The parent. Might be NULL */
686){
drh3aac2dd2004-04-26 14:10:20 +0000687 int c, pc, i, hdr;
drh9e572e62004-04-23 23:43:10 +0000688 int sumCell = 0; /* Total size of all cells */
drh2af926b2001-05-15 00:39:25 +0000689
drh3aac2dd2004-04-26 14:10:20 +0000690 assert( pPage->pBt!=0 );
drh4b70f112004-05-02 21:12:19 +0000691 assert( pParent==0 || pParent->pBt==pPage->pBt );
692 assert( pPage->pgno==sqlitepager_pagenumber(pPage->aData) );
drh3aac2dd2004-04-26 14:10:20 +0000693 assert( pPage->aData == &((unsigned char*)pPage)[pPage->pBt->pageSize] );
drh4b70f112004-05-02 21:12:19 +0000694 assert( pPage->isInit==0 || pPage->pParent==pParent );
695 if( pPage->isInit ) return SQLITE_OK;
696 assert( pPage->pParent==0 );
697 pPage->pParent = pParent;
drh5e2f8b92001-05-28 00:41:15 +0000698 if( pParent ){
drh9e572e62004-04-23 23:43:10 +0000699 sqlitepager_ref(pParent->aData);
drh5e2f8b92001-05-28 00:41:15 +0000700 }
drh4b70f112004-05-02 21:12:19 +0000701 pPage->nCell = pPage->nCellAlloc = 0;
702 pPage->hdrOffset = hdr = pPage->pgno==1 ? 100 : 0;
703 c = pPage->aData[hdr];
drh9e572e62004-04-23 23:43:10 +0000704 pPage->intKey = (c & PTF_INTKEY)!=0;
705 pPage->zeroData = (c & PTF_ZERODATA)!=0;
drh4b70f112004-05-02 21:12:19 +0000706 pPage->leaf = (c & PTF_LEAF)!=0;
drh2af926b2001-05-15 00:39:25 +0000707
drh9e572e62004-04-23 23:43:10 +0000708 /* Initialize the cell count and cell pointers */
709 pc = get2byte(&data[hdr+3]);
710 while( pc>0 ){
711 if( pc>=pBt->pageSize ) return SQLITE_CORRUPT;
712 if( pPage->nCell>pBt->pageSize ) return SQLITE_CORRUPT;
713 pPage->nCell++;
714 pc = get2byte(&data[pc]);
715 }
drh4b70f112004-05-02 21:12:19 +0000716 if( resizeCellArray(pPage, pPage->nCell) ){
drh9e572e62004-04-23 23:43:10 +0000717 return SQLITE_NOMEM;
718 }
719 pc = get2byte(&data[hdr+3]);
720 for(i=0; pc>0; i++){
721 pPage->aCell[i] = &data[pc];
722 pc = get2byte(&data[pc]);
723 sumCell += cellSize(pPage, &data[pc]);
724 }
725
726 /* Compute the total free space on the page */
727 pPage->nFree = data[hdr+5];
728 pc = get2byte(&data[hdr+1]);
729 while( pc>0 ){
730 int next, size;
731 if( pc>=pBt->pageSize ) return SQLITE_CORRUPT;
732 next = get2byte(&data[pc]);
733 size = get2byte(&data[pc+2]);
drh3aac2dd2004-04-26 14:10:20 +0000734 if( next>0 && next<=pc+size+3 ) return SQLITE_CORRUPT;
drh9e572e62004-04-23 23:43:10 +0000735 pPage->nFree += size;
736 pc = next;
737 }
738 if( pPage->nFree>=pBt->pageSize ) return SQLITE_CORRUPT;
739
740 /* Sanity check: Cells and freespace and header must sum to the size
741 ** a page. */
742 if( sumCell+pPage->nFree+hdr+10-pPage->leaf*4 != pBt->pageSize ){
743 return CORRUPT;
744 }
745
746 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +0000747}
748
749/*
drh8b2f49b2001-06-08 00:21:52 +0000750** Set up a raw page so that it looks like a database page holding
751** no entries.
drhbd03cae2001-06-02 02:40:57 +0000752*/
drh9e572e62004-04-23 23:43:10 +0000753static void zeroPage(MemPage *pPage, int flags){
754 unsigned char *data = pPage->aData;
755 Btree *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +0000756 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +0000757 int first;
758
drh4b70f112004-05-02 21:12:19 +0000759 assert( sqlitepager_iswriteable(data->aData) );
drh9e572e62004-04-23 23:43:10 +0000760 memset(&data[hdr], 0, pBt->pageSize - hdr);
761 data[hdr] = flags;
762 first = hdr + 6 + 4*((flags&0x01)!=0);
763 put2byte(&data[hdr+1], first);
764 put2byte(&data[first+2], pBt->pageSize - first);
765 sqliteFree(pPage->aCell);
766 pPage->aCell = 0;
drh8c42ca92001-06-22 19:15:00 +0000767 pPage->nCell = 0;
drh9e572e62004-04-23 23:43:10 +0000768 pPage->nFree = pBt->pageSize - first;
769 pPage->intKey = (flags & PTF_INTKEY)!=0;
770 pPage->leaf = (flags & PTF_LEAF)!=0;
771 pPage->zeroData = (flags & PTF_ZERODATA)!=0;
772 pPage->hdrOffset = hdr;
drhbd03cae2001-06-02 02:40:57 +0000773}
774
775/*
drh3aac2dd2004-04-26 14:10:20 +0000776** Get a page from the pager. Initialize the MemPage.pBt and
777** MemPage.aData elements if needed.
778*/
779static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
780 int rc;
781 unsigned char *aData;
782 MemPage *pPage;
783 rc = sqlitepager_get(pBt->pPager, pgno, &aData);
784 if( rc ) return rc;
785 pPage = (MemPage*)aData[pBt->pageSize];
786 pPage->aData = aData;
787 pPage->pBt = pBt;
788 pPage->pgno = pgno;
789 *ppPage = pPage;
790 return SQLITE_OK;
791}
792
793/*
794** Release a MemPage. This should be called once for each prior
795** call to getPage.
796*/
drh4b70f112004-05-02 21:12:19 +0000797static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +0000798 if( pPage ){
799 assert( pPage->aData );
800 assert( pPage->pBt );
801 assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
802 sqlitepager_unref(pPage->aData);
803 }
804}
805
806/*
drh72f82862001-05-24 21:06:34 +0000807** This routine is called when the reference count for a page
808** reaches zero. We need to unref the pParent pointer when that
809** happens.
810*/
811static void pageDestructor(void *pData){
drh9e572e62004-04-23 23:43:10 +0000812 MemPage *pPage = (MemPage*)&((char*)pData)[SQLITE_PAGE_SIZE];
drh72f82862001-05-24 21:06:34 +0000813 if( pPage->pParent ){
814 MemPage *pParent = pPage->pParent;
815 pPage->pParent = 0;
drh3aac2dd2004-04-26 14:10:20 +0000816 releasepage(pParent);
drh72f82862001-05-24 21:06:34 +0000817 }
drh9e572e62004-04-23 23:43:10 +0000818 sqliteFree(pPage->aCell);
819 pPage->aCell = 0;
drh3aac2dd2004-04-26 14:10:20 +0000820 pPage->isInit = 0;
drh72f82862001-05-24 21:06:34 +0000821}
822
823/*
drh306dc212001-05-21 13:45:10 +0000824** Open a new database.
825**
826** Actually, this routine just sets up the internal data structures
drh72f82862001-05-24 21:06:34 +0000827** for accessing the database. We do not open the database file
828** until the first page is loaded.
drh382c0242001-10-06 16:33:02 +0000829**
830** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +0000831** a new database with a random name is created. This randomly named
832** database file will be deleted when sqliteBtreeClose() is called.
drha059ad02001-04-17 20:09:11 +0000833*/
drh6019e162001-07-02 17:51:45 +0000834int sqliteBtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +0000835 const char *zFilename, /* Name of the file containing the BTree database */
836 Btree **ppBtree, /* Pointer to new Btree object written here */
837 int nCache, /* Number of cache pages */
838 int flags /* Options */
drh6019e162001-07-02 17:51:45 +0000839){
drha059ad02001-04-17 20:09:11 +0000840 Btree *pBt;
drh3aac2dd2004-04-26 14:10:20 +0000841 int rc, i;
842 int nCache = 2000;
843 int omitJournal = 0;
drha059ad02001-04-17 20:09:11 +0000844
drhd62d3d02003-01-24 12:14:20 +0000845 /*
846 ** The following asserts make sure that structures used by the btree are
847 ** the right size. This is to guard against size changes that result
848 ** when compiling on a different architecture.
849 */
drh9e572e62004-04-23 23:43:10 +0000850 assert( sizeof(u64)==8 );
drhd62d3d02003-01-24 12:14:20 +0000851 assert( sizeof(u32)==4 );
852 assert( sizeof(u16)==2 );
853 assert( sizeof(Pgno)==4 );
drhd62d3d02003-01-24 12:14:20 +0000854 assert( sizeof(ptr)==sizeof(char*) );
855 assert( sizeof(uptr)==sizeof(ptr) );
856
drha059ad02001-04-17 20:09:11 +0000857 pBt = sqliteMalloc( sizeof(*pBt) );
858 if( pBt==0 ){
drh8c42ca92001-06-22 19:15:00 +0000859 *ppBtree = 0;
drha059ad02001-04-17 20:09:11 +0000860 return SQLITE_NOMEM;
861 }
drh6019e162001-07-02 17:51:45 +0000862 if( nCache<10 ) nCache = 10;
drhda47d772002-12-02 04:25:19 +0000863 rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
drh3aac2dd2004-04-26 14:10:20 +0000864 (flags & BTREE_OMIT_JOURNAL)==0);
drha059ad02001-04-17 20:09:11 +0000865 if( rc!=SQLITE_OK ){
866 if( pBt->pPager ) sqlitepager_close(pBt->pPager);
867 sqliteFree(pBt);
868 *ppBtree = 0;
869 return rc;
870 }
drh72f82862001-05-24 21:06:34 +0000871 sqlitepager_set_destructor(pBt->pPager, pageDestructor);
drha059ad02001-04-17 20:09:11 +0000872 pBt->pCursor = 0;
873 pBt->page1 = 0;
drh5df72a52002-06-06 23:16:05 +0000874 pBt->readOnly = sqlitepager_isreadonly(pBt->pPager);
drh4b70f112004-05-02 21:12:19 +0000875 pBt->pageSize = SQLITE_PAGE_SIZE; /* FIX ME - read from header */
876 pBt->maxLocal = (pBt->pageSize-10)/4-12;
drha059ad02001-04-17 20:09:11 +0000877 *ppBtree = pBt;
878 return SQLITE_OK;
879}
880
881/*
882** Close an open database and invalidate all cursors.
883*/
drh3aac2dd2004-04-26 14:10:20 +0000884int sqlite3BtreeClose(Btree *pBt){
drha059ad02001-04-17 20:09:11 +0000885 while( pBt->pCursor ){
drh3aac2dd2004-04-26 14:10:20 +0000886 sqlite3BtreeCloseCursor(pBt->pCursor);
drha059ad02001-04-17 20:09:11 +0000887 }
888 sqlitepager_close(pBt->pPager);
889 sqliteFree(pBt);
890 return SQLITE_OK;
891}
892
893/*
drhda47d772002-12-02 04:25:19 +0000894** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +0000895**
896** The maximum number of cache pages is set to the absolute
897** value of mxPage. If mxPage is negative, the pager will
898** operate asynchronously - it will not stop to do fsync()s
899** to insure data is written to the disk surface before
900** continuing. Transactions still work if synchronous is off,
901** and the database cannot be corrupted if this program
902** crashes. But if the operating system crashes or there is
903** an abrupt power failure when synchronous is off, the database
904** could be left in an inconsistent and unrecoverable state.
905** Synchronous is on by default so database corruption is not
906** normally a worry.
drhf57b14a2001-09-14 18:54:08 +0000907*/
drh3aac2dd2004-04-26 14:10:20 +0000908int sqilte3BtreeSetCacheSize(Btree *pBt, int mxPage){
drhf57b14a2001-09-14 18:54:08 +0000909 sqlitepager_set_cachesize(pBt->pPager, mxPage);
910 return SQLITE_OK;
911}
912
913/*
drh973b6e32003-02-12 14:09:42 +0000914** Change the way data is synced to disk in order to increase or decrease
915** how well the database resists damage due to OS crashes and power
916** failures. Level 1 is the same as asynchronous (no syncs() occur and
917** there is a high probability of damage) Level 2 is the default. There
918** is a very low but non-zero probability of damage. Level 3 reduces the
919** probability of damage to near zero but with a write performance reduction.
920*/
drh3aac2dd2004-04-26 14:10:20 +0000921int sqlite3BtreeSetSafetyLevel(Btree *pBt, int level){
drh973b6e32003-02-12 14:09:42 +0000922 sqlitepager_set_safety_level(pBt->pPager, level);
923 return SQLITE_OK;
924}
925
926/*
drh306dc212001-05-21 13:45:10 +0000927** Get a reference to page1 of the database file. This will
928** also acquire a readlock on that file.
929**
930** SQLITE_OK is returned on success. If the file is not a
931** well-formed database file, then SQLITE_CORRUPT is returned.
932** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
933** is returned if we run out of memory. SQLITE_PROTOCOL is returned
934** if there is a locking protocol violation.
935*/
936static int lockBtree(Btree *pBt){
937 int rc;
drh3aac2dd2004-04-26 14:10:20 +0000938 MemPage *pPage1;
drh306dc212001-05-21 13:45:10 +0000939 if( pBt->page1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000940 rc = getPage(pBt, 1, &pPage1);
drh306dc212001-05-21 13:45:10 +0000941 if( rc!=SQLITE_OK ) return rc;
drh3aac2dd2004-04-26 14:10:20 +0000942
drh306dc212001-05-21 13:45:10 +0000943
944 /* Do some checking to help insure the file we opened really is
945 ** a valid database file.
946 */
947 if( sqlitepager_pagecount(pBt->pPager)>0 ){
drh3aac2dd2004-04-26 14:10:20 +0000948 if( memcmp(pPage1->aData, zMagicHeader, 16)!=0 ){
drhc602f9a2004-02-12 19:01:04 +0000949 rc = SQLITE_NOTADB;
drh72f82862001-05-24 21:06:34 +0000950 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +0000951 }
drh9e572e62004-04-23 23:43:10 +0000952 /*** TBD: Other header checks such as page size ****/
drh306dc212001-05-21 13:45:10 +0000953 }
drh3aac2dd2004-04-26 14:10:20 +0000954 pBt->pPage1 = pPage1;
drh306dc212001-05-21 13:45:10 +0000955 return rc;
956
drh72f82862001-05-24 21:06:34 +0000957page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +0000958 releasePage(pPage1);
959 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +0000960 return rc;
drh306dc212001-05-21 13:45:10 +0000961}
962
963/*
drhb8ca3072001-12-05 00:21:20 +0000964** If there are no outstanding cursors and we are not in the middle
965** of a transaction but there is a read lock on the database, then
966** this routine unrefs the first page of the database file which
967** has the effect of releasing the read lock.
968**
969** If there are any outstanding cursors, this routine is a no-op.
970**
971** If there is a transaction in progress, this routine is a no-op.
972*/
973static void unlockBtreeIfUnused(Btree *pBt){
drh3aac2dd2004-04-26 14:10:20 +0000974 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->pPage1!=0 ){
975 releasePage(pBt->pPage1);
976 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +0000977 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +0000978 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +0000979 }
980}
981
982/*
drh9e572e62004-04-23 23:43:10 +0000983** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +0000984** file.
drh8b2f49b2001-06-08 00:21:52 +0000985*/
986static int newDatabase(Btree *pBt){
drh9e572e62004-04-23 23:43:10 +0000987 MemPage *pP1;
988 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +0000989 int rc;
drh7c717f72001-06-24 20:39:41 +0000990 if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +0000991 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +0000992 assert( pP1!=0 );
993 data = pP1->aData;
994 rc = sqlitepager_write(data);
drh8b2f49b2001-06-08 00:21:52 +0000995 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +0000996 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
997 assert( sizeof(zMagicHeader)==16 );
998 put2byte(&data[16], SQLITE_PAGE_SIZE);
999 data[18] = 1;
1000 data[19] = 1;
1001 put2byte(&data[22], (SQLITE_PAGE_SIZE-10)/4-12);
1002 zeroPage(pP1, PTF_INTKEY|PTF_LEAF);
drh8b2f49b2001-06-08 00:21:52 +00001003 return SQLITE_OK;
1004}
1005
1006/*
drh72f82862001-05-24 21:06:34 +00001007** Attempt to start a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001008**
1009** A transaction must be started before attempting any changes
1010** to the database. None of the following routines will work
1011** unless a transaction is started first:
1012**
1013** sqliteBtreeCreateTable()
drhc6b52df2002-01-04 03:09:29 +00001014** sqliteBtreeCreateIndex()
drh8b2f49b2001-06-08 00:21:52 +00001015** sqliteBtreeClearTable()
1016** sqliteBtreeDropTable()
1017** sqliteBtreeInsert()
1018** sqliteBtreeDelete()
1019** sqliteBtreeUpdateMeta()
drha059ad02001-04-17 20:09:11 +00001020*/
drh3aac2dd2004-04-26 14:10:20 +00001021int sqlite3BtreeBeginTrans(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001022 int rc;
1023 if( pBt->inTrans ) return SQLITE_ERROR;
drhf74b8d92002-09-01 23:20:45 +00001024 if( pBt->readOnly ) return SQLITE_READONLY;
drh3aac2dd2004-04-26 14:10:20 +00001025 if( pBt->pPage1==0 ){
drh7e3b0a02001-04-28 16:52:40 +00001026 rc = lockBtree(pBt);
drh8c42ca92001-06-22 19:15:00 +00001027 if( rc!=SQLITE_OK ){
1028 return rc;
1029 }
drha059ad02001-04-17 20:09:11 +00001030 }
drh3aac2dd2004-04-26 14:10:20 +00001031 rc = sqlitepager_begin(pBt->pPage1->aData);
drhf74b8d92002-09-01 23:20:45 +00001032 if( rc==SQLITE_OK ){
1033 rc = newDatabase(pBt);
drha059ad02001-04-17 20:09:11 +00001034 }
drhb8ca3072001-12-05 00:21:20 +00001035 if( rc==SQLITE_OK ){
1036 pBt->inTrans = 1;
drh3aac2dd2004-04-26 14:10:20 +00001037 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001038 }else{
1039 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001040 }
drhb8ca3072001-12-05 00:21:20 +00001041 return rc;
drha059ad02001-04-17 20:09:11 +00001042}
1043
1044/*
drh2aa679f2001-06-25 02:11:07 +00001045** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00001046**
1047** This will release the write lock on the database file. If there
1048** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001049*/
drh3aac2dd2004-04-26 14:10:20 +00001050int sqlite3BtreeCommit(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001051 int rc;
drh5df72a52002-06-06 23:16:05 +00001052 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager);
drh7c717f72001-06-24 20:39:41 +00001053 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001054 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00001055 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001056 return rc;
1057}
1058
1059/*
drhecdc7532001-09-23 02:35:53 +00001060** Rollback the transaction in progress. All cursors will be
1061** invalided by this operation. Any attempt to use a cursor
1062** that was open at the beginning of this operation will result
1063** in an error.
drh5e00f6c2001-09-13 13:46:56 +00001064**
1065** This will release the write lock on the database file. If there
1066** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00001067*/
drh3aac2dd2004-04-26 14:10:20 +00001068int sqlite3BtreeRollback(Btree *pBt){
drha059ad02001-04-17 20:09:11 +00001069 int rc;
drhecdc7532001-09-23 02:35:53 +00001070 BtCursor *pCur;
drh7c717f72001-06-24 20:39:41 +00001071 if( pBt->inTrans==0 ) return SQLITE_OK;
1072 pBt->inTrans = 0;
drh3aac2dd2004-04-26 14:10:20 +00001073 pBt->inStmt = 0;
drh3a840692003-01-29 22:58:26 +00001074 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager);
drhecdc7532001-09-23 02:35:53 +00001075 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drh3aac2dd2004-04-26 14:10:20 +00001076 MemPage *pPage = pCur->pPage;
1077 if( pPage && !pPage->isInit ){
1078 releasePage(pPage);
drhecdc7532001-09-23 02:35:53 +00001079 pCur->pPage = 0;
1080 }
1081 }
drh5e00f6c2001-09-13 13:46:56 +00001082 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001083 return rc;
1084}
1085
1086/*
drh663fc632002-02-02 18:49:19 +00001087** Set the checkpoint for the current transaction. The checkpoint serves
1088** as a sub-transaction that can be rolled back independently of the
1089** main transaction. You must start a transaction before starting a
1090** checkpoint. The checkpoint is ended automatically if the transaction
1091** commits or rolls back.
1092**
1093** Only one checkpoint may be active at a time. It is an error to try
1094** to start a new checkpoint if another checkpoint is already active.
1095*/
drh3aac2dd2004-04-26 14:10:20 +00001096int sqlite3BtreeBeginStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001097 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001098 if( !pBt->inTrans || pBt->inStmt ){
drhf74b8d92002-09-01 23:20:45 +00001099 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh0d65dc02002-02-03 00:56:09 +00001100 }
drh5df72a52002-06-06 23:16:05 +00001101 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager);
drh3aac2dd2004-04-26 14:10:20 +00001102 pBt->inStmt = 1;
drh663fc632002-02-02 18:49:19 +00001103 return rc;
1104}
1105
1106
1107/*
1108** Commit a checkpoint to transaction currently in progress. If no
1109** checkpoint is active, this is a no-op.
1110*/
drh3aac2dd2004-04-26 14:10:20 +00001111int sqlite3BtreeCommitStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001112 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001113 if( pBt->inStmt && !pBt->readOnly ){
drh663fc632002-02-02 18:49:19 +00001114 rc = sqlitepager_ckpt_commit(pBt->pPager);
1115 }else{
1116 rc = SQLITE_OK;
1117 }
drh3aac2dd2004-04-26 14:10:20 +00001118 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001119 return rc;
1120}
1121
1122/*
1123** Rollback the checkpoint to the current transaction. If there
1124** is no active checkpoint or transaction, this routine is a no-op.
1125**
1126** All cursors will be invalided by this operation. Any attempt
1127** to use a cursor that was open at the beginning of this operation
1128** will result in an error.
1129*/
drh3aac2dd2004-04-26 14:10:20 +00001130int sqlite3BtreeRollbackStmt(Btree *pBt){
drh663fc632002-02-02 18:49:19 +00001131 int rc;
1132 BtCursor *pCur;
drh3aac2dd2004-04-26 14:10:20 +00001133 if( pBt->inStmt==0 || pBt->readOnly ) return SQLITE_OK;
drh3a840692003-01-29 22:58:26 +00001134 rc = sqlitepager_ckpt_rollback(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00001135 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drh3aac2dd2004-04-26 14:10:20 +00001136 MemPage *pPage = pCur->pPage;
1137 if( pPage && !pPage->isInit ){
1138 releasePage(pPage);
drh663fc632002-02-02 18:49:19 +00001139 pCur->pPage = 0;
1140 }
1141 }
drh3aac2dd2004-04-26 14:10:20 +00001142 pBt->inStmt = 0;
drh663fc632002-02-02 18:49:19 +00001143 return rc;
1144}
1145
1146/*
drh3aac2dd2004-04-26 14:10:20 +00001147** Default key comparison function to be used if no comparison function
1148** is specified on the sqlite3BtreeCursor() call.
1149*/
1150static int dfltCompare(
1151 void *NotUsed, /* User data is not used */
1152 int n1, const void *p1, /* First key to compare */
1153 int n2, const void *p2 /* Second key to compare */
1154){
1155 int c;
1156 c = memcmp(p1, p2, n1<n2 ? n1 : n2);
1157 if( c==0 ){
1158 c = n1 - n2;
1159 }
1160 return c;
1161}
1162
1163/*
drh8b2f49b2001-06-08 00:21:52 +00001164** Create a new cursor for the BTree whose root is on the page
1165** iTable. The act of acquiring a cursor gets a read lock on
1166** the database file.
drh1bee3d72001-10-15 00:44:35 +00001167**
1168** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00001169** If wrFlag==1, then the cursor can be used for reading or for
1170** writing if other conditions for writing are also met. These
1171** are the conditions that must be met in order for writing to
1172** be allowed:
drh6446c4d2001-12-15 14:22:18 +00001173**
drhf74b8d92002-09-01 23:20:45 +00001174** 1: The cursor must have been opened with wrFlag==1
1175**
1176** 2: No other cursors may be open with wrFlag==0 on the same table
1177**
1178** 3: The database must be writable (not on read-only media)
1179**
1180** 4: There must be an active transaction.
1181**
1182** Condition 2 warrants further discussion. If any cursor is opened
1183** on a table with wrFlag==0, that prevents all other cursors from
1184** writing to that table. This is a kind of "read-lock". When a cursor
1185** is opened with wrFlag==0 it is guaranteed that the table will not
1186** change as long as the cursor is open. This allows the cursor to
1187** do a sequential scan of the table without having to worry about
1188** entries being inserted or deleted during the scan. Cursors should
1189** be opened with wrFlag==0 only if this read-lock property is needed.
1190** That is to say, cursors should be opened with wrFlag==0 only if they
1191** intend to use the sqliteBtreeNext() system call. All other cursors
1192** should be opened with wrFlag==1 even if they never really intend
1193** to write.
1194**
drh6446c4d2001-12-15 14:22:18 +00001195** No checking is done to make sure that page iTable really is the
1196** root page of a b-tree. If it is not, then the cursor acquired
1197** will not work correctly.
drh3aac2dd2004-04-26 14:10:20 +00001198**
1199** The comparison function must be logically the same for every cursor
1200** on a particular table. Changing the comparison function will result
1201** in incorrect operations. If the comparison function is NULL, a
1202** default comparison function is used. The comparison function is
1203** always ignored for INTKEY tables.
drha059ad02001-04-17 20:09:11 +00001204*/
drh3aac2dd2004-04-26 14:10:20 +00001205int sqlite3BtreeCursor(
1206 Btree *pBt, /* The btree */
1207 int iTable, /* Root page of table to open */
1208 int wrFlag, /* 1 to write. 0 read-only */
1209 int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
1210 void *pArg, /* First arg to xCompare() */
1211 BtCursor **ppCur /* Write new cursor here */
1212){
drha059ad02001-04-17 20:09:11 +00001213 int rc;
drhf74b8d92002-09-01 23:20:45 +00001214 BtCursor *pCur, *pRing;
drhecdc7532001-09-23 02:35:53 +00001215
drha0c9a112004-03-10 13:42:37 +00001216 if( pBt->readOnly && wrFlag ){
1217 *ppCur = 0;
1218 return SQLITE_READONLY;
1219 }
drh4b70f112004-05-02 21:12:19 +00001220 if( pBt->pPage1==0 ){
drha059ad02001-04-17 20:09:11 +00001221 rc = lockBtree(pBt);
1222 if( rc!=SQLITE_OK ){
1223 *ppCur = 0;
1224 return rc;
1225 }
1226 }
1227 pCur = sqliteMalloc( sizeof(*pCur) );
1228 if( pCur==0 ){
drhbd03cae2001-06-02 02:40:57 +00001229 rc = SQLITE_NOMEM;
1230 goto create_cursor_exception;
1231 }
drh8b2f49b2001-06-08 00:21:52 +00001232 pCur->pgnoRoot = (Pgno)iTable;
drh4b70f112004-05-02 21:12:19 +00001233 rc = getPage(pBt, pCur->pgnoRoot, &pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001234 if( rc!=SQLITE_OK ){
1235 goto create_cursor_exception;
1236 }
drh3aac2dd2004-04-26 14:10:20 +00001237 rc = initPage(pCur->pPage, 0);
drhbd03cae2001-06-02 02:40:57 +00001238 if( rc!=SQLITE_OK ){
1239 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00001240 }
drh3aac2dd2004-04-26 14:10:20 +00001241 pCur->xCompare = xCmp ? xCmp : dfltCompare;
1242 pCur->pArg = pArg;
drh14acc042001-06-10 19:56:58 +00001243 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00001244 pCur->wrFlag = wrFlag;
drh14acc042001-06-10 19:56:58 +00001245 pCur->idx = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001246 pCur->eSkip = SKIP_INVALID;
drha059ad02001-04-17 20:09:11 +00001247 pCur->pNext = pBt->pCursor;
1248 if( pCur->pNext ){
1249 pCur->pNext->pPrev = pCur;
1250 }
drh14acc042001-06-10 19:56:58 +00001251 pCur->pPrev = 0;
drhf74b8d92002-09-01 23:20:45 +00001252 pRing = pBt->pCursor;
1253 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1254 if( pRing ){
1255 pCur->pShared = pRing->pShared;
1256 pRing->pShared = pCur;
1257 }else{
1258 pCur->pShared = pCur;
1259 }
drha059ad02001-04-17 20:09:11 +00001260 pBt->pCursor = pCur;
drh2af926b2001-05-15 00:39:25 +00001261 *ppCur = pCur;
1262 return SQLITE_OK;
drhbd03cae2001-06-02 02:40:57 +00001263
1264create_cursor_exception:
1265 *ppCur = 0;
1266 if( pCur ){
drh3aac2dd2004-04-26 14:10:20 +00001267 releasePage(pCur->pPage);
drhbd03cae2001-06-02 02:40:57 +00001268 sqliteFree(pCur);
1269 }
drh5e00f6c2001-09-13 13:46:56 +00001270 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00001271 return rc;
drha059ad02001-04-17 20:09:11 +00001272}
1273
1274/*
drh5e00f6c2001-09-13 13:46:56 +00001275** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00001276** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00001277*/
drh3aac2dd2004-04-26 14:10:20 +00001278int sqlite3BtreeCloseCursor(BtCursor *pCur){
drha059ad02001-04-17 20:09:11 +00001279 Btree *pBt = pCur->pBt;
drha059ad02001-04-17 20:09:11 +00001280 if( pCur->pPrev ){
1281 pCur->pPrev->pNext = pCur->pNext;
1282 }else{
1283 pBt->pCursor = pCur->pNext;
1284 }
1285 if( pCur->pNext ){
1286 pCur->pNext->pPrev = pCur->pPrev;
1287 }
drh3aac2dd2004-04-26 14:10:20 +00001288 releasePage(pCur->pPage);
drhf74b8d92002-09-01 23:20:45 +00001289 if( pCur->pShared!=pCur ){
1290 BtCursor *pRing = pCur->pShared;
1291 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1292 pRing->pShared = pCur->pShared;
1293 }
drh5e00f6c2001-09-13 13:46:56 +00001294 unlockBtreeIfUnused(pBt);
drha059ad02001-04-17 20:09:11 +00001295 sqliteFree(pCur);
drh8c42ca92001-06-22 19:15:00 +00001296 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001297}
1298
drh7e3b0a02001-04-28 16:52:40 +00001299/*
drh5e2f8b92001-05-28 00:41:15 +00001300** Make a temporary cursor by filling in the fields of pTempCur.
1301** The temporary cursor is not on the cursor list for the Btree.
1302*/
drh14acc042001-06-10 19:56:58 +00001303static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
drh5e2f8b92001-05-28 00:41:15 +00001304 memcpy(pTempCur, pCur, sizeof(*pCur));
1305 pTempCur->pNext = 0;
1306 pTempCur->pPrev = 0;
drhecdc7532001-09-23 02:35:53 +00001307 if( pTempCur->pPage ){
drh3aac2dd2004-04-26 14:10:20 +00001308 sqlitepager_ref(pTempCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001309 }
drh5e2f8b92001-05-28 00:41:15 +00001310}
1311
1312/*
drhbd03cae2001-06-02 02:40:57 +00001313** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00001314** function above.
1315*/
drh14acc042001-06-10 19:56:58 +00001316static void releaseTempCursor(BtCursor *pCur){
drhecdc7532001-09-23 02:35:53 +00001317 if( pCur->pPage ){
drh3aac2dd2004-04-26 14:10:20 +00001318 sqlitepager_unref(pCur->pPage->aData);
drhecdc7532001-09-23 02:35:53 +00001319 }
drh5e2f8b92001-05-28 00:41:15 +00001320}
1321
1322/*
drh3aac2dd2004-04-26 14:10:20 +00001323** Set *pSize to the size of the buffer needed to hold the value of
1324** the key for the current entry. If the cursor is not pointing
1325** to a valid entry, *pSize is set to 0.
1326**
drh4b70f112004-05-02 21:12:19 +00001327** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00001328** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00001329*/
drh3aac2dd2004-04-26 14:10:20 +00001330int sqlite3BtreeKeySize(BtCursor *pCur, u64 *pSize){
drh2af926b2001-05-15 00:39:25 +00001331 MemPage *pPage;
1332
1333 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001334 assert( pPage!=0 );
1335 if( pCur->idx >= pPage->nCell ){
drh72f82862001-05-24 21:06:34 +00001336 *pSize = 0;
1337 }else{
drh3aac2dd2004-04-26 14:10:20 +00001338 unsigned char *cell = pPage->aCell[pCur->idx];
1339 cell += 2; /* Skip the offset to the next cell */
1340 if( pPage->leaf ){
1341 cell += 4; /* Skip the child pointer */
1342 }
1343 if( !pPage->zeroData ){
1344 while( (0x80&*(data++))!=0 ){} /* Skip the data size number */
1345 }
1346 getVarint(data, pSize);
drh72f82862001-05-24 21:06:34 +00001347 }
1348 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00001349}
drh2af926b2001-05-15 00:39:25 +00001350
drh72f82862001-05-24 21:06:34 +00001351/*
1352** Read payload information from the entry that the pCur cursor is
1353** pointing to. Begin reading the payload at "offset" and read
1354** a total of "amt" bytes. Put the result in zBuf.
1355**
1356** This routine does not make a distinction between key and data.
1357** It just reads bytes from the payload area.
1358*/
drh3aac2dd2004-04-26 14:10:20 +00001359static int getPayload(
1360 BtCursor *pCur, /* Cursor pointing to entry to read from */
1361 int offset, /* Begin reading this far into payload */
1362 int amt, /* Read this many bytes */
1363 unsigned char *pBuf, /* Write the bytes into this buffer */
1364 int skipKey /* offset begins at data if this is true */
1365){
1366 unsigned char *aPayload;
drh2af926b2001-05-15 00:39:25 +00001367 Pgno nextPage;
drh8c42ca92001-06-22 19:15:00 +00001368 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001369 MemPage *pPage;
1370 Btree *pBt;
1371 u64 nData, nKey;
1372 int maxLocal, ovflSize;
1373
drh72f82862001-05-24 21:06:34 +00001374 assert( pCur!=0 && pCur->pPage!=0 );
drh3aac2dd2004-04-26 14:10:20 +00001375 pBt = pCur->pBt;
1376 pPage = pCur->pPage;
1377 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1378 aPayload = pPage->aCell[pCur->idx];
1379 aPayload += 2; /* Skip the next cell index */
1380 if( pPage->leaf ){
1381 aPayload += 4; /* Skip the child pointer */
1382 }
1383 if( pPage->zeroData ){
1384 nData = 0;
1385 }else{
1386 aPayload += getVarint(aPayload, &nData);
1387 }
1388 aPayload += getVarInt(aPayload, &nKey);
1389 if( pPage->intKey ){
1390 nKey = 0;
1391 }
1392 assert( offset>=0 );
1393 if( skipKey ){
1394 offset += nKey;
1395 }
1396 if( offset+amt > nKey+nData ){
1397 sqlite SQLITE_ERROR;
1398 }
1399 maxLocal = pBt->maxLocal
1400 if( offset<maxLocal ){
drh2af926b2001-05-15 00:39:25 +00001401 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001402 if( a+offset>maxLocal ){
1403 a = maxLocal - offset;
drh2af926b2001-05-15 00:39:25 +00001404 }
drh5e2f8b92001-05-28 00:41:15 +00001405 memcpy(zBuf, &aPayload[offset], a);
drh2af926b2001-05-15 00:39:25 +00001406 if( a==amt ){
1407 return SQLITE_OK;
1408 }
drh2aa679f2001-06-25 02:11:07 +00001409 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001410 zBuf += a;
1411 amt -= a;
drhdd793422001-06-28 01:54:48 +00001412 }else{
drh3aac2dd2004-04-26 14:10:20 +00001413 offset -= maxLocal;
drhbd03cae2001-06-02 02:40:57 +00001414 }
1415 if( amt>0 ){
drh3aac2dd2004-04-26 14:10:20 +00001416 nextPage = get4bytes(&aPayload[maxLocal]);
drh2af926b2001-05-15 00:39:25 +00001417 }
drh3aac2dd2004-04-26 14:10:20 +00001418 ovflSize = pBt->pageSize - 4;
drh2af926b2001-05-15 00:39:25 +00001419 while( amt>0 && nextPage ){
drh3aac2dd2004-04-26 14:10:20 +00001420 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&aPayload);
drh2af926b2001-05-15 00:39:25 +00001421 if( rc!=0 ){
1422 return rc;
1423 }
drh3aac2dd2004-04-26 14:10:20 +00001424 nextPage = get4bytes(aPayload);
1425 if( offset<ovflSize ){
drh2af926b2001-05-15 00:39:25 +00001426 int a = amt;
drh3aac2dd2004-04-26 14:10:20 +00001427 if( a + offset > ovflSize ){
1428 a = ovflSize - offset;
drh2af926b2001-05-15 00:39:25 +00001429 }
drh3aac2dd2004-04-26 14:10:20 +00001430 memcpy(zBuf, &aPayload[offset], a);
drh2aa679f2001-06-25 02:11:07 +00001431 offset = 0;
drh2af926b2001-05-15 00:39:25 +00001432 amt -= a;
1433 zBuf += a;
drh2aa679f2001-06-25 02:11:07 +00001434 }else{
drh3aac2dd2004-04-26 14:10:20 +00001435 offset -= ovflSize;
drh2af926b2001-05-15 00:39:25 +00001436 }
drh3aac2dd2004-04-26 14:10:20 +00001437 sqlitepager_unref(aPayload);
drh2af926b2001-05-15 00:39:25 +00001438 }
drha7fcb052001-12-14 15:09:55 +00001439 if( amt>0 ){
1440 return SQLITE_CORRUPT;
1441 }
1442 return SQLITE_OK;
drh2af926b2001-05-15 00:39:25 +00001443}
1444
drh72f82862001-05-24 21:06:34 +00001445/*
drh3aac2dd2004-04-26 14:10:20 +00001446** Read part of the key associated with cursor pCur. Exactly
1447** "amt" bytes will be transfered into zBuf[]. The transfer
1448** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00001449**
drh3aac2dd2004-04-26 14:10:20 +00001450** Return SQLITE_OK on success or an error code if anything goes
1451** wrong. An error is returned if "offset+amt" is larger than
1452** the available payload.
drh72f82862001-05-24 21:06:34 +00001453*/
drh3aac2dd2004-04-26 14:10:20 +00001454int sqlite3BtreeKey(BtCursor *pCur, int offset, int amt, void *pBuf){
drh72f82862001-05-24 21:06:34 +00001455 MemPage *pPage;
drha059ad02001-04-17 20:09:11 +00001456
drh8c1238a2003-01-02 14:43:55 +00001457 assert( amt>=0 );
1458 assert( offset>=0 );
1459 assert( pCur->pPage!=0 );
drh72f82862001-05-24 21:06:34 +00001460 pPage = pCur->pPage;
drh3aac2dd2004-04-26 14:10:20 +00001461 if( pCur->idx >= pPage->nCell || pPage->intKey ){
1462 assert( amt==0 );
1463 return SQLITE_OK;
1464 }
1465 return getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
1466}
1467
1468/*
1469** Return a pointer to the key of record that cursor pCur
1470** is point to if the entire key is in contiguous memory.
1471** If the key is split up among multiple tables, return 0.
1472** If pCur is not pointing to a valid entry return 0.
1473**
1474** The pointer returned is ephemeral. The key may move
1475** or be destroyed on the next call to any Btree routine.
1476**
1477** This routine is used to do quick key comparisons in the
1478** common case where the entire key fits in the payload area
drh4b70f112004-05-02 21:12:19 +00001479** of a cell and does not overflow onto secondary pages. If
1480** this routine returns 0 for a valid cursor, the caller will
1481** need to allocate a buffer big enough to hold the whole key
1482** then use sqlite3BtreeKey() to copy the key value into the
1483** buffer. That is substantially slower. But fortunately,
1484** most keys are small enough to fit in the payload area so
1485** the slower method is rarely needed.
drh3aac2dd2004-04-26 14:10:20 +00001486*/
1487void *sqlite3BtreeKeyFetch(BtCursor *pCur){
1488 unsigned char *aPayload;
1489 MemPage *pPage;
1490 Btree *pBt;
1491 u64 nData, nKey;
1492
1493 assert( pCur!=0 && pCur->pPage!=0 );
1494 pBt = pCur->pBt;
1495 pPage = pCur->pPage;
1496 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
1497 aPayload = pPage->aCell[pCur->idx];
1498 aPayload += 2; /* Skip the next cell index */
1499 if( pPage->leaf ){
1500 aPayload += 4; /* Skip the child pointer */
1501 }
1502 if( !pPage->zeroData ){
1503 aPayload += getVarint(aPayload, &nData);
1504 }
1505 aPayload += getVarInt(aPayload, &nKey);
1506 if( pPage->intKey || nKey>pBt->maxLocal ){
drh5e00f6c2001-09-13 13:46:56 +00001507 return 0;
drh72f82862001-05-24 21:06:34 +00001508 }
drh3aac2dd2004-04-26 14:10:20 +00001509 return aPayload;
drh72f82862001-05-24 21:06:34 +00001510}
1511
drh3aac2dd2004-04-26 14:10:20 +00001512
drh72f82862001-05-24 21:06:34 +00001513/*
drhbd03cae2001-06-02 02:40:57 +00001514** Set *pSize to the number of bytes of data in the entry the
1515** cursor currently points to. Always return SQLITE_OK.
1516** Failure is not possible. If the cursor is not currently
1517** pointing to an entry (which can happen, for example, if
1518** the database is empty) then *pSize is set to 0.
drh72f82862001-05-24 21:06:34 +00001519*/
drh3aac2dd2004-04-26 14:10:20 +00001520int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drh72f82862001-05-24 21:06:34 +00001521 MemPage *pPage;
1522
1523 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001524 assert( pPage!=0 );
drh3aac2dd2004-04-26 14:10:20 +00001525 if( pCur->idx >= pPage->nCell || pPage->zeroData ){
drh72f82862001-05-24 21:06:34 +00001526 *pSize = 0;
1527 }else{
drh3aac2dd2004-04-26 14:10:20 +00001528 unsigned char *cell;
1529 u64 size;
1530 cell = pPage->aCell[pCur->idx];
1531 cell += 2; /* Skip the offset to the next cell */
1532 if( pPage->leaf ){
1533 cell += 4; /* Skip the child pointer */
1534 }
1535 getVarint(data, size);
1536 assert( (size & 0x00000000ffffffff)==size );
1537 *pSize = size;
drh72f82862001-05-24 21:06:34 +00001538 }
1539 return SQLITE_OK;
1540}
1541
1542/*
drh3aac2dd2004-04-26 14:10:20 +00001543** Read part of the data associated with cursor pCur. Exactly
1544** "amt" bytes will be transfered into zBuf[]. The transfer
1545** begins at "offset".
1546**
1547** Return SQLITE_OK on success or an error code if anything goes
1548** wrong. An error is returned if "offset+amt" is larger than
1549** the available payload.
drh72f82862001-05-24 21:06:34 +00001550*/
drh3aac2dd2004-04-26 14:10:20 +00001551int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh72f82862001-05-24 21:06:34 +00001552 MemPage *pPage;
1553
drh8c1238a2003-01-02 14:43:55 +00001554 assert( amt>=0 );
1555 assert( offset>=0 );
1556 assert( pCur->pPage!=0 );
drh72f82862001-05-24 21:06:34 +00001557 pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001558 if( pCur->idx >= pPage->nCell ){
drh5e00f6c2001-09-13 13:46:56 +00001559 return 0;
drh72f82862001-05-24 21:06:34 +00001560 }
drh5e2f8b92001-05-28 00:41:15 +00001561 pCell = pPage->apCell[pCur->idx];
drh3aac2dd2004-04-26 14:10:20 +00001562 return getPayload(pCur, offset, amt, pBuf, 1);
drh2af926b2001-05-15 00:39:25 +00001563}
1564
drh72f82862001-05-24 21:06:34 +00001565/*
drh8178a752003-01-05 21:41:40 +00001566** Move the cursor down to a new child page. The newPgno argument is the
1567** page number of the child page in the byte order of the disk image.
drh72f82862001-05-24 21:06:34 +00001568*/
drh3aac2dd2004-04-26 14:10:20 +00001569static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00001570 int rc;
1571 MemPage *pNewPage;
drh3aac2dd2004-04-26 14:10:20 +00001572 MemPage *pOldPage;
1573 unsigned char *aData;
drh0d316a42002-08-11 20:10:47 +00001574 Btree *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00001575
drh3aac2dd2004-04-26 14:10:20 +00001576 rc = getPage(pBt, newPgno, &pNewPage);
drh6019e162001-07-02 17:51:45 +00001577 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001578 rc = initPage(pNewPage, pCur->pPage);
drh6019e162001-07-02 17:51:45 +00001579 if( rc ) return rc;
drh428ae8c2003-01-04 16:48:09 +00001580 pNewPage->idxParent = pCur->idx;
drh3aac2dd2004-04-26 14:10:20 +00001581 pOldPage = pCur->pPage;
1582 pOldPage->idxShift = 0;
1583 releasePage(pOldPage);
drh72f82862001-05-24 21:06:34 +00001584 pCur->pPage = pNewPage;
1585 pCur->idx = 0;
drh4be295b2003-12-16 03:44:47 +00001586 if( pNewPage->nCell<1 ){
1587 return SQLITE_CORRUPT;
1588 }
drh72f82862001-05-24 21:06:34 +00001589 return SQLITE_OK;
1590}
1591
1592/*
drh8856d6a2004-04-29 14:42:46 +00001593** Return true if the page is the virtual root of its table.
1594**
1595** The virtual root page is the root page for most tables. But
1596** for the table rooted on page 1, sometime the real root page
1597** is empty except for the right-pointer. In such cases the
1598** virtual root page is the page that the right-pointer of page
1599** 1 is pointing to.
1600*/
1601static int isRootPage(MemPage *pPage){
1602 MemPage *pParent = pPage->pParent;
1603 assert( pParent==0 || pParent->isInit );
1604 if( pParent || (pParent->pgno==1 && pParent->nCell==0) ) return 1;
1605 return 0;
1606}
1607
1608/*
drh5e2f8b92001-05-28 00:41:15 +00001609** Move the cursor up to the parent page.
1610**
1611** pCur->idx is set to the cell index that contains the pointer
1612** to the page we are coming from. If we are coming from the
1613** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00001614** the largest cell index.
drh72f82862001-05-24 21:06:34 +00001615*/
drh8178a752003-01-05 21:41:40 +00001616static void moveToParent(BtCursor *pCur){
drh72f82862001-05-24 21:06:34 +00001617 Pgno oldPgno;
1618 MemPage *pParent;
drh8178a752003-01-05 21:41:40 +00001619 MemPage *pPage;
drh428ae8c2003-01-04 16:48:09 +00001620 int idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001621
drh8178a752003-01-05 21:41:40 +00001622 pPage = pCur->pPage;
1623 assert( pPage!=0 );
drh8856d6a2004-04-29 14:42:46 +00001624 assert( !isRootPage(pPage) );
drh8178a752003-01-05 21:41:40 +00001625 pParent = pPage->pParent;
1626 assert( pParent!=0 );
1627 idxParent = pPage->idxParent;
drh3aac2dd2004-04-26 14:10:20 +00001628 sqlitepager_ref(pParent->aData);
1629 oldPgno = pPage->pgno;
1630 releasePage(pPage);
drh72f82862001-05-24 21:06:34 +00001631 pCur->pPage = pParent;
drh428ae8c2003-01-04 16:48:09 +00001632 assert( pParent->idxShift==0 );
1633 if( pParent->idxShift==0 ){
1634 pCur->idx = idxParent;
1635#ifndef NDEBUG
1636 /* Verify that pCur->idx is the correct index to point back to the child
1637 ** page we just came from
1638 */
drh428ae8c2003-01-04 16:48:09 +00001639 if( pCur->idx<pParent->nCell ){
drh3aac2dd2004-04-26 14:10:20 +00001640 assert( get4Byte(&pParent->aCell[idxParent][2])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001641 }else{
drh3aac2dd2004-04-26 14:10:20 +00001642 assert( get4Byte(&pParent->aData[pParent->hdrOffset+6])==oldPgno );
drh428ae8c2003-01-04 16:48:09 +00001643 }
1644#endif
1645 }else{
1646 /* The MemPage.idxShift flag indicates that cell indices might have
1647 ** changed since idxParent was set and hence idxParent might be out
1648 ** of date. So recompute the parent cell index by scanning all cells
1649 ** and locating the one that points to the child we just came from.
1650 */
1651 int i;
1652 pCur->idx = pParent->nCell;
drh428ae8c2003-01-04 16:48:09 +00001653 for(i=0; i<pParent->nCell; i++){
drh3aac2dd2004-04-26 14:10:20 +00001654 if( get4byte(&pParent->aCell[i][2])==oldPgno ){
drh428ae8c2003-01-04 16:48:09 +00001655 pCur->idx = i;
1656 break;
1657 }
drh72f82862001-05-24 21:06:34 +00001658 }
1659 }
1660}
1661
1662/*
1663** Move the cursor to the root page
1664*/
drh5e2f8b92001-05-28 00:41:15 +00001665static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00001666 MemPage *pRoot;
drhbd03cae2001-06-02 02:40:57 +00001667 int rc;
drh0d316a42002-08-11 20:10:47 +00001668 Btree *pBt = pCur->pBt;
drhbd03cae2001-06-02 02:40:57 +00001669
drh8856d6a2004-04-29 14:42:46 +00001670 rc = getPage(pBt, pCur->pgnoRoot, &pRoot);
drhbd03cae2001-06-02 02:40:57 +00001671 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001672 rc = initPage(pRoot, 0);
drh6019e162001-07-02 17:51:45 +00001673 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001674 releasePage(pCur->pPage);
1675 pCur->pPage = pRoot;
drh72f82862001-05-24 21:06:34 +00001676 pCur->idx = 0;
drh8856d6a2004-04-29 14:42:46 +00001677 if( pRoot->nCell==0 && !pRoot->leaf ){
1678 Pgno subpage;
1679 assert( pRoot->pgno==1 );
1680 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+6]);
1681 assert( subpage>0 );
drh4b70f112004-05-02 21:12:19 +00001682 rc = moveToChild(pCur, subpage);
drh8856d6a2004-04-29 14:42:46 +00001683 }
1684 return rc;
drh72f82862001-05-24 21:06:34 +00001685}
drh2af926b2001-05-15 00:39:25 +00001686
drh5e2f8b92001-05-28 00:41:15 +00001687/*
1688** Move the cursor down to the left-most leaf entry beneath the
1689** entry to which it is currently pointing.
1690*/
1691static int moveToLeftmost(BtCursor *pCur){
1692 Pgno pgno;
1693 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001694 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00001695
drh3aac2dd2004-04-26 14:10:20 +00001696 while( !(pPage = pCur->pPage)->leaf ){
1697 assert( pCur->idx>=0 && pCur->idx<pPage->nPage );
1698 pgno = get4byte(pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001699 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00001700 if( rc ) return rc;
1701 }
1702 return SQLITE_OK;
1703}
1704
drh2dcc9aa2002-12-04 13:40:25 +00001705/*
1706** Move the cursor down to the right-most leaf entry beneath the
1707** page to which it is currently pointing. Notice the difference
1708** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1709** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1710** finds the right-most entry beneath the *page*.
1711*/
1712static int moveToRightmost(BtCursor *pCur){
1713 Pgno pgno;
1714 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001715 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001716
drh3aac2dd2004-04-26 14:10:20 +00001717 while( !(pPage = pCur->pPage)->leaf ){
1718 pgno = get4byte(&pPage->aData[pPage->hdrOffset+6]);
1719 pCur->idx = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00001720 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001721 if( rc ) return rc;
1722 }
drh3aac2dd2004-04-26 14:10:20 +00001723 pCur->idx = pPage->nCell - 1;
drh2dcc9aa2002-12-04 13:40:25 +00001724 return SQLITE_OK;
1725}
1726
drh5e00f6c2001-09-13 13:46:56 +00001727/* Move the cursor to the first entry in the table. Return SQLITE_OK
1728** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001729** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00001730*/
drh3aac2dd2004-04-26 14:10:20 +00001731int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00001732 int rc;
drhecdc7532001-09-23 02:35:53 +00001733 if( pCur->pPage==0 ) return SQLITE_ABORT;
drh5e00f6c2001-09-13 13:46:56 +00001734 rc = moveToRoot(pCur);
1735 if( rc ) return rc;
1736 if( pCur->pPage->nCell==0 ){
1737 *pRes = 1;
1738 return SQLITE_OK;
1739 }
1740 *pRes = 0;
1741 rc = moveToLeftmost(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00001742 pCur->eSkip = SKIP_NONE;
drh5e00f6c2001-09-13 13:46:56 +00001743 return rc;
1744}
drh5e2f8b92001-05-28 00:41:15 +00001745
drh9562b552002-02-19 15:00:07 +00001746/* Move the cursor to the last entry in the table. Return SQLITE_OK
1747** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00001748** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00001749*/
drh3aac2dd2004-04-26 14:10:20 +00001750int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00001751 int rc;
drh9562b552002-02-19 15:00:07 +00001752 if( pCur->pPage==0 ) return SQLITE_ABORT;
1753 rc = moveToRoot(pCur);
1754 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00001755 assert( pCur->pPage->isInit );
drh9562b552002-02-19 15:00:07 +00001756 if( pCur->pPage->nCell==0 ){
1757 *pRes = 1;
1758 return SQLITE_OK;
1759 }
1760 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001761 rc = moveToRightmost(pCur);
1762 pCur->eSkip = SKIP_NONE;
drh9562b552002-02-19 15:00:07 +00001763 return rc;
1764}
1765
drh3aac2dd2004-04-26 14:10:20 +00001766/* Move the cursor so that it points to an entry near pKey/nKey.
drh72f82862001-05-24 21:06:34 +00001767** Return a success code.
1768**
drh3aac2dd2004-04-26 14:10:20 +00001769** For INTKEY tables, only the nKey parameter is used. pKey is
1770** ignored. For other tables, nKey is the number of bytes of data
1771** in nKey. The comparison function specified when the cursor was
1772** created is used to compare keys.
1773**
drh5e2f8b92001-05-28 00:41:15 +00001774** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00001775** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00001776** were present. The cursor might point to an entry that comes
1777** before or after the key.
1778**
drhbd03cae2001-06-02 02:40:57 +00001779** The result of comparing the key with the entry to which the
1780** cursor is left pointing is stored in pCur->iMatch. The same
1781** value is also written to *pRes if pRes!=NULL. The meaning of
1782** this value is as follows:
1783**
1784** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00001785** is smaller than pKey or if the table is empty
1786** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00001787**
1788** *pRes==0 The cursor is left pointing at an entry that
1789** exactly matches pKey.
1790**
1791** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00001792** is larger than pKey.
drha059ad02001-04-17 20:09:11 +00001793*/
drh3aac2dd2004-04-26 14:10:20 +00001794int sqlite3BtreeMoveto(BtCursor *pCur, const void *pKey, u64 nKey, int *pRes){
drh72f82862001-05-24 21:06:34 +00001795 int rc;
drhecdc7532001-09-23 02:35:53 +00001796 if( pCur->pPage==0 ) return SQLITE_ABORT;
drh2dcc9aa2002-12-04 13:40:25 +00001797 pCur->eSkip = SKIP_NONE;
drh5e2f8b92001-05-28 00:41:15 +00001798 rc = moveToRoot(pCur);
drh72f82862001-05-24 21:06:34 +00001799 if( rc ) return rc;
1800 for(;;){
1801 int lwr, upr;
1802 Pgno chldPg;
1803 MemPage *pPage = pCur->pPage;
drh1a844c32002-12-04 22:29:28 +00001804 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00001805 lwr = 0;
1806 upr = pPage->nCell-1;
1807 while( lwr<=upr ){
drh3aac2dd2004-04-26 14:10:20 +00001808 void *pCellKey;
1809 u64 nCellKey;
drh72f82862001-05-24 21:06:34 +00001810 pCur->idx = (lwr+upr)/2;
drh3aac2dd2004-04-26 14:10:20 +00001811 nCellKey = sqlite3BtreeKeySize(pCur, &nCellKey);
1812 if( pPage->intKey ){
1813 if( nCellKey<nKey ){
1814 c = -1;
1815 }else if( nCellKey>nKey ){
1816 c = +1;
1817 }else{
1818 c = 0;
1819 }
1820 }else if( (pCellKey = sqlite3BtreeKeyFetch(pCur))!=0 ){
1821 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1822 }else{
1823 pCellKey = sqliteMalloc( nCellKey );
1824 if( pCellKey==0 ) return SQLITE_NOMEM;
1825 rc = sqlite3BtreeKey(pCur, 0, nCellKey, pCellKey);
1826 c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
1827 sqliteFree(pCellKey);
1828 if( rc ) return rc;
1829 }
drh72f82862001-05-24 21:06:34 +00001830 if( c==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001831 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001832 if( pRes ) *pRes = 0;
1833 return SQLITE_OK;
1834 }
1835 if( c<0 ){
1836 lwr = pCur->idx+1;
1837 }else{
1838 upr = pCur->idx-1;
1839 }
1840 }
1841 assert( lwr==upr+1 );
drh7aa128d2002-06-21 13:09:16 +00001842 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00001843 if( pPage->leaf ){
1844 chldpg = 0;
1845 }else if( lwr>=pPage->nCell ){
1846 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh72f82862001-05-24 21:06:34 +00001847 }else{
drh3aac2dd2004-04-26 14:10:20 +00001848 chldPg = get4byte(&pPage->aCell[lwr][2]);
drh72f82862001-05-24 21:06:34 +00001849 }
1850 if( chldPg==0 ){
drh5e2f8b92001-05-28 00:41:15 +00001851 pCur->iMatch = c;
drh72f82862001-05-24 21:06:34 +00001852 if( pRes ) *pRes = c;
1853 return SQLITE_OK;
1854 }
drh428ae8c2003-01-04 16:48:09 +00001855 pCur->idx = lwr;
drh8178a752003-01-05 21:41:40 +00001856 rc = moveToChild(pCur, chldPg);
drh72f82862001-05-24 21:06:34 +00001857 if( rc ) return rc;
1858 }
drhbd03cae2001-06-02 02:40:57 +00001859 /* NOT REACHED */
drh72f82862001-05-24 21:06:34 +00001860}
1861
1862/*
drhbd03cae2001-06-02 02:40:57 +00001863** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00001864** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00001865** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00001866** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00001867*/
drh3aac2dd2004-04-26 14:10:20 +00001868int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00001869 int rc;
drh8178a752003-01-05 21:41:40 +00001870 MemPage *pPage = pCur->pPage;
drh8c1238a2003-01-02 14:43:55 +00001871 assert( pRes!=0 );
drh8178a752003-01-05 21:41:40 +00001872 if( pPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00001873 *pRes = 1;
drhecdc7532001-09-23 02:35:53 +00001874 return SQLITE_ABORT;
1875 }
drh8178a752003-01-05 21:41:40 +00001876 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00001877 assert( pCur->eSkip!=SKIP_INVALID );
drh8178a752003-01-05 21:41:40 +00001878 if( pPage->nCell==0 ){
drh8c1238a2003-01-02 14:43:55 +00001879 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001880 return SQLITE_OK;
1881 }
drh8178a752003-01-05 21:41:40 +00001882 assert( pCur->idx<pPage->nCell );
drh2dcc9aa2002-12-04 13:40:25 +00001883 if( pCur->eSkip==SKIP_NEXT ){
1884 pCur->eSkip = SKIP_NONE;
drh8c1238a2003-01-02 14:43:55 +00001885 *pRes = 0;
drh72f82862001-05-24 21:06:34 +00001886 return SQLITE_OK;
1887 }
drh2dcc9aa2002-12-04 13:40:25 +00001888 pCur->eSkip = SKIP_NONE;
drh72f82862001-05-24 21:06:34 +00001889 pCur->idx++;
drh8178a752003-01-05 21:41:40 +00001890 if( pCur->idx>=pPage->nCell ){
drh3aac2dd2004-04-26 14:10:20 +00001891 if( !pPage->left ){
1892 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh5e2f8b92001-05-28 00:41:15 +00001893 if( rc ) return rc;
1894 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001895 *pRes = 0;
1896 return rc;
drh72f82862001-05-24 21:06:34 +00001897 }
drh5e2f8b92001-05-28 00:41:15 +00001898 do{
drh8856d6a2004-04-29 14:42:46 +00001899 if( isRootPage(pPage) ){
drh8c1238a2003-01-02 14:43:55 +00001900 *pRes = 1;
drh5e2f8b92001-05-28 00:41:15 +00001901 return SQLITE_OK;
1902 }
drh8178a752003-01-05 21:41:40 +00001903 moveToParent(pCur);
1904 pPage = pCur->pPage;
1905 }while( pCur->idx>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00001906 *pRes = 0;
drh8178a752003-01-05 21:41:40 +00001907 return SQLITE_OK;
1908 }
1909 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00001910 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00001911 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00001912 }
drh5e2f8b92001-05-28 00:41:15 +00001913 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00001914 return rc;
drh72f82862001-05-24 21:06:34 +00001915}
1916
drh3b7511c2001-05-26 13:15:44 +00001917/*
drh2dcc9aa2002-12-04 13:40:25 +00001918** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00001919** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00001920** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00001921** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00001922*/
drh3aac2dd2004-04-26 14:10:20 +00001923int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00001924 int rc;
1925 Pgno pgno;
drh8178a752003-01-05 21:41:40 +00001926 MemPage *pPage;
1927 pPage = pCur->pPage;
1928 if( pPage==0 ){
1929 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001930 return SQLITE_ABORT;
1931 }
drh8178a752003-01-05 21:41:40 +00001932 assert( pPage->isInit );
drh2dcc9aa2002-12-04 13:40:25 +00001933 assert( pCur->eSkip!=SKIP_INVALID );
drh8178a752003-01-05 21:41:40 +00001934 if( pPage->nCell==0 ){
1935 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00001936 return SQLITE_OK;
1937 }
1938 if( pCur->eSkip==SKIP_PREV ){
1939 pCur->eSkip = SKIP_NONE;
drh8178a752003-01-05 21:41:40 +00001940 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001941 return SQLITE_OK;
1942 }
1943 pCur->eSkip = SKIP_NONE;
1944 assert( pCur->idx>=0 );
drh3aac2dd2004-04-26 14:10:20 +00001945 if( !pPage->left ){
1946 pgno = get4byte(&pPage->aCell[pCur->idx][2]);
drh8178a752003-01-05 21:41:40 +00001947 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00001948 if( rc ) return rc;
1949 rc = moveToRightmost(pCur);
1950 }else{
1951 while( pCur->idx==0 ){
drh8856d6a2004-04-29 14:42:46 +00001952 if( isRootPage(pPage) ){
drh2dcc9aa2002-12-04 13:40:25 +00001953 if( pRes ) *pRes = 1;
1954 return SQLITE_OK;
1955 }
drh8178a752003-01-05 21:41:40 +00001956 moveToParent(pCur);
1957 pPage = pCur->pPage;
drh2dcc9aa2002-12-04 13:40:25 +00001958 }
1959 pCur->idx--;
1960 rc = SQLITE_OK;
1961 }
drh8178a752003-01-05 21:41:40 +00001962 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00001963 return rc;
1964}
1965
1966/*
drh3b7511c2001-05-26 13:15:44 +00001967** Allocate a new page from the database file.
1968**
1969** The new page is marked as dirty. (In other words, sqlitepager_write()
1970** has already been called on the new page.) The new page has also
1971** been referenced and the calling routine is responsible for calling
1972** sqlitepager_unref() on the new page when it is done.
1973**
1974** SQLITE_OK is returned on success. Any other return value indicates
1975** an error. *ppPage and *pPgno are undefined in the event of an error.
1976** Do not invoke sqlitepager_unref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00001977**
drh199e3cf2002-07-18 11:01:47 +00001978** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1979** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00001980** attempt to keep related pages close to each other in the database file,
1981** which in turn can make database access faster.
drh3b7511c2001-05-26 13:15:44 +00001982*/
drh199e3cf2002-07-18 11:01:47 +00001983static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
drh3aac2dd2004-04-26 14:10:20 +00001984 u32 pn;
1985 MemPage *pPage1;
1986 MemPage *pPage;
drh8c42ca92001-06-22 19:15:00 +00001987 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001988 int n; /* Number of pages on the freelist */
1989 int k; /* Number of leaves on the trunk of the freelist */
drh30e58752002-03-02 20:41:57 +00001990
drh3aac2dd2004-04-26 14:10:20 +00001991 pPage1 = pBt->pPage1;
1992 n = get4byte(&pPage1->aData[36]);
1993 if( n>0 ){
1994 /* There exists pages on the freelist. Reuse one of those pages. */
1995 MemPage *pTrunk;
1996 rc = sqlitepager_write(pPage1->aData);
drh3b7511c2001-05-26 13:15:44 +00001997 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00001998 put4byte(&pPage1->aData[36], n-1);
1999 rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002000 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002001 rc = sqlitepager_write(pTrunk->aData);
drh3b7511c2001-05-26 13:15:44 +00002002 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00002003 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002004 return rc;
2005 }
drh3aac2dd2004-04-26 14:10:20 +00002006 k = get4byte(&pTrunk->aData[4]);
2007 if( k==0 ){
2008 /* The trunk has no leaves. So extract the trunk page itself and
2009 ** use it as the newly allocated page */
2010 *pPgno = get4byte(pPage1->aData[32]);
2011 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
2012 *ppPage = pTrunk;
drh30e58752002-03-02 20:41:57 +00002013 }else{
drh3aac2dd2004-04-26 14:10:20 +00002014 /* Extract a leaf from the trunk */
2015 int closest;
2016 unsigned char *aData = pTrunk->aData;
2017 if( nearby>0 ){
drhbea00b92002-07-08 10:59:50 +00002018 int i, dist;
2019 closest = 0;
drh3aac2dd2004-04-26 14:10:20 +00002020 dist = get4byte(&aData[8]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002021 if( dist<0 ) dist = -dist;
drh0d316a42002-08-11 20:10:47 +00002022 for(i=1; i<n; i++){
drh3aac2dd2004-04-26 14:10:20 +00002023 int d2 = get4byte(&aData[8+i*4]) - nearby;
drhbea00b92002-07-08 10:59:50 +00002024 if( d2<0 ) d2 = -d2;
2025 if( d2<dist ) closest = i;
2026 }
2027 }else{
2028 closest = 0;
2029 }
drh3aac2dd2004-04-26 14:10:20 +00002030 put4byte(&aData[4], n-1);
2031 *pPgno = get4data(&aData[8+closest*4]);
2032 memcpy(&aData[8+closest*4], &aData[4+closest*n], 4);
2033 rc = getPage(pBt, *pPgno, ppPage);
2034 releasePage(pTrunk);
drh30e58752002-03-02 20:41:57 +00002035 if( rc==SQLITE_OK ){
2036 sqlitepager_dont_rollback(*ppPage);
drh3aac2dd2004-04-26 14:10:20 +00002037 rc = sqlitepager_write((*ppPage)->aData);
drh30e58752002-03-02 20:41:57 +00002038 }
2039 }
drh3b7511c2001-05-26 13:15:44 +00002040 }else{
drh3aac2dd2004-04-26 14:10:20 +00002041 /* There are no pages on the freelist, so create a new page at the
2042 ** end of the file */
drh2aa679f2001-06-25 02:11:07 +00002043 *pPgno = sqlitepager_pagecount(pBt->pPager) + 1;
drh3aac2dd2004-04-26 14:10:20 +00002044 rc = getPage(pBt, *pPgno, ppPage);
drh3b7511c2001-05-26 13:15:44 +00002045 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002046 rc = sqlitepager_write((*ppPage)->aData);
drh3b7511c2001-05-26 13:15:44 +00002047 }
2048 return rc;
2049}
2050
2051/*
drh3aac2dd2004-04-26 14:10:20 +00002052** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00002053**
drhdd793422001-06-28 01:54:48 +00002054** sqlitepager_unref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00002055*/
drh3aac2dd2004-04-26 14:10:20 +00002056static int freePage(MemPage *pPage){
2057 Btree *pBt = pPage->pBt;
2058 MemPage *pPage1 = pBt->pPage1;
2059 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00002060
drh3aac2dd2004-04-26 14:10:20 +00002061 /* Prepare the page for freeing */
2062 assert( pPage->pgno>1 );
2063 pPage->isInit = 0;
2064 releasePage(pPage->pParent);
2065 pPage->pParent = 0;
2066
2067 /* Increment the free page count on page1 */
2068 rc = sqlitepager_write(pPage1->aData);
2069 if( rc ) return rc;
2070 n = get4byte(&pPage1->aData[36]);
2071 put4byte(&pPage1->aData[36], n+1);
2072
2073 if( n==0 ){
2074 /* This is the first free page */
2075 memset(pPage->aData, 0, 8);
2076 put4byte(pPage1->aData[32], pPage->pgno);
2077 }else{
2078 /* Other free pages already exist. Retrive the first trunk page
2079 ** of the freelist and find out how many leaves it has. */
2080 MemPage *pTrunk
2081 rc = getPage(pBt, get4byte(pPage1->aData[32], &pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002082 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002083 k = get4byte(&pTrunk->aData[4]);
2084 if( k==pBt->pageSize/4 - 8 ){
2085 /* The trunk is full. Turn the page being freed into a new
2086 ** trunk page with no leaves. */
2087 rc = sqlitepager_write(pPage->aData);
2088 if( rc ) return rc;
2089 put4byte(pPage->aData, pTrunk->pgno);
2090 put4byte(&pPage->aData[4], 0);
2091 put4byte(&pPage1->aData[32], pPage->pgno);
2092 }else{
2093 /* Add the newly freed page as a leaf on the current trunk */
2094 rc = sqlitepager_write(pTrunk->aData);
2095 if( rc ) return rc;
2096 put4byte(&pTrunk->aData[4], k+1);
2097 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
2098 sqlitepager_dont_write(pBt->pPager, pPage->pgno);
2099 }
2100 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00002101 }
drh3b7511c2001-05-26 13:15:44 +00002102 return rc;
2103}
2104
2105/*
drh3aac2dd2004-04-26 14:10:20 +00002106** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00002107*/
drh3aac2dd2004-04-26 14:10:20 +00002108static int clearCell(MemPage *pPage, unsigned char *pCell){
2109 Btree *pBt = pPage->pBt;
2110 int rc, n;
2111 u64 nData, nKey;
2112 Pgno ovflPgno;
drh3b7511c2001-05-26 13:15:44 +00002113
drh3aac2dd2004-04-26 14:10:20 +00002114 parseCellHeader(pPage, pCell, &nData, &nKey, &n);
2115 nPayload = nData;
2116 if( !pPage->intKey ){
2117 nPayload += nKey;
drh5e2f8b92001-05-28 00:41:15 +00002118 }
drh3aac2dd2004-04-26 14:10:20 +00002119 if( nPayload<=pBt->maxLocal ){
2120 return; /* There are no overflow pages. Return without doing anything */
2121 }
2122 ovflPgno = get4byte(&pCell[n+pBt->maxLocal]);
2123 while( ovflPgno!=0 ){
2124 MemPage *pOvfl;
2125 rc = getPage(pBt, ovflPgno, &pOvfl);
drh3b7511c2001-05-26 13:15:44 +00002126 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002127 ovflPgno = get4byte(pOvfl->aData);
drhbd03cae2001-06-02 02:40:57 +00002128 rc = freePage(pBt, pOvfl, ovfl);
2129 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00002130 sqlitepager_unref(pOvfl->aData);
drh3b7511c2001-05-26 13:15:44 +00002131 }
drh5e2f8b92001-05-28 00:41:15 +00002132 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00002133}
2134
2135/*
drh4b70f112004-05-02 21:12:19 +00002136** Compute the number of bytes required by a cell header. Fill in
2137** the nData and nKey values of the header that pHeader points to.
drh3aac2dd2004-04-26 14:10:20 +00002138*/
2139static int makeCellHeader(
2140 MemPage *pPage, /* The page that will contain the cell */
2141 u64 nKey, /* Size of key, or the key value if intKey */
2142 int nData, /* Size of data. Ignored for zerodata */
2143 unsigned char *pHeader /* Write header bytes here */
2144){
2145 int n = 2;
2146 if( !pPage->leaf ) n += 4;
2147 if( !pPage->zeroData ){
2148 n += putVarint(&pHeader[n], nData);
2149 }
2150 n += putVarint(&pHeader[n], nKey);
2151 return n;
2152}
2153
2154/*
2155** Fill in the payload section of a cell into the space provided. If
2156** the payload will not completely fit in the cell, allocate additional
2157** overflow pages and fill them in.
drh3b7511c2001-05-26 13:15:44 +00002158*/
2159static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00002160 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00002161 unsigned char *pCell, /* Complete text of the cell */
drh3aac2dd2004-04-26 14:10:20 +00002162 const void *pKey, u64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00002163 const void *pData,int nData, /* The data */
2164 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00002165){
drh3b7511c2001-05-26 13:15:44 +00002166 int nPayload;
drh3aac2dd2004-04-26 14:10:20 +00002167 const void *pSrc;
2168 int nSrc, nSrc2;
2169 int spaceLeft;
2170 MemPage *pOvfl = 0;
2171 unsigned char *pPrior;
2172 unsigned char *pPayload;
2173 Btree *pBt = pPage->pBt;
2174 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00002175 int nHeader;
drh3b7511c2001-05-26 13:15:44 +00002176
drh4b70f112004-05-02 21:12:19 +00002177 nHeader = makeCellHeader(pPage, pCell, nKey, nData);
drh3aac2dd2004-04-26 14:10:20 +00002178 nPayload = nData;
2179 if( pPage->intKey ){
2180 pSrc = pData;
2181 nSrc = nData;
2182 nSrc2 = 0;
2183 }else{
2184 nPayload += nKey;
2185 pSrc = pKey;
2186 nSrc = nKey;
2187 }
drh4b70f112004-05-02 21:12:19 +00002188 if( nPayload>pBt->maxLocal ){
2189 *pnSize = nHeader + pBt->maxLocal + 4;
2190 }else{
2191 *pnSize = nHeader + nPayload;
2192 }
drh3aac2dd2004-04-26 14:10:20 +00002193 spaceLeft = pBt->maxLocal;
2194 pPayload = &pCell[nHeader];
2195 pPrior = &pPayload[pBt->maxLocal];
drh3b7511c2001-05-26 13:15:44 +00002196
drh3b7511c2001-05-26 13:15:44 +00002197 while( nPayload>0 ){
2198 if( spaceLeft==0 ){
drh3aac2dd2004-04-26 14:10:20 +00002199 rc = allocatePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl);
drh3b7511c2001-05-26 13:15:44 +00002200 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00002201 clearCell(pPage, pCell);
drh3b7511c2001-05-26 13:15:44 +00002202 return rc;
2203 }
drh3aac2dd2004-04-26 14:10:20 +00002204 put4byte(pPrior, pgnoOvfl);
2205 pPrior = pOvfl->aData;
2206 put4byte(pPrior, 0);
2207 pPayload = &pOvfl->aData[4];
2208 spaceLeft = pBt->pageSize - 4;
drh3b7511c2001-05-26 13:15:44 +00002209 }
2210 n = nPayload;
2211 if( n>spaceLeft ) n = spaceLeft;
drh3aac2dd2004-04-26 14:10:20 +00002212 if( n>nSrc ) n = nSrc;
2213 memcpy(pPayload, pSrc, n);
drh3b7511c2001-05-26 13:15:44 +00002214 nPayload -= n;
drh3aac2dd2004-04-26 14:10:20 +00002215 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00002216 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00002217 if( nSrc==0 ){
2218 nSrc = nData;
2219 pSrc = pData;
2220 }
2221 if( pOvfl && (spaceLeft==0 || nPayload==0) ){
2222 releasePage(pOvfl);
2223 }
drhdd793422001-06-28 01:54:48 +00002224 }
drh3b7511c2001-05-26 13:15:44 +00002225 return SQLITE_OK;
2226}
2227
2228/*
drhbd03cae2001-06-02 02:40:57 +00002229** Change the MemPage.pParent pointer on the page whose number is
drh8b2f49b2001-06-08 00:21:52 +00002230** given in the second argument so that MemPage.pParent holds the
drhbd03cae2001-06-02 02:40:57 +00002231** pointer in the third argument.
2232*/
drh4b70f112004-05-02 21:12:19 +00002233static void reparentPage(Btree *pBt, Pgno pgno, MemPage *pNewParent, int idx){
drhbd03cae2001-06-02 02:40:57 +00002234 MemPage *pThis;
drh4b70f112004-05-02 21:12:19 +00002235 unsigned char *aData;
drhbd03cae2001-06-02 02:40:57 +00002236
drhdd793422001-06-28 01:54:48 +00002237 if( pgno==0 ) return;
drh4b70f112004-05-02 21:12:19 +00002238 assert( pBt->pPager!=0 );
2239 aData = sqlitepager_lookup(pBt->pPager, pgno);
2240 pThis = (MemPage)&aData[pBt->pageSize];
drh6019e162001-07-02 17:51:45 +00002241 if( pThis && pThis->isInit ){
drhdd793422001-06-28 01:54:48 +00002242 if( pThis->pParent!=pNewParent ){
drh4b70f112004-05-02 21:12:19 +00002243 if( pThis->pParent ) sqlitepager_unref(pThis->pParent->aData);
drhdd793422001-06-28 01:54:48 +00002244 pThis->pParent = pNewParent;
drh4b70f112004-05-02 21:12:19 +00002245 if( pNewParent ) sqlitepager_ref(pNewParent->aData);
drhdd793422001-06-28 01:54:48 +00002246 }
drh428ae8c2003-01-04 16:48:09 +00002247 pThis->idxParent = idx;
drh4b70f112004-05-02 21:12:19 +00002248 sqlitepager_unref(aData);
drhbd03cae2001-06-02 02:40:57 +00002249 }
2250}
2251
2252/*
drh4b70f112004-05-02 21:12:19 +00002253** Change the pParent pointer of all children of pPage to point back
2254** to pPage.
2255**
drhbd03cae2001-06-02 02:40:57 +00002256** In other words, for every child of pPage, invoke reparentPage()
drh5e00f6c2001-09-13 13:46:56 +00002257** to make sure that each child knows that pPage is its parent.
drhbd03cae2001-06-02 02:40:57 +00002258**
2259** This routine gets called after you memcpy() one page into
2260** another.
2261*/
drh4b70f112004-05-02 21:12:19 +00002262static void reparentChildPages(MemPage *pPage){
drhbd03cae2001-06-02 02:40:57 +00002263 int i;
drh4b70f112004-05-02 21:12:19 +00002264 Btree *pBt;
2265
2266 if( pPage->left ) return;
2267 pBt = pPage->pBt;
drhbd03cae2001-06-02 02:40:57 +00002268 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002269 reparentPage(pBt, get4byte(&pPage->aCell[i][2]), pPage, i);
drhbd03cae2001-06-02 02:40:57 +00002270 }
drh4b70f112004-05-02 21:12:19 +00002271 reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+6]), pPage, i);
drh428ae8c2003-01-04 16:48:09 +00002272 pPage->idxShift = 0;
drh14acc042001-06-10 19:56:58 +00002273}
2274
2275/*
2276** Remove the i-th cell from pPage. This routine effects pPage only.
2277** The cell content is not freed or deallocated. It is assumed that
2278** the cell content has been copied someplace else. This routine just
2279** removes the reference to the cell from pPage.
2280**
2281** "sz" must be the number of bytes in the cell.
2282**
2283** Do not bother maintaining the integrity of the linked list of Cells.
drh8c42ca92001-06-22 19:15:00 +00002284** Only the pPage->apCell[] array is important. The relinkCellList()
2285** routine will be called soon after this routine in order to rebuild
2286** the linked list.
drh14acc042001-06-10 19:56:58 +00002287*/
drh4b70f112004-05-02 21:12:19 +00002288static void dropCell(MemPage *pPage, int idx, int sz){
drh14acc042001-06-10 19:56:58 +00002289 int j;
drh8c42ca92001-06-22 19:15:00 +00002290 assert( idx>=0 && idx<pPage->nCell );
drh4b70f112004-05-02 21:12:19 +00002291 assert( sz==cellSize(pPage, pPage->aCell[idx]) );
2292 assert( sqlitepager_iswriteable(pPage->aData) );
2293 assert( pPage->aCell[idx]>=pPage->aData );
2294 assert( pPage->aCell[idx]<&pPage->aData[pPage->pBt->pageSize-sz] );
2295 freeSpace(pPage, idx, sz);
drh7c717f72001-06-24 20:39:41 +00002296 for(j=idx; j<pPage->nCell-1; j++){
drh4b70f112004-05-02 21:12:19 +00002297 pPage->aCell[j] = pPage->aCell[j+1];
drh14acc042001-06-10 19:56:58 +00002298 }
2299 pPage->nCell--;
drh428ae8c2003-01-04 16:48:09 +00002300 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002301}
2302
2303/*
2304** Insert a new cell on pPage at cell index "i". pCell points to the
2305** content of the cell.
2306**
2307** If the cell content will fit on the page, then put it there. If it
2308** will not fit, then just make pPage->apCell[i] point to the content
2309** and set pPage->isOverfull.
2310**
2311** Do not bother maintaining the integrity of the linked list of Cells.
drh8c42ca92001-06-22 19:15:00 +00002312** Only the pPage->apCell[] array is important. The relinkCellList()
2313** routine will be called soon after this routine in order to rebuild
2314** the linked list.
drh14acc042001-06-10 19:56:58 +00002315*/
drh4b70f112004-05-02 21:12:19 +00002316static void insertCell(MemPage *pPage, int i, unsigned char *pCell, int sz){
drh14acc042001-06-10 19:56:58 +00002317 int idx, j;
2318 assert( i>=0 && i<=pPage->nCell );
drh0d316a42002-08-11 20:10:47 +00002319 assert( sz==cellSize(pBt, pCell) );
drh4b70f112004-05-02 21:12:19 +00002320 assert( sqlitepager_iswriteable(pPage->aData) );
drh0d316a42002-08-11 20:10:47 +00002321 idx = allocateSpace(pBt, pPage, sz);
drh4b70f112004-05-02 21:12:19 +00002322 resizeCellArray(pPage, pPage->nCell+1);
drh14acc042001-06-10 19:56:58 +00002323 for(j=pPage->nCell; j>i; j--){
drh4b70f112004-05-02 21:12:19 +00002324 pPage->aCell[j] = pPage->aCell[j-1];
drh14acc042001-06-10 19:56:58 +00002325 }
2326 pPage->nCell++;
drh14acc042001-06-10 19:56:58 +00002327 if( idx<=0 ){
2328 pPage->isOverfull = 1;
drh4b70f112004-05-02 21:12:19 +00002329 pPage->aCell[i] = pCell;
drh14acc042001-06-10 19:56:58 +00002330 }else{
drh4b70f112004-05-02 21:12:19 +00002331 memcpy(&pPage->aData[idx], pCell, sz);
2332 pPage->aCell[i] = &pPage->aData[idx];
drh14acc042001-06-10 19:56:58 +00002333 }
drh428ae8c2003-01-04 16:48:09 +00002334 pPage->idxShift = 1;
drh14acc042001-06-10 19:56:58 +00002335}
2336
2337/*
2338** Rebuild the linked list of cells on a page so that the cells
drh4b70f112004-05-02 21:12:19 +00002339** occur in the order specified by the pPage->aCell[] array.
drh8c42ca92001-06-22 19:15:00 +00002340** Invoke this routine once to repair damage after one or more
2341** invocations of either insertCell() or dropCell().
drh14acc042001-06-10 19:56:58 +00002342*/
drh4b70f112004-05-02 21:12:19 +00002343static void relinkCellList(MemPage *pPage){
2344 int i, idxFrom;
2345 assert( sqlitepager_iswriteable(pPage->aData) );
2346 idxFrom = pPage->hdrOffset+3;
drh14acc042001-06-10 19:56:58 +00002347 for(i=0; i<pPage->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002348 int idx = Addr(pPage->aCell[i]) - Addr(pPage);
2349 assert( idx>pPage->hdrOffset && idx<pPage->pBt->pageSize );
2350 put2byte(&pPage->aData[idxFrom], idx);
2351 idxFrom = idx;
drh14acc042001-06-10 19:56:58 +00002352 }
drh4b70f112004-05-02 21:12:19 +00002353 put2byte(&pPage->aData[idxFrom], 0);
drh14acc042001-06-10 19:56:58 +00002354}
2355
2356/*
drh4b70f112004-05-02 21:12:19 +00002357** Make a copy of the contents of pFrom into pTo. The pFrom->aCell[]
2358** pointers that point into pFrom->aData[] must be adjusted to point
2359** into pTo->aData[] instead. But some pFrom->aCell[] entries might
2360** not point to pFrom->aData[]. Those are unchanged.
drh14acc042001-06-10 19:56:58 +00002361*/
2362static void copyPage(MemPage *pTo, MemPage *pFrom){
2363 uptr from, to;
2364 int i;
drh4b70f112004-05-02 21:12:19 +00002365 int pageSize;
2366 int ofst;
2367
2368 assert( pTo->hdrOffset==0 );
2369 ofst = pFrom->hdrOffset;
2370 pageSize = pTo->pBt->pageSize;
2371 memcpy(pTo->aData, &pFrom->aData[ofst], pageSize - ofst);
drhdd793422001-06-28 01:54:48 +00002372 pTo->pParent = 0;
drh14acc042001-06-10 19:56:58 +00002373 pTo->isInit = 1;
drh4b70f112004-05-02 21:12:19 +00002374 resizeCellArray(pTo, pFrom->nCell);
drh14acc042001-06-10 19:56:58 +00002375 pTo->nCell = pFrom->nCell;
drh4b70f112004-05-02 21:12:19 +00002376 pTo->nFree = pFrom->nFree + ofst;
2377 assert( pTo->aData[5]<155 );
2378 pTo->aData[5] += ofst;
drh14acc042001-06-10 19:56:58 +00002379 pTo->isOverfull = pFrom->isOverfull;
drh4b70f112004-05-02 21:12:19 +00002380 to = Addr(pTo->aData);
2381 from = Addr(pFrom->aData);
drh14acc042001-06-10 19:56:58 +00002382 for(i=0; i<pTo->nCell; i++){
drh4b70f112004-05-02 21:12:19 +00002383 uptr x = Addr(pFrom->aCell[i]);
2384 if( x>from && x<from+pageSize ){
2385 *((uptr*)&pTo->aCell[i]) = x + to - from;
drhdd793422001-06-28 01:54:48 +00002386 }else{
drh4b70f112004-05-02 21:12:19 +00002387 pTo->aCell[i] = pFrom->aCell[i];
drh14acc042001-06-10 19:56:58 +00002388 }
2389 }
drhbd03cae2001-06-02 02:40:57 +00002390}
2391
2392/*
drhc3b70572003-01-04 19:44:07 +00002393** The following parameters determine how many adjacent pages get involved
2394** in a balancing operation. NN is the number of neighbors on either side
2395** of the page that participate in the balancing operation. NB is the
2396** total number of pages that participate, including the target page and
2397** NN neighbors on either side.
2398**
2399** The minimum value of NN is 1 (of course). Increasing NN above 1
2400** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2401** in exchange for a larger degradation in INSERT and UPDATE performance.
2402** The value of NN appears to give the best results overall.
2403*/
2404#define NN 1 /* Number of neighbors on either side of pPage */
2405#define NB (NN*2+1) /* Total pages involved in the balance */
2406
2407/*
drh8b2f49b2001-06-08 00:21:52 +00002408** This routine redistributes Cells on pPage and up to two siblings
2409** of pPage so that all pages have about the same amount of free space.
drh14acc042001-06-10 19:56:58 +00002410** Usually one sibling on either side of pPage is used in the balancing,
drh8b2f49b2001-06-08 00:21:52 +00002411** though both siblings might come from one side if pPage is the first
2412** or last child of its parent. If pPage has fewer than two siblings
2413** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00002414** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00002415**
2416** The number of siblings of pPage might be increased or decreased by
drh8c42ca92001-06-22 19:15:00 +00002417** one in an effort to keep pages between 66% and 100% full. The root page
2418** is special and is allowed to be less than 66% full. If pPage is
2419** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00002420** or decreased by one, as necessary, to keep the root page from being
2421** overfull or empty.
2422**
drh4b70f112004-05-02 21:12:19 +00002423** This routine alwyas calls relinkCellList() on its input page regardless of
drh14acc042001-06-10 19:56:58 +00002424** whether or not it does any real balancing. Client routines will typically
2425** invoke insertCell() or dropCell() before calling this routine, so we
2426** need to call relinkCellList() to clean up the mess that those other
2427** routines left behind.
2428**
drh8b2f49b2001-06-08 00:21:52 +00002429** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00002430** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00002431** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00002432** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00002433**
drh8c42ca92001-06-22 19:15:00 +00002434** In the course of balancing the siblings of pPage, the parent of pPage
2435** might become overfull or underfull. If that happens, then this routine
2436** is called recursively on the parent.
2437**
drh5e00f6c2001-09-13 13:46:56 +00002438** If this routine fails for any reason, it might leave the database
2439** in a corrupted state. So if this routine fails, the database should
2440** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00002441*/
drh4b70f112004-05-02 21:12:19 +00002442static int balance(MemPage *pPage){
drh8b2f49b2001-06-08 00:21:52 +00002443 MemPage *pParent; /* The parent of pPage */
drh4b70f112004-05-02 21:12:19 +00002444 Btree *pBt; /* The whole database */
drh8b2f49b2001-06-08 00:21:52 +00002445 int nCell; /* Number of cells in apCell[] */
2446 int nOld; /* Number of pages in apOld[] */
2447 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00002448 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00002449 int i, j, k; /* Loop counters */
2450 int idx; /* Index of pPage in pParent->apCell[] */
2451 int nxDiv; /* Next divider slot in pParent->apCell[] */
2452 int rc; /* The return code */
2453 int iCur; /* apCell[iCur] is the cell of the cursor */
drh5edc3122001-09-13 21:53:09 +00002454 MemPage *pOldCurPage; /* The cursor originally points to this page */
drh6019e162001-07-02 17:51:45 +00002455 int subtotal; /* Subtotal of bytes in cells on one page */
drhc3b70572003-01-04 19:44:07 +00002456 MemPage *apOld[NB]; /* pPage and up to two siblings */
2457 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00002458 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drhc3b70572003-01-04 19:44:07 +00002459 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */
2460 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */
2461 int idxDiv[NB]; /* Indices of divider cells in pParent */
drh4b70f112004-05-02 21:12:19 +00002462 u8 *apDiv[NB]; /* Divider cells in pParent */
2463 u8 aTemp[NB][MX_CELL_SIZE]; /* Temporary holding area for apDiv[] */
drhc3b70572003-01-04 19:44:07 +00002464 int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */
2465 int szNew[NB+1]; /* Combined size of cells place on i-th page */
drh4b70f112004-05-02 21:12:19 +00002466 u8 *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
drhc3b70572003-01-04 19:44:07 +00002467 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */
drh4b70f112004-05-02 21:12:19 +00002468 u8 aCopy[NB][MX_PAGE_SIZE+sizeof(MemPage)]; /* Space for apCopy[] */
drh8b2f49b2001-06-08 00:21:52 +00002469
drh14acc042001-06-10 19:56:58 +00002470 /*
2471 ** Return without doing any work if pPage is neither overfull nor
2472 ** underfull.
drh8b2f49b2001-06-08 00:21:52 +00002473 */
drh4b70f112004-05-02 21:12:19 +00002474 assert( sqlitepager_iswriteable(pPage->aData) );
2475 pBt = pPage->pBt;
2476 if( !pPage->isOverfull && pPage->nFree<pBt->pageSize/2 && pPage->nCell>=2){
2477 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002478 return SQLITE_OK;
2479 }
2480
2481 /*
drh4b70f112004-05-02 21:12:19 +00002482 ** Find the parent of the page to be balanced. If there is no parent,
2483 ** it means this page is the root page and special rules apply.
drh8b2f49b2001-06-08 00:21:52 +00002484 */
drh14acc042001-06-10 19:56:58 +00002485 pParent = pPage->pParent;
drh8b2f49b2001-06-08 00:21:52 +00002486 if( pParent==0 ){
2487 Pgno pgnoChild;
drh8c42ca92001-06-22 19:15:00 +00002488 MemPage *pChild;
drh7aa128d2002-06-21 13:09:16 +00002489 assert( pPage->isInit );
drh8b2f49b2001-06-08 00:21:52 +00002490 if( pPage->nCell==0 ){
drh8856d6a2004-04-29 14:42:46 +00002491 if( pPage->leaf ){
2492 /* The table is completely empty */
2493 relinkCellList(pPage);
2494 }else{
2495 /* The root page is empty but has one child. Transfer the
2496 ** information from that one child into the root page if it
2497 ** will fit. This reduces the depth of the BTree by one.
2498 **
2499 ** If the root page is page 1, it has less space available than
drh4b70f112004-05-02 21:12:19 +00002500 ** its child (due to the 100 byte header that occurs at the beginning
2501 ** of the database fle), so it might not be able to hold all of the
2502 ** information currently contained in the child. If this is the
2503 ** case, then do not do the transfer. Leave page 1 empty except
2504 ** for the right-pointer to the child page. The child page becomes
2505 ** the virtual root of the tree.
drh8b2f49b2001-06-08 00:21:52 +00002506 */
drh8856d6a2004-04-29 14:42:46 +00002507 pgnoChild = get4byte(pPage->aData[pPage->hdrOffset+6]);
2508 assert( pgnoChild>0 && pgnoChild<=sqlit3pager_pagecount(pBt->pPager) );
2509 rc = getPage(pBt, pgnoChild, &pChild);
drh8b2f49b2001-06-08 00:21:52 +00002510 if( rc ) return rc;
drh8856d6a2004-04-29 14:42:46 +00002511 if( pPage->pgno==1 ){
drh4b70f112004-05-02 21:12:19 +00002512 rc = initPage(pChild, pPage);
drh8856d6a2004-04-29 14:42:46 +00002513 if( rc ) return rc;
2514 if( pChild->nFree>=100 ){
drh4b70f112004-05-02 21:12:19 +00002515 /* The child information will fit on the root page, so do the
2516 ** copy */
2517 zeroPage(pPage, pChild->aData[0]);
2518 resizeCellArray(pPage, pChild->nCell);
2519 for(i=0; i<pChild->nCell; i++){
2520 insertCell(pPage, i, pChild->aCell[i],
2521 cellSize(pChild, pChild->aCell[i]));
2522 }
2523 freePage(pChild);
2524 }else{
2525 /* The child has more information that will fit on the root.
2526 ** The tree is already balanced. Do nothing. */
drh8856d6a2004-04-29 14:42:46 +00002527 }
2528 }else{
drh4b70f112004-05-02 21:12:19 +00002529 memcpy(pPage, pChild, pBt->pageSize);
drh8856d6a2004-04-29 14:42:46 +00002530 pPage->isInit = 0;
drh4b70f112004-05-02 21:12:19 +00002531 pPage->pParent = 0;
2532 rc = initPage(pPage, 0);
drh8856d6a2004-04-29 14:42:46 +00002533 assert( rc==SQLITE_OK );
drh4b70f112004-05-02 21:12:19 +00002534 freePage(pChild);
drh5edc3122001-09-13 21:53:09 +00002535 }
drh4b70f112004-05-02 21:12:19 +00002536 reparentChildPages(pPage);
2537 releasePage(pChild);
drhefc251d2001-07-01 22:12:01 +00002538 }else{
drh4b70f112004-05-02 21:12:19 +00002539 relinkCellList(pPage);
drh8b2f49b2001-06-08 00:21:52 +00002540 }
2541 return SQLITE_OK;
2542 }
drh14acc042001-06-10 19:56:58 +00002543 if( !pPage->isOverfull ){
drh8b2f49b2001-06-08 00:21:52 +00002544 /* It is OK for the root page to be less than half full.
2545 */
drh0d316a42002-08-11 20:10:47 +00002546 relinkCellList(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00002547 return SQLITE_OK;
2548 }
drh14acc042001-06-10 19:56:58 +00002549 /*
2550 ** If we get to here, it means the root page is overfull.
drh8b2f49b2001-06-08 00:21:52 +00002551 ** When this happens, Create a new child page and copy the
2552 ** contents of the root into the child. Then make the root
drh14acc042001-06-10 19:56:58 +00002553 ** page an empty page with rightChild pointing to the new
drh8b2f49b2001-06-08 00:21:52 +00002554 ** child. Then fall thru to the code below which will cause
2555 ** the overfull child page to be split.
2556 */
drh4b70f112004-05-02 21:12:19 +00002557 rc = allocatePage(pBt, &pChild, &pgnoChild, pPage->pgno);
drh14acc042001-06-10 19:56:58 +00002558 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002559 assert( sqlitepager_iswriteable(pChild->aData) );
drh14acc042001-06-10 19:56:58 +00002560 copyPage(pChild, pPage);
2561 pChild->pParent = pPage;
drhbb49aba2003-01-04 18:53:27 +00002562 pChild->idxParent = 0;
drh4b70f112004-05-02 21:12:19 +00002563 sqlitepager_ref(pPage->aData);
drh14acc042001-06-10 19:56:58 +00002564 pChild->isOverfull = 1;
drh4b70f112004-05-02 21:12:19 +00002565 zeroPage(pPage, pPage->aData[pPage->hdrOffset] & ~PTF_LEAF);
2566 put4byte(&pPage->aData[pPage->hdrOffset+6], pChild->pgno);
drh8b2f49b2001-06-08 00:21:52 +00002567 pParent = pPage;
2568 pPage = pChild;
drh8b2f49b2001-06-08 00:21:52 +00002569 }
drh4b70f112004-05-02 21:12:19 +00002570 rc = sqlitepager_write(pParent->aData);
drh6019e162001-07-02 17:51:45 +00002571 if( rc ) return rc;
drh7aa128d2002-06-21 13:09:16 +00002572 assert( pParent->isInit );
drh14acc042001-06-10 19:56:58 +00002573
drh8b2f49b2001-06-08 00:21:52 +00002574 /*
drh4b70f112004-05-02 21:12:19 +00002575 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00002576 ** to pPage. The "idx" variable is the index of that cell. If pPage
2577 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00002578 */
drhbb49aba2003-01-04 18:53:27 +00002579 if( pParent->idxShift ){
2580 Pgno pgno, swabPgno;
drh4b70f112004-05-02 21:12:19 +00002581 pgno = pPage->pgno;
2582 assert( pgno==sqlitepager_pagenumber(pPage->aData) );
drhbb49aba2003-01-04 18:53:27 +00002583 for(idx=0; idx<pParent->nCell; idx++){
drh4b70f112004-05-02 21:12:19 +00002584 if( get4byte(pParent->aCell[idx][2])==pgno ){
drhbb49aba2003-01-04 18:53:27 +00002585 break;
2586 }
drh8b2f49b2001-06-08 00:21:52 +00002587 }
drh4b70f112004-05-02 21:12:19 +00002588 assert( idx<pParent->nCell
2589 || get4byte(&pParent->aData[pParent->hdrOffset+6])==pgno );
drhbb49aba2003-01-04 18:53:27 +00002590 }else{
2591 idx = pPage->idxParent;
drh8b2f49b2001-06-08 00:21:52 +00002592 }
drh8b2f49b2001-06-08 00:21:52 +00002593
2594 /*
drh14acc042001-06-10 19:56:58 +00002595 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00002596 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00002597 */
drh14acc042001-06-10 19:56:58 +00002598 nOld = nNew = 0;
drh4b70f112004-05-02 21:12:19 +00002599 sqlitepager_ref(pParent->aData);
drh14acc042001-06-10 19:56:58 +00002600
2601 /*
drh4b70f112004-05-02 21:12:19 +00002602 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00002603 ** the siblings. An attempt is made to find NN siblings on either
2604 ** side of pPage. More siblings are taken from one side, however, if
2605 ** pPage there are fewer than NN siblings on the other side. If pParent
2606 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00002607 */
drhc3b70572003-01-04 19:44:07 +00002608 nxDiv = idx - NN;
2609 if( nxDiv + NB > pParent->nCell ){
2610 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00002611 }
drhc3b70572003-01-04 19:44:07 +00002612 if( nxDiv<0 ){
2613 nxDiv = 0;
2614 }
drh8b2f49b2001-06-08 00:21:52 +00002615 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00002616 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00002617 if( k<pParent->nCell ){
2618 idxDiv[i] = k;
drh4b70f112004-05-02 21:12:19 +00002619 apDiv[i] = pParent->aCell[k];
drh8b2f49b2001-06-08 00:21:52 +00002620 nDiv++;
drh4b70f112004-05-02 21:12:19 +00002621 assert( !pParent->left );
2622 pgnoOld[i] = get4byte(&apDev[i][2]);
drh14acc042001-06-10 19:56:58 +00002623 }else if( k==pParent->nCell ){
drh4b70f112004-05-02 21:12:19 +00002624 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+6]);
drh14acc042001-06-10 19:56:58 +00002625 }else{
2626 break;
drh8b2f49b2001-06-08 00:21:52 +00002627 }
drh4b70f112004-05-02 21:12:19 +00002628 rc = getPage(pBt, pgnoOld[i], &apOld[i]);
drh14acc042001-06-10 19:56:58 +00002629 if( rc ) goto balance_cleanup;
drh4b70f112004-05-02 21:12:19 +00002630 rc = initPage(apOld[i], pParent);
drh6019e162001-07-02 17:51:45 +00002631 if( rc ) goto balance_cleanup;
drh428ae8c2003-01-04 16:48:09 +00002632 apOld[i]->idxParent = k;
drh14acc042001-06-10 19:56:58 +00002633 nOld++;
drh8b2f49b2001-06-08 00:21:52 +00002634 }
2635
2636 /*
drh14acc042001-06-10 19:56:58 +00002637 ** Make copies of the content of pPage and its siblings into aOld[].
2638 ** The rest of this function will use data from the copies rather
2639 ** that the original pages since the original pages will be in the
2640 ** process of being overwritten.
2641 */
2642 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002643 apCopy[i] = (MemPage*)&aCopy[i+1][-sizeof(MemPage)];
2644 memset(apCopy[i], 0, sizeof(MemPage));
2645 apCopy[i]->aData = &((u8*)apCopy)[-pBt->pageSize];
2646 copyPage(apCopy[i], apOld[i]);
drh14acc042001-06-10 19:56:58 +00002647 }
2648
2649 /*
2650 ** Load pointers to all cells on sibling pages and the divider cells
2651 ** into the local apCell[] array. Make copies of the divider cells
2652 ** into aTemp[] and remove the the divider Cells from pParent.
drh4b70f112004-05-02 21:12:19 +00002653 **
2654 ** If the siblings are on leaf pages, then the child pointers of the
2655 ** divider cells are stripped from the cells before they are copied
2656 ** into aTemp[]. In this wall, all cells in apCell[] are without
2657 ** child pointers. If siblings are not leaves, then all cell in
2658 ** apCell[] include child pointers. Either way, all cells in apCell[]
2659 ** are alike.
drh8b2f49b2001-06-08 00:21:52 +00002660 */
2661 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00002662 leafCorrection = pPage->leaf*4;
drh8b2f49b2001-06-08 00:21:52 +00002663 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002664 MemPage *pOld = apCopy[i];
drh8b2f49b2001-06-08 00:21:52 +00002665 for(j=0; j<pOld->nCell; j++){
drh4b70f112004-05-02 21:12:19 +00002666 apCell[nCell] = pOld->aCell[j];
2667 szCell[nCell] = cellSize(pOld, apCell[nCell]);
drh14acc042001-06-10 19:56:58 +00002668 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002669 }
2670 if( i<nOld-1 ){
drh4b70f112004-05-02 21:12:19 +00002671 szCell[nCell] = cellSize(pParent, apDiv[i]) - leafCorrection;
2672 memcpy(aTemp[i], apDiv[i], szCell[nCell] + leafCorrection);
2673 apCell[nCell] = &aTemp[i][leafCorrection];
2674 dropCell(pParent, nxDiv, szCell[nCell]);
2675 assert( get4byte(&apCell[nCell][2])==pgnoOld[i] );
2676 if( !pOld->leaf ){
2677 assert( leafCorrection==0 );
2678 /* The right pointer of the child page pOld becomes the left
2679 ** pointer of the divider cell */
2680 memcpy(&apCell[nCell][2], &pOld->aData[pOld->hdrOffset+6], 4);
2681 }else{
2682 assert( leafCorrection==4 );
2683 }
drh14acc042001-06-10 19:56:58 +00002684 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00002685 }
2686 }
2687
2688 /*
drh6019e162001-07-02 17:51:45 +00002689 ** Figure out the number of pages needed to hold all nCell cells.
2690 ** Store this number in "k". Also compute szNew[] which is the total
2691 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00002692 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00002693 ** cntNew[k] should equal nCell.
2694 **
2695 ** This little patch of code is critical for keeping the tree
2696 ** balanced.
drh8b2f49b2001-06-08 00:21:52 +00002697 */
drh4b70f112004-05-02 21:12:19 +00002698 usableSpace = pBt->pageSize - 10 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00002699 for(subtotal=k=i=0; i<nCell; i++){
2700 subtotal += szCell[i];
drh4b70f112004-05-02 21:12:19 +00002701 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00002702 szNew[k] = subtotal - szCell[i];
2703 cntNew[k] = i;
2704 subtotal = 0;
2705 k++;
2706 }
2707 }
2708 szNew[k] = subtotal;
2709 cntNew[k] = nCell;
2710 k++;
2711 for(i=k-1; i>0; i--){
drh4b70f112004-05-02 21:12:19 +00002712 while( szNew[i]<usableSpace/2 ){
drh6019e162001-07-02 17:51:45 +00002713 cntNew[i-1]--;
2714 assert( cntNew[i-1]>0 );
2715 szNew[i] += szCell[cntNew[i-1]];
2716 szNew[i-1] -= szCell[cntNew[i-1]-1];
2717 }
2718 }
2719 assert( cntNew[0]>0 );
drh8b2f49b2001-06-08 00:21:52 +00002720
2721 /*
drh6b308672002-07-08 02:16:37 +00002722 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00002723 */
drh4b70f112004-05-02 21:12:19 +00002724 assert( pPage->pgno>1 );
2725 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00002726 for(i=0; i<k; i++){
drh6b308672002-07-08 02:16:37 +00002727 if( i<nOld ){
2728 apNew[i] = apOld[i];
2729 pgnoNew[i] = pgnoOld[i];
2730 apOld[i] = 0;
2731 sqlitepager_write(apNew[i]);
2732 }else{
drhbea00b92002-07-08 10:59:50 +00002733 rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]);
drh6b308672002-07-08 02:16:37 +00002734 if( rc ) goto balance_cleanup;
2735 }
drh14acc042001-06-10 19:56:58 +00002736 nNew++;
drh4b70f112004-05-02 21:12:19 +00002737 zeroPage(apNew[i], pageFlags);
drh6019e162001-07-02 17:51:45 +00002738 apNew[i]->isInit = 1;
drh8b2f49b2001-06-08 00:21:52 +00002739 }
2740
drh6b308672002-07-08 02:16:37 +00002741 /* Free any old pages that were not reused as new pages.
2742 */
2743 while( i<nOld ){
drh4b70f112004-05-02 21:12:19 +00002744 rc = freePage(apOld[i]);
drh6b308672002-07-08 02:16:37 +00002745 if( rc ) goto balance_cleanup;
drh4b70f112004-05-02 21:12:19 +00002746 sqlitepager_unref(apOld[i]->aData);
drh6b308672002-07-08 02:16:37 +00002747 apOld[i] = 0;
2748 i++;
2749 }
2750
drh8b2f49b2001-06-08 00:21:52 +00002751 /*
drhf9ffac92002-03-02 19:00:31 +00002752 ** Put the new pages in accending order. This helps to
2753 ** keep entries in the disk file in order so that a scan
2754 ** of the table is a linear scan through the file. That
2755 ** in turn helps the operating system to deliver pages
2756 ** from the disk more rapidly.
2757 **
2758 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00002759 ** n is never more than NB (a small constant), that should
2760 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00002761 **
drhc3b70572003-01-04 19:44:07 +00002762 ** When NB==3, this one optimization makes the database
2763 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00002764 */
2765 for(i=0; i<k-1; i++){
2766 int minV = pgnoNew[i];
2767 int minI = i;
2768 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00002769 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00002770 minI = j;
2771 minV = pgnoNew[j];
2772 }
2773 }
2774 if( minI>i ){
2775 int t;
2776 MemPage *pT;
2777 t = pgnoNew[i];
2778 pT = apNew[i];
2779 pgnoNew[i] = pgnoNew[minI];
2780 apNew[i] = apNew[minI];
2781 pgnoNew[minI] = t;
2782 apNew[minI] = pT;
2783 }
2784 }
2785
2786 /*
drh14acc042001-06-10 19:56:58 +00002787 ** Evenly distribute the data in apCell[] across the new pages.
2788 ** Insert divider cells into pParent as necessary.
2789 */
2790 j = 0;
2791 for(i=0; i<nNew; i++){
2792 MemPage *pNew = apNew[i];
drh4b70f112004-05-02 21:12:19 +00002793 assert( pNew->pgno==pgnoNew[i] );
2794 resizeCellArray(pNew, cntNew[i] - j);
drh6019e162001-07-02 17:51:45 +00002795 while( j<cntNew[i] ){
2796 assert( pNew->nFree>=szCell[j] );
drh0d316a42002-08-11 20:10:47 +00002797 insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]);
drh14acc042001-06-10 19:56:58 +00002798 j++;
2799 }
drh6019e162001-07-02 17:51:45 +00002800 assert( pNew->nCell>0 );
drh14acc042001-06-10 19:56:58 +00002801 assert( !pNew->isOverfull );
drh4b70f112004-05-02 21:12:19 +00002802 relinkCellList(pNew);
drh14acc042001-06-10 19:56:58 +00002803 if( i<nNew-1 && j<nCell ){
drh4b70f112004-05-02 21:12:19 +00002804 u8 *pCell = apCell[j];
2805 if( !pNew->leaf ){
2806 memcpy(&pNew->aData[6], &apCell[j][2], 4);
2807 }else{
2808 pCell -= 4;
2809 }
2810 insertCell(pParent, nxDiv, pCell, szCell[j]+leafCorrection);
2811 put4byte(&pParent->aCell[nxDiv][2], pNew->pgno);
drh14acc042001-06-10 19:56:58 +00002812 j++;
2813 nxDiv++;
2814 }
2815 }
drh6019e162001-07-02 17:51:45 +00002816 assert( j==nCell );
drh4b70f112004-05-02 21:12:19 +00002817 if( (pageFlags & PTF_LEAF)==0 ){
2818 memcpy(&apNew[nNew-1]->aData[6], &apCopy[nOld-1]->aData[6], 4);
drh14acc042001-06-10 19:56:58 +00002819 }
drh4b70f112004-05-02 21:12:19 +00002820 if( nxDiv==pParent->nCell ){
2821 /* Right-most sibling is the right-most child of pParent */
2822 put4byte(&pParent->aData[pParent->hdrOffset+6], pgnoNew[nNew-1]);
2823 }else{
2824 /* Right-most sibling is the left child of the first entry in pParent
2825 ** past the right-most divider entry */
2826 put4byte(&pParent->apCell[nxDiv][2], pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00002827 }
2828
2829 /*
2830 ** Reparent children of all cells.
drh8b2f49b2001-06-08 00:21:52 +00002831 */
2832 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00002833 reparentChildPages(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00002834 }
drh4b70f112004-05-02 21:12:19 +00002835 reparentChildPages(pParent);
drh8b2f49b2001-06-08 00:21:52 +00002836
2837 /*
drh14acc042001-06-10 19:56:58 +00002838 ** balance the parent page.
drh8b2f49b2001-06-08 00:21:52 +00002839 */
drh4b70f112004-05-02 21:12:19 +00002840 rc = balance(pParent);
drh8b2f49b2001-06-08 00:21:52 +00002841
2842 /*
drh14acc042001-06-10 19:56:58 +00002843 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00002844 */
drh14acc042001-06-10 19:56:58 +00002845balance_cleanup:
drh8b2f49b2001-06-08 00:21:52 +00002846 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00002847 if( apOld[i]!=0 ) sqlitepager_unref(apOld[i]->aData);
drh8b2f49b2001-06-08 00:21:52 +00002848 }
drh14acc042001-06-10 19:56:58 +00002849 for(i=0; i<nNew; i++){
drh4b70f112004-05-02 21:12:19 +00002850 sqlitepager_unref(apNew[i]->aData);
drh8b2f49b2001-06-08 00:21:52 +00002851 }
drh4b70f112004-05-02 21:12:19 +00002852 sqlitepager_unref(pParent->aData);
drh8b2f49b2001-06-08 00:21:52 +00002853 return rc;
2854}
2855
2856/*
drhf74b8d92002-09-01 23:20:45 +00002857** This routine checks all cursors that point to the same table
2858** as pCur points to. If any of those cursors were opened with
2859** wrFlag==0 then this routine returns SQLITE_LOCKED. If all
2860** cursors point to the same table were opened with wrFlag==1
2861** then this routine returns SQLITE_OK.
2862**
2863** In addition to checking for read-locks (where a read-lock
2864** means a cursor opened with wrFlag==0) this routine also moves
2865** all cursors other than pCur so that they are pointing to the
2866** first Cell on root page. This is necessary because an insert
2867** or delete might change the number of cells on a page or delete
2868** a page entirely and we do not want to leave any cursors
2869** pointing to non-existant pages or cells.
2870*/
2871static int checkReadLocks(BtCursor *pCur){
2872 BtCursor *p;
2873 assert( pCur->wrFlag );
2874 for(p=pCur->pShared; p!=pCur; p=p->pShared){
2875 assert( p );
2876 assert( p->pgnoRoot==pCur->pgnoRoot );
2877 if( p->wrFlag==0 ) return SQLITE_LOCKED;
2878 if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){
2879 moveToRoot(p);
2880 }
2881 }
2882 return SQLITE_OK;
2883}
2884
2885/*
drh3b7511c2001-05-26 13:15:44 +00002886** Insert a new record into the BTree. The key is given by (pKey,nKey)
2887** and the data is given by (pData,nData). The cursor is used only to
2888** define what database the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00002889** is left pointing at a random location.
2890**
2891** For an INTKEY table, only the nKey value of the key is used. pKey is
2892** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00002893*/
drh3aac2dd2004-04-26 14:10:20 +00002894int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00002895 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4b70f112004-05-02 21:12:19 +00002896 const void *pKey, u64 nKey, /* The key of the new record */
drh5c4d9702001-08-20 00:33:58 +00002897 const void *pData, int nData /* The data of the new record */
drh3b7511c2001-05-26 13:15:44 +00002898){
drh3b7511c2001-05-26 13:15:44 +00002899 int rc;
2900 int loc;
drh14acc042001-06-10 19:56:58 +00002901 int szNew;
drh3b7511c2001-05-26 13:15:44 +00002902 MemPage *pPage;
2903 Btree *pBt = pCur->pBt;
drh4b70f112004-05-02 21:12:19 +00002904 unsigned char newCell[MX_CELL_SIZE], *oldCell;
drh3b7511c2001-05-26 13:15:44 +00002905
drhecdc7532001-09-23 02:35:53 +00002906 if( pCur->pPage==0 ){
2907 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2908 }
drhf74b8d92002-09-01 23:20:45 +00002909 if( !pBt->inTrans || nKey+nData==0 ){
2910 /* Must start a transaction before doing an insert */
2911 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002912 }
drhf74b8d92002-09-01 23:20:45 +00002913 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00002914 if( !pCur->wrFlag ){
2915 return SQLITE_PERM; /* Cursor not open for writing */
2916 }
drhf74b8d92002-09-01 23:20:45 +00002917 if( checkReadLocks(pCur) ){
2918 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2919 }
drh3aac2dd2004-04-26 14:10:20 +00002920 rc = sqlite3BtreeMoveto(pCur, pKey, nKey, &loc);
drh3b7511c2001-05-26 13:15:44 +00002921 if( rc ) return rc;
drh14acc042001-06-10 19:56:58 +00002922 pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00002923 assert( nData==0 || pPage->zeroData!=0 );
drh7aa128d2002-06-21 13:09:16 +00002924 assert( pPage->isInit );
drh14acc042001-06-10 19:56:58 +00002925 rc = sqlitepager_write(pPage);
drhbd03cae2001-06-02 02:40:57 +00002926 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002927 rc = fillInCell(pPage, &newCell, pKey, nKey, pData, nData, &szNew);
drh3b7511c2001-05-26 13:15:44 +00002928 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002929 assert( szNew==cellSize(pPage, newCell) );
drh3b7511c2001-05-26 13:15:44 +00002930 if( loc==0 ){
drh4b70f112004-05-02 21:12:19 +00002931 int szOld
2932 assert( pCur->idx>=0 && pCur->idx<pPage->nPage );
2933 oldCell = pPage->aCell[pCur->idx];
2934 if( !pPage->leaf ){
2935 memcpy(&newCell[2], &oldCell[2], 4);
2936 }
2937 szOld = cellSize(pPage, oldCell);
2938 rc = clearCell(pPage, oldCell);
drh5e2f8b92001-05-28 00:41:15 +00002939 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002940 dropCell(pPage, pCur->idx, szOld);
drh7c717f72001-06-24 20:39:41 +00002941 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00002942 assert( pPage->leaf );
drh14acc042001-06-10 19:56:58 +00002943 pCur->idx++;
2944 }else{
drh4b70f112004-05-02 21:12:19 +00002945 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00002946 }
drh4b70f112004-05-02 21:12:19 +00002947 insertCell(pPage, pCur->idx, &newCell, szNew);
2948 rc = balance(pPage);
drh3fc190c2001-09-14 03:24:23 +00002949 /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
2950 /* fflush(stdout); */
drh4b70f112004-05-02 21:12:19 +00002951 moveToRoot(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00002952 pCur->eSkip = SKIP_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00002953 return rc;
2954}
2955
2956/*
drh4b70f112004-05-02 21:12:19 +00002957** Delete the entry that the cursor is pointing to. The cursor
2958** is left pointing at a random location.
drh3b7511c2001-05-26 13:15:44 +00002959*/
drh3aac2dd2004-04-26 14:10:20 +00002960int sqlite3BtreeDelete(BtCursor *pCur){
drh5e2f8b92001-05-28 00:41:15 +00002961 MemPage *pPage = pCur->pPage;
drh4b70f112004-05-02 21:12:19 +00002962 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00002963 int rc;
drh8c42ca92001-06-22 19:15:00 +00002964 Pgno pgnoChild;
drh0d316a42002-08-11 20:10:47 +00002965 Btree *pBt = pCur->pBt;
drh8b2f49b2001-06-08 00:21:52 +00002966
drh7aa128d2002-06-21 13:09:16 +00002967 assert( pPage->isInit );
drhecdc7532001-09-23 02:35:53 +00002968 if( pCur->pPage==0 ){
2969 return SQLITE_ABORT; /* A rollback destroyed this cursor */
2970 }
drhf74b8d92002-09-01 23:20:45 +00002971 if( !pBt->inTrans ){
2972 /* Must start a transaction before doing a delete */
2973 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00002974 }
drhf74b8d92002-09-01 23:20:45 +00002975 assert( !pBt->readOnly );
drhbd03cae2001-06-02 02:40:57 +00002976 if( pCur->idx >= pPage->nCell ){
2977 return SQLITE_ERROR; /* The cursor is not pointing to anything */
2978 }
drhecdc7532001-09-23 02:35:53 +00002979 if( !pCur->wrFlag ){
2980 return SQLITE_PERM; /* Did not open this cursor for writing */
2981 }
drhf74b8d92002-09-01 23:20:45 +00002982 if( checkReadLocks(pCur) ){
2983 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2984 }
drhbd03cae2001-06-02 02:40:57 +00002985 rc = sqlitepager_write(pPage);
2986 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00002987 pCell = pPage->aCell[pCur->idx];
2988 if( !pPage->leaf ){
2989 pgnoChild = get4byte(&pCell[2]);
2990 }
2991 clearCell(pPage, pCell);
2992 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00002993 /*
drh5e00f6c2001-09-13 13:46:56 +00002994 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00002995 ** do something we will leave a hole on an internal page.
2996 ** We have to fill the hole by moving in a cell from a leaf. The
2997 ** next Cell after the one to be deleted is guaranteed to exist and
2998 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00002999 */
drh14acc042001-06-10 19:56:58 +00003000 BtCursor leafCur;
drh4b70f112004-05-02 21:12:19 +00003001 unsigned char *pNext;
drh14acc042001-06-10 19:56:58 +00003002 int szNext;
drh8c1238a2003-01-02 14:43:55 +00003003 int notUsed;
drh14acc042001-06-10 19:56:58 +00003004 getTempCursor(pCur, &leafCur);
drh3aac2dd2004-04-26 14:10:20 +00003005 rc = sqlite3BtreeNext(&leafCur, &notUsed);
drh14acc042001-06-10 19:56:58 +00003006 if( rc!=SQLITE_OK ){
drh8a6ac0a2004-02-14 17:35:07 +00003007 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
3008 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003009 }
drh6019e162001-07-02 17:51:45 +00003010 rc = sqlitepager_write(leafCur.pPage);
3011 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003012 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3013 pNext = leafCur.pPage->aCell[leafCur.idx];
3014 szNext = cellSize(leafCur.pPage, pNext);
3015 insertCell(pPage, pCur->idx, &pNext[-4], szNext+4);
3016 put4byte(&pNext[-2], pgnoChild);
3017 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003018 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003019 dropCell(leafCur.pPage, leafCur.idx, szNext);
3020 rc = balance(leafCur.pPage);
drh8c42ca92001-06-22 19:15:00 +00003021 releaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00003022 }else{
drh4b70f112004-05-02 21:12:19 +00003023 dropCell(pPage, pCur->idx, cellSize(pPage, pCell));
3024 rc = balance(pPage);
drh5e2f8b92001-05-28 00:41:15 +00003025 }
drh4b70f112004-05-02 21:12:19 +00003026 moveToRoot(pCur);
drh5e2f8b92001-05-28 00:41:15 +00003027 return rc;
drh3b7511c2001-05-26 13:15:44 +00003028}
drh8b2f49b2001-06-08 00:21:52 +00003029
3030/*
drhc6b52df2002-01-04 03:09:29 +00003031** Create a new BTree table. Write into *piTable the page
3032** number for the root page of the new table.
3033**
3034** In the current implementation, BTree tables and BTree indices are the
drh144f9ea2003-04-16 01:28:16 +00003035** the same. In the future, we may change this so that BTree tables
drhc6b52df2002-01-04 03:09:29 +00003036** are restricted to having a 4-byte integer key and arbitrary data and
3037** BTree indices are restricted to having an arbitrary key and no data.
drh144f9ea2003-04-16 01:28:16 +00003038** But for now, this routine also serves to create indices.
drh8b2f49b2001-06-08 00:21:52 +00003039*/
drh3aac2dd2004-04-26 14:10:20 +00003040int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
drh8b2f49b2001-06-08 00:21:52 +00003041 MemPage *pRoot;
3042 Pgno pgnoRoot;
3043 int rc;
3044 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003045 /* Must start a transaction first */
3046 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003047 }
drh5df72a52002-06-06 23:16:05 +00003048 if( pBt->readOnly ){
3049 return SQLITE_READONLY;
3050 }
drhbea00b92002-07-08 10:59:50 +00003051 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
drh8b2f49b2001-06-08 00:21:52 +00003052 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003053 assert( sqlitepager_iswriteable(pRoot->aData) );
drh0d316a42002-08-11 20:10:47 +00003054 zeroPage(pBt, pRoot);
drh8b2f49b2001-06-08 00:21:52 +00003055 sqlitepager_unref(pRoot);
3056 *piTable = (int)pgnoRoot;
3057 return SQLITE_OK;
3058}
3059
3060/*
3061** Erase the given database page and all its children. Return
3062** the page to the freelist.
3063*/
drh4b70f112004-05-02 21:12:19 +00003064static int clearDatabasePage(
3065 Btree *pBt, /* The BTree that contains the table */
3066 Pgno pgno, /* Page number to clear */
3067 MemPage *pParent, /* Parent page. NULL for the root */
3068 int freePageFlag /* Deallocate page if true */
3069){
drh8b2f49b2001-06-08 00:21:52 +00003070 MemPage *pPage;
3071 int rc;
drh4b70f112004-05-02 21:12:19 +00003072 unsigned char *pCell;
3073 int i;
drh8b2f49b2001-06-08 00:21:52 +00003074
drh4b70f112004-05-02 21:12:19 +00003075 rc = getPage(pBt, pgno, &pPage);
drh8b2f49b2001-06-08 00:21:52 +00003076 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003077 rc = sqlitepager_write(pPage->aData);
drh6019e162001-07-02 17:51:45 +00003078 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003079 rc = initPage(pPage, pParent);
drh7aa128d2002-06-21 13:09:16 +00003080 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003081 for(i=0; i<pPage->nCell; i++){
3082 pCell = pPage->aCell[i];
3083 if( !pPage->leaf ){
3084 rc = clearDatabasePage(pBt, get4byte(&pCell[2]), 1);
drh8b2f49b2001-06-08 00:21:52 +00003085 if( rc ) return rc;
3086 }
drh4b70f112004-05-02 21:12:19 +00003087 rc = clearCell(pPage, pCell);
drh8b2f49b2001-06-08 00:21:52 +00003088 if( rc ) return rc;
3089 }
drh4b70f112004-05-02 21:12:19 +00003090 if( !pPage->left ){
3091 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[6]), 1);
drh2aa679f2001-06-25 02:11:07 +00003092 if( rc ) return rc;
3093 }
3094 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00003095 rc = freePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003096 }else{
drh4b70f112004-05-02 21:12:19 +00003097 zeroPage(pPage, pPage->aData[0]);
drh2aa679f2001-06-25 02:11:07 +00003098 }
drh4b70f112004-05-02 21:12:19 +00003099 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00003100 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003101}
3102
3103/*
3104** Delete all information from a single table in the database.
3105*/
drh3aac2dd2004-04-26 14:10:20 +00003106int sqlite3BtreeClearTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003107 int rc;
drhf74b8d92002-09-01 23:20:45 +00003108 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003109 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003110 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003111 }
drhf74b8d92002-09-01 23:20:45 +00003112 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3113 if( pCur->pgnoRoot==(Pgno)iTable ){
3114 if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
3115 moveToRoot(pCur);
3116 }
drhecdc7532001-09-23 02:35:53 +00003117 }
drh2aa679f2001-06-25 02:11:07 +00003118 rc = clearDatabasePage(pBt, (Pgno)iTable, 0);
drh8b2f49b2001-06-08 00:21:52 +00003119 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003120 sqlite3BtreeRollback(pBt);
drh8b2f49b2001-06-08 00:21:52 +00003121 }
drh8c42ca92001-06-22 19:15:00 +00003122 return rc;
drh8b2f49b2001-06-08 00:21:52 +00003123}
3124
3125/*
3126** Erase all information in a table and add the root of the table to
3127** the freelist. Except, the root of the principle table (the one on
3128** page 2) is never added to the freelist.
3129*/
drh3aac2dd2004-04-26 14:10:20 +00003130int sqlite3BtreeDropTable(Btree *pBt, int iTable){
drh8b2f49b2001-06-08 00:21:52 +00003131 int rc;
3132 MemPage *pPage;
drhf74b8d92002-09-01 23:20:45 +00003133 BtCursor *pCur;
drh8b2f49b2001-06-08 00:21:52 +00003134 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003135 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00003136 }
drhf74b8d92002-09-01 23:20:45 +00003137 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3138 if( pCur->pgnoRoot==(Pgno)iTable ){
3139 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */
3140 }
drh5df72a52002-06-06 23:16:05 +00003141 }
drh4b70f112004-05-02 21:12:19 +00003142 rc = getPage(pBt, (Pgno)iTable, pPage);
drh2aa679f2001-06-25 02:11:07 +00003143 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00003144 rc = sqlite3BtreeClearTable(pBt, iTable);
drh2aa679f2001-06-25 02:11:07 +00003145 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003146 if( iTable>1 ){
drh2aa679f2001-06-25 02:11:07 +00003147 rc = freePage(pBt, pPage, iTable);
3148 }else{
drh0d316a42002-08-11 20:10:47 +00003149 zeroPage(pBt, pPage);
drh8b2f49b2001-06-08 00:21:52 +00003150 }
drh4b70f112004-05-02 21:12:19 +00003151 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00003152 return rc;
3153}
3154
drh001bbcb2003-03-19 03:14:00 +00003155
drh8b2f49b2001-06-08 00:21:52 +00003156/*
3157** Read the meta-information out of a database file.
3158*/
drh3aac2dd2004-04-26 14:10:20 +00003159int sqlite3BtreeGetMeta(Btree *pBt, int idx, u32 *pMeta){
drh8b2f49b2001-06-08 00:21:52 +00003160 int rc;
drh0d316a42002-08-11 20:10:47 +00003161 int i;
drh4b70f112004-05-02 21:12:19 +00003162 unsigned char *pP1;
drh8b2f49b2001-06-08 00:21:52 +00003163
drh4b70f112004-05-02 21:12:19 +00003164 assert( idx>=0 && idx<15 );
drh8c42ca92001-06-22 19:15:00 +00003165 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
drh8b2f49b2001-06-08 00:21:52 +00003166 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00003167 *pMeta = get4byte(&pP1[40 + idx*4]);
drh8b2f49b2001-06-08 00:21:52 +00003168 sqlitepager_unref(pP1);
3169 return SQLITE_OK;
3170}
3171
3172/*
3173** Write meta-information back into the database.
3174*/
drh3aac2dd2004-04-26 14:10:20 +00003175int sqlite3BtreeUpdateMeta(Btree *pBt, int idx, u32 iMeta){
drh4b70f112004-05-02 21:12:19 +00003176 unsigned char *pP1;
drh0d316a42002-08-11 20:10:47 +00003177 int rc, i;
drh4b70f112004-05-02 21:12:19 +00003178 assert( idx>=0 && idx<15 );
drh8b2f49b2001-06-08 00:21:52 +00003179 if( !pBt->inTrans ){
drhf74b8d92002-09-01 23:20:45 +00003180 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh5df72a52002-06-06 23:16:05 +00003181 }
drh4b70f112004-05-02 21:12:19 +00003182 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
3183 if( rc ) return rc;
drh8b2f49b2001-06-08 00:21:52 +00003184 rc = sqlitepager_write(pP1);
drh4b70f112004-05-02 21:12:19 +00003185 if( rc ) return rc;
3186 put4byte(&pP1[40 + idx*4], iMeta);
drh8b2f49b2001-06-08 00:21:52 +00003187 return SQLITE_OK;
3188}
drh8c42ca92001-06-22 19:15:00 +00003189
drh5eddca62001-06-30 21:53:53 +00003190/******************************************************************************
3191** The complete implementation of the BTree subsystem is above this line.
3192** All the code the follows is for testing and troubleshooting the BTree
3193** subsystem. None of the code that follows is used during normal operation.
drh5eddca62001-06-30 21:53:53 +00003194******************************************************************************/
drh5eddca62001-06-30 21:53:53 +00003195
drh8c42ca92001-06-22 19:15:00 +00003196/*
3197** Print a disassembly of the given page on standard output. This routine
3198** is used for debugging and testing only.
3199*/
drhaaab5722002-02-19 13:39:21 +00003200#ifdef SQLITE_TEST
drh144f9ea2003-04-16 01:28:16 +00003201static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){
drh8c42ca92001-06-22 19:15:00 +00003202 int rc;
3203 MemPage *pPage;
3204 int i, j;
3205 int nFree;
3206 u16 idx;
drh4b70f112004-05-02 21:12:19 +00003207 int hdrOffset;
drh8c42ca92001-06-22 19:15:00 +00003208 char range[20];
3209 unsigned char payload[20];
drh4b70f112004-05-02 21:12:19 +00003210 rc = getPage(pBt, (Pgno)pgno, &pPage);
drh8c42ca92001-06-22 19:15:00 +00003211 if( rc ){
3212 return rc;
3213 }
drh4b70f112004-05-02 21:12:19 +00003214 printf("PAGE %d: flags=0x%02x frag=%d\n", pgno, pPage->aData[0],
3215 pPage->aData[5]);
drh8c42ca92001-06-22 19:15:00 +00003216 i = 0;
drh4b70f112004-05-02 21:12:19 +00003217 hdrOffset = pgno==1 ? 100 : 0;
3218 idx = get2byte(&pPage->aData[hdrOffset+3]);
3219 while( idx>0 && idx<=pBt->pageSize ){
3220 u64 nData, nKey;
3221 int nHeader;
3222 Pgno child;
3223 unsigned char *pCell = &pPage->aData[idx];
3224 int sz = cellSize(pPage, pCell);
drh8c42ca92001-06-22 19:15:00 +00003225 sprintf(range,"%d..%d", idx, idx+sz-1);
drh4b70f112004-05-02 21:12:19 +00003226 parseCellHeader(pPage, pCell, &nData, &nKey, &nHeader);
3227 if( pPage->leaf ){
3228 child = 0;
3229 }else{
3230 child = get4byte(&pCell[2]);
3231 }
drh0d316a42002-08-11 20:10:47 +00003232 sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
drh8c42ca92001-06-22 19:15:00 +00003233 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3234 memcpy(payload, pCell->aPayload, sz);
3235 for(j=0; j<sz; j++){
3236 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3237 }
3238 payload[sz] = 0;
3239 printf(
drh6019e162001-07-02 17:51:45 +00003240 "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n",
drh4b70f112004-05-02 21:12:19 +00003241 i, range, child, (int)nKey, (int)nData, payload
drh8c42ca92001-06-22 19:15:00 +00003242 );
drh4b70f112004-05-02 21:12:19 +00003243 if( pPage->isInit && pPage->aCell[i]!=pCell ){
3244 printf("**** aCell[%d] does not match on prior entry ****\n", i);
drh2aa679f2001-06-25 02:11:07 +00003245 }
drh7c717f72001-06-24 20:39:41 +00003246 i++;
drh4b70f112004-05-02 21:12:19 +00003247 idx = get2byte(pCell);
drh8c42ca92001-06-22 19:15:00 +00003248 }
3249 if( idx!=0 ){
3250 printf("ERROR: next cell index out of range: %d\n", idx);
3251 }
drh4b70f112004-05-02 21:12:19 +00003252 if( !pPage->leaf ){
3253 printf("right_child: %d\n", get4byte(&pPage->aData[6]));
3254 }
drh8c42ca92001-06-22 19:15:00 +00003255 nFree = 0;
3256 i = 0;
drh4b70f112004-05-02 21:12:19 +00003257 idx = get2byte(&pPage->aData[hdrOffset+1]);
drhd0ba1932004-02-10 01:54:28 +00003258 while( idx>0 && idx<SQLITE_USABLE_SIZE ){
drh4b70f112004-05-02 21:12:19 +00003259 int sz = get2byte(&pPage->aData[idx+2]);
3260 sprintf(range,"%d..%d", idx, idx+sz-1);
3261 nFree += sz;
drh8c42ca92001-06-22 19:15:00 +00003262 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
drh4b70f112004-05-02 21:12:19 +00003263 i, range, sz, nFree);
3264 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003265 i++;
drh8c42ca92001-06-22 19:15:00 +00003266 }
3267 if( idx!=0 ){
3268 printf("ERROR: next freeblock index out of range: %d\n", idx);
3269 }
drh4b70f112004-05-02 21:12:19 +00003270 if( recursive && !pPage->left ){
3271 idx = get2byte(&pPage->aData[hdrOffset+3]);
drhd0ba1932004-02-10 01:54:28 +00003272 while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
drh4b70f112004-05-02 21:12:19 +00003273 unsigned char *pCell = &pPage->aData[idx];
3274 fileBtreePageDump(pBt, get4byte(&pPage->aData[idx+2]), 1);
3275 idx = get2byte(&pPage->aData[idx]);
drh6019e162001-07-02 17:51:45 +00003276 }
drh4b70f112004-05-02 21:12:19 +00003277 fileBtreePageDump(pBt, get4byte(&pPage->aData[hdrOffset+6]), 1);
drh6019e162001-07-02 17:51:45 +00003278 }
drh8c42ca92001-06-22 19:15:00 +00003279 sqlitepager_unref(pPage);
3280 return SQLITE_OK;
3281}
drhaaab5722002-02-19 13:39:21 +00003282#endif
drh8c42ca92001-06-22 19:15:00 +00003283
drhaaab5722002-02-19 13:39:21 +00003284#ifdef SQLITE_TEST
drh8c42ca92001-06-22 19:15:00 +00003285/*
drh2aa679f2001-06-25 02:11:07 +00003286** Fill aResult[] with information about the entry and page that the
3287** cursor is pointing to.
3288**
3289** aResult[0] = The page number
3290** aResult[1] = The entry number
3291** aResult[2] = Total number of entries on this page
3292** aResult[3] = Size of this entry
3293** aResult[4] = Number of free bytes on this page
3294** aResult[5] = Number of free blocks on the page
3295** aResult[6] = Page number of the left child of this entry
3296** aResult[7] = Page number of the right child for the whole page
drh5eddca62001-06-30 21:53:53 +00003297**
3298** This routine is used for testing and debugging only.
drh8c42ca92001-06-22 19:15:00 +00003299*/
drh144f9ea2003-04-16 01:28:16 +00003300static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){
drh2aa679f2001-06-25 02:11:07 +00003301 int cnt, idx;
3302 MemPage *pPage = pCur->pPage;
drh0d316a42002-08-11 20:10:47 +00003303 Btree *pBt = pCur->pBt;
drh4b70f112004-05-02 21:12:19 +00003304 assert( pPage->isInit );
drh2aa679f2001-06-25 02:11:07 +00003305 aResult[0] = sqlitepager_pagenumber(pPage);
drh8c42ca92001-06-22 19:15:00 +00003306 aResult[1] = pCur->idx;
drh2aa679f2001-06-25 02:11:07 +00003307 aResult[2] = pPage->nCell;
3308 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
drh4b70f112004-05-02 21:12:19 +00003309 aResult[3] = cellSize(pPage, pPage->aCell[pCur->idx]);
3310 aResult[6] = pPage->leaf ? 0 : get4byte(&pPage->aCell[pCur->idx][2]);
drh2aa679f2001-06-25 02:11:07 +00003311 }else{
3312 aResult[3] = 0;
3313 aResult[6] = 0;
3314 }
3315 aResult[4] = pPage->nFree;
3316 cnt = 0;
drh4b70f112004-05-02 21:12:19 +00003317 idx = get2byte(&pPage->aData[pPage->hdrOffset+1]);
drhd0ba1932004-02-10 01:54:28 +00003318 while( idx>0 && idx<SQLITE_USABLE_SIZE ){
drh2aa679f2001-06-25 02:11:07 +00003319 cnt++;
drh4b70f112004-05-02 21:12:19 +00003320 idx = get2byte(&pPage->aData[idx]);
drh2aa679f2001-06-25 02:11:07 +00003321 }
3322 aResult[5] = cnt;
drh4b70f112004-05-02 21:12:19 +00003323 aResult[7] = pPage->leaf ? 0 : get4byte(&pPage->aData[pPage->hdrOffset+6]);
drh8c42ca92001-06-22 19:15:00 +00003324 return SQLITE_OK;
3325}
drhaaab5722002-02-19 13:39:21 +00003326#endif
drhdd793422001-06-28 01:54:48 +00003327
drhdd793422001-06-28 01:54:48 +00003328/*
drh5eddca62001-06-30 21:53:53 +00003329** Return the pager associated with a BTree. This routine is used for
3330** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00003331*/
drh3aac2dd2004-04-26 14:10:20 +00003332Pager *sqlite3BtreePager(Btree *pBt){
drhdd793422001-06-28 01:54:48 +00003333 return pBt->pPager;
3334}
drh5eddca62001-06-30 21:53:53 +00003335
3336/*
3337** This structure is passed around through all the sanity checking routines
3338** in order to keep track of some global state information.
3339*/
drhaaab5722002-02-19 13:39:21 +00003340typedef struct IntegrityCk IntegrityCk;
3341struct IntegrityCk {
drh100569d2001-10-02 13:01:48 +00003342 Btree *pBt; /* The tree being checked out */
3343 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
3344 int nPage; /* Number of pages in the database */
3345 int *anRef; /* Number of times each page is referenced */
drh100569d2001-10-02 13:01:48 +00003346 char *zErrMsg; /* An error message. NULL of no errors seen. */
drh5eddca62001-06-30 21:53:53 +00003347};
3348
3349/*
3350** Append a message to the error message string.
3351*/
drhaaab5722002-02-19 13:39:21 +00003352static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
drh5eddca62001-06-30 21:53:53 +00003353 if( pCheck->zErrMsg ){
3354 char *zOld = pCheck->zErrMsg;
3355 pCheck->zErrMsg = 0;
drh41743982003-12-06 21:43:55 +00003356 sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003357 sqliteFree(zOld);
3358 }else{
drh41743982003-12-06 21:43:55 +00003359 sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
drh5eddca62001-06-30 21:53:53 +00003360 }
3361}
3362
3363/*
3364** Add 1 to the reference count for page iPage. If this is the second
3365** reference to the page, add an error message to pCheck->zErrMsg.
3366** Return 1 if there are 2 ore more references to the page and 0 if
3367** if this is the first reference to the page.
3368**
3369** Also check that the page number is in bounds.
3370*/
drhaaab5722002-02-19 13:39:21 +00003371static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00003372 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00003373 if( iPage>pCheck->nPage || iPage<0 ){
drh5eddca62001-06-30 21:53:53 +00003374 char zBuf[100];
3375 sprintf(zBuf, "invalid page number %d", iPage);
3376 checkAppendMsg(pCheck, zContext, zBuf);
3377 return 1;
3378 }
3379 if( pCheck->anRef[iPage]==1 ){
3380 char zBuf[100];
3381 sprintf(zBuf, "2nd reference to page %d", iPage);
3382 checkAppendMsg(pCheck, zContext, zBuf);
3383 return 1;
3384 }
3385 return (pCheck->anRef[iPage]++)>1;
3386}
3387
3388/*
3389** Check the integrity of the freelist or of an overflow page list.
3390** Verify that the number of pages on the list is N.
3391*/
drh30e58752002-03-02 20:41:57 +00003392static void checkList(
3393 IntegrityCk *pCheck, /* Integrity checking context */
3394 int isFreeList, /* True for a freelist. False for overflow page list */
3395 int iPage, /* Page number for first page in the list */
3396 int N, /* Expected number of pages in the list */
3397 char *zContext /* Context for error messages */
3398){
3399 int i;
drh5eddca62001-06-30 21:53:53 +00003400 char zMsg[100];
drh30e58752002-03-02 20:41:57 +00003401 while( N-- > 0 ){
drh4b70f112004-05-02 21:12:19 +00003402 unsigned char *pOvfl;
drh5eddca62001-06-30 21:53:53 +00003403 if( iPage<1 ){
3404 sprintf(zMsg, "%d pages missing from overflow list", N+1);
3405 checkAppendMsg(pCheck, zContext, zMsg);
3406 break;
3407 }
3408 if( checkRef(pCheck, iPage, zContext) ) break;
3409 if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3410 sprintf(zMsg, "failed to get page %d", iPage);
3411 checkAppendMsg(pCheck, zContext, zMsg);
3412 break;
3413 }
drh30e58752002-03-02 20:41:57 +00003414 if( isFreeList ){
drh4b70f112004-05-02 21:12:19 +00003415 int n = get4byte(&pOvfl[4]);
drh0d316a42002-08-11 20:10:47 +00003416 for(i=0; i<n; i++){
drh4b70f112004-05-02 21:12:19 +00003417 checkRef(pCheck, get4byte(&pOvfl[8+i*4]), zContext);
drh30e58752002-03-02 20:41:57 +00003418 }
drh0d316a42002-08-11 20:10:47 +00003419 N -= n;
drh30e58752002-03-02 20:41:57 +00003420 }
drh4b70f112004-05-02 21:12:19 +00003421 iPage = get4byte(pOvfl);
drh5eddca62001-06-30 21:53:53 +00003422 sqlitepager_unref(pOvfl);
3423 }
3424}
3425
3426/*
drh1bffb9c2002-02-03 17:37:36 +00003427** Return negative if zKey1<zKey2.
3428** Return zero if zKey1==zKey2.
3429** Return positive if zKey1>zKey2.
3430*/
3431static int keyCompare(
3432 const char *zKey1, int nKey1,
3433 const char *zKey2, int nKey2
3434){
3435 int min = nKey1>nKey2 ? nKey2 : nKey1;
3436 int c = memcmp(zKey1, zKey2, min);
3437 if( c==0 ){
3438 c = nKey1 - nKey2;
3439 }
3440 return c;
3441}
3442
3443/*
drh5eddca62001-06-30 21:53:53 +00003444** Do various sanity checks on a single page of a tree. Return
3445** the tree depth. Root pages return 0. Parents of root pages
3446** return 1, and so forth.
3447**
3448** These checks are done:
3449**
3450** 1. Make sure that cells and freeblocks do not overlap
3451** but combine to completely cover the page.
3452** 2. Make sure cell keys are in order.
3453** 3. Make sure no key is less than or equal to zLowerBound.
3454** 4. Make sure no key is greater than or equal to zUpperBound.
3455** 5. Check the integrity of overflow pages.
3456** 6. Recursively call checkTreePage on all children.
3457** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00003458** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00003459** the root of the tree.
3460*/
3461static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00003462 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00003463 int iPage, /* Page number of the page to check */
3464 MemPage *pParent, /* Parent page */
3465 char *zParentContext, /* Parent context */
3466 char *zLowerBound, /* All keys should be greater than this, if not NULL */
drh1bffb9c2002-02-03 17:37:36 +00003467 int nLower, /* Number of characters in zLowerBound */
3468 char *zUpperBound, /* All keys should be less than this, if not NULL */
3469 int nUpper /* Number of characters in zUpperBound */
drh5eddca62001-06-30 21:53:53 +00003470){
3471 MemPage *pPage;
3472 int i, rc, depth, d2, pgno;
3473 char *zKey1, *zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003474 int nKey1, nKey2;
drh5eddca62001-06-30 21:53:53 +00003475 BtCursor cur;
drh0d316a42002-08-11 20:10:47 +00003476 Btree *pBt;
drh5eddca62001-06-30 21:53:53 +00003477 char zMsg[100];
3478 char zContext[100];
drhd0ba1932004-02-10 01:54:28 +00003479 char hit[SQLITE_USABLE_SIZE];
drh5eddca62001-06-30 21:53:53 +00003480
3481 /* Check that the page exists
3482 */
drh0d316a42002-08-11 20:10:47 +00003483 cur.pBt = pBt = pCheck->pBt;
drh5eddca62001-06-30 21:53:53 +00003484 if( iPage==0 ) return 0;
3485 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3486 sprintf(zContext, "On tree page %d: ", iPage);
drh4b70f112004-05-02 21:12:19 +00003487 if( (rc = getPage(pBt, (Pgno)iPage, &pPage))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003488 sprintf(zMsg, "unable to get the page. error code=%d", rc);
3489 checkAppendMsg(pCheck, zContext, zMsg);
3490 return 0;
3491 }
drh4b70f112004-05-02 21:12:19 +00003492 if( (rc = initPage(pPage, pParent))!=0 ){
drh5eddca62001-06-30 21:53:53 +00003493 sprintf(zMsg, "initPage() returns error code %d", rc);
3494 checkAppendMsg(pCheck, zContext, zMsg);
3495 sqlitepager_unref(pPage);
3496 return 0;
3497 }
3498
drh4b70f112004-05-02 21:12:19 +00003499#if 0
3500
drh5eddca62001-06-30 21:53:53 +00003501 /* Check out all the cells.
3502 */
3503 depth = 0;
drh1bffb9c2002-02-03 17:37:36 +00003504 if( zLowerBound ){
3505 zKey1 = sqliteMalloc( nLower+1 );
3506 memcpy(zKey1, zLowerBound, nLower);
3507 zKey1[nLower] = 0;
3508 }else{
3509 zKey1 = 0;
3510 }
3511 nKey1 = nLower;
drh5eddca62001-06-30 21:53:53 +00003512 cur.pPage = pPage;
drh5eddca62001-06-30 21:53:53 +00003513 for(i=0; i<pPage->nCell; i++){
3514 Cell *pCell = pPage->apCell[i];
3515 int sz;
3516
3517 /* Check payload overflow pages
3518 */
drh0d316a42002-08-11 20:10:47 +00003519 nKey2 = NKEY(pBt, pCell->h);
3520 sz = nKey2 + NDATA(pBt, pCell->h);
drh5eddca62001-06-30 21:53:53 +00003521 sprintf(zContext, "On page %d cell %d: ", iPage, i);
3522 if( sz>MX_LOCAL_PAYLOAD ){
3523 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
drh0d316a42002-08-11 20:10:47 +00003524 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00003525 }
3526
3527 /* Check that keys are in the right order
3528 */
3529 cur.idx = i;
drh8c1238a2003-01-02 14:43:55 +00003530 zKey2 = sqliteMallocRaw( nKey2+1 );
drh1bffb9c2002-02-03 17:37:36 +00003531 getPayload(&cur, 0, nKey2, zKey2);
3532 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
drh5eddca62001-06-30 21:53:53 +00003533 checkAppendMsg(pCheck, zContext, "Key is out of order");
3534 }
3535
3536 /* Check sanity of left child page.
3537 */
drh0d316a42002-08-11 20:10:47 +00003538 pgno = SWAB32(pBt, pCell->h.leftChild);
drh1bffb9c2002-02-03 17:37:36 +00003539 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
drh5eddca62001-06-30 21:53:53 +00003540 if( i>0 && d2!=depth ){
3541 checkAppendMsg(pCheck, zContext, "Child page depth differs");
3542 }
3543 depth = d2;
3544 sqliteFree(zKey1);
3545 zKey1 = zKey2;
drh1bffb9c2002-02-03 17:37:36 +00003546 nKey1 = nKey2;
drh5eddca62001-06-30 21:53:53 +00003547 }
drh0d316a42002-08-11 20:10:47 +00003548 pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
drh5eddca62001-06-30 21:53:53 +00003549 sprintf(zContext, "On page %d at right child: ", iPage);
drh1bffb9c2002-02-03 17:37:36 +00003550 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
drh5eddca62001-06-30 21:53:53 +00003551 sqliteFree(zKey1);
3552
3553 /* Check for complete coverage of the page
3554 */
3555 memset(hit, 0, sizeof(hit));
3556 memset(hit, 1, sizeof(PageHdr));
drhd0ba1932004-02-10 01:54:28 +00003557 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003558 Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3559 int j;
drh0d316a42002-08-11 20:10:47 +00003560 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3561 i = SWAB16(pBt, pCell->h.iNext);
drh5eddca62001-06-30 21:53:53 +00003562 }
drhd0ba1932004-02-10 01:54:28 +00003563 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){
drh5eddca62001-06-30 21:53:53 +00003564 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3565 int j;
drh0d316a42002-08-11 20:10:47 +00003566 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3567 i = SWAB16(pBt,pFBlk->iNext);
drh5eddca62001-06-30 21:53:53 +00003568 }
drhd0ba1932004-02-10 01:54:28 +00003569 for(i=0; i<SQLITE_USABLE_SIZE; i++){
drh5eddca62001-06-30 21:53:53 +00003570 if( hit[i]==0 ){
3571 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3572 checkAppendMsg(pCheck, zMsg, 0);
3573 break;
3574 }else if( hit[i]>1 ){
3575 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3576 checkAppendMsg(pCheck, zMsg, 0);
3577 break;
3578 }
3579 }
3580
drh6019e162001-07-02 17:51:45 +00003581#endif
3582
drh4b70f112004-05-02 21:12:19 +00003583 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00003584 return depth;
3585}
3586
3587/*
3588** This routine does a complete check of the given BTree file. aRoot[] is
3589** an array of pages numbers were each page number is the root page of
3590** a table. nRoot is the number of entries in aRoot.
3591**
3592** If everything checks out, this routine returns NULL. If something is
3593** amiss, an error message is written into memory obtained from malloc()
3594** and a pointer to that error message is returned. The calling function
3595** is responsible for freeing the error message when it is done.
3596*/
drh3aac2dd2004-04-26 14:10:20 +00003597char *sqlite3BtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
drh5eddca62001-06-30 21:53:53 +00003598 int i;
3599 int nRef;
drhaaab5722002-02-19 13:39:21 +00003600 IntegrityCk sCheck;
drh5eddca62001-06-30 21:53:53 +00003601
3602 nRef = *sqlitepager_stats(pBt->pPager);
drhefc251d2001-07-01 22:12:01 +00003603 if( lockBtree(pBt)!=SQLITE_OK ){
3604 return sqliteStrDup("Unable to acquire a read lock on the database");
3605 }
drh5eddca62001-06-30 21:53:53 +00003606 sCheck.pBt = pBt;
3607 sCheck.pPager = pBt->pPager;
3608 sCheck.nPage = sqlitepager_pagecount(sCheck.pPager);
drh0de8c112002-07-06 16:32:14 +00003609 if( sCheck.nPage==0 ){
3610 unlockBtreeIfUnused(pBt);
3611 return 0;
3612 }
drh8c1238a2003-01-02 14:43:55 +00003613 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
drh5eddca62001-06-30 21:53:53 +00003614 sCheck.anRef[1] = 1;
3615 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3616 sCheck.zErrMsg = 0;
3617
3618 /* Check the integrity of the freelist
3619 */
drh0d316a42002-08-11 20:10:47 +00003620 checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList),
3621 SWAB32(pBt, pBt->page1->nFree), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00003622
3623 /* Check all the tables.
3624 */
3625 for(i=0; i<nRoot; i++){
drh4ff6dfa2002-03-03 23:06:00 +00003626 if( aRoot[i]==0 ) continue;
drh1bffb9c2002-02-03 17:37:36 +00003627 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
drh5eddca62001-06-30 21:53:53 +00003628 }
3629
3630 /* Make sure every page in the file is referenced
3631 */
3632 for(i=1; i<=sCheck.nPage; i++){
3633 if( sCheck.anRef[i]==0 ){
3634 char zBuf[100];
3635 sprintf(zBuf, "Page %d is never used", i);
3636 checkAppendMsg(&sCheck, zBuf, 0);
3637 }
3638 }
3639
3640 /* Make sure this analysis did not leave any unref() pages
3641 */
drh5e00f6c2001-09-13 13:46:56 +00003642 unlockBtreeIfUnused(pBt);
drh5eddca62001-06-30 21:53:53 +00003643 if( nRef != *sqlitepager_stats(pBt->pPager) ){
3644 char zBuf[100];
3645 sprintf(zBuf,
3646 "Outstanding page count goes from %d to %d during this analysis",
3647 nRef, *sqlitepager_stats(pBt->pPager)
3648 );
3649 checkAppendMsg(&sCheck, zBuf, 0);
3650 }
3651
3652 /* Clean up and report errors.
3653 */
3654 sqliteFree(sCheck.anRef);
3655 return sCheck.zErrMsg;
3656}
paulb95a8862003-04-01 21:16:41 +00003657
drh73509ee2003-04-06 20:44:45 +00003658/*
3659** Return the full pathname of the underlying database file.
3660*/
drh3aac2dd2004-04-26 14:10:20 +00003661const char *sqlite3BtreeGetFilename(Btree *pBt){
drh73509ee2003-04-06 20:44:45 +00003662 assert( pBt->pPager!=0 );
3663 return sqlitepager_filename(pBt->pPager);
3664}
3665
3666/*
drhf7c57532003-04-25 13:22:51 +00003667** Copy the complete content of pBtFrom into pBtTo. A transaction
3668** must be active for both files.
3669**
3670** The size of file pBtFrom may be reduced by this operation.
3671** If anything goes wrong, the transaction on pBtFrom is rolled back.
drh73509ee2003-04-06 20:44:45 +00003672*/
drh3aac2dd2004-04-26 14:10:20 +00003673int sqlite3BtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
drhf7c57532003-04-25 13:22:51 +00003674 int rc = SQLITE_OK;
drh2e6d11b2003-04-25 15:37:57 +00003675 Pgno i, nPage, nToPage;
drhf7c57532003-04-25 13:22:51 +00003676
3677 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
3678 if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR;
3679 if( pBtTo->pCursor ) return SQLITE_BUSY;
drhd0ba1932004-02-10 01:54:28 +00003680 memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE);
drh2e6d11b2003-04-25 15:37:57 +00003681 rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1);
3682 nToPage = sqlitepager_pagecount(pBtTo->pPager);
drhf7c57532003-04-25 13:22:51 +00003683 nPage = sqlitepager_pagecount(pBtFrom->pPager);
drh2e6d11b2003-04-25 15:37:57 +00003684 for(i=2; rc==SQLITE_OK && i<=nPage; i++){
drhf7c57532003-04-25 13:22:51 +00003685 void *pPage;
3686 rc = sqlitepager_get(pBtFrom->pPager, i, &pPage);
3687 if( rc ) break;
drh2e6d11b2003-04-25 15:37:57 +00003688 rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage);
3689 if( rc ) break;
drhf7c57532003-04-25 13:22:51 +00003690 sqlitepager_unref(pPage);
3691 }
drh2e6d11b2003-04-25 15:37:57 +00003692 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3693 void *pPage;
3694 rc = sqlitepager_get(pBtTo->pPager, i, &pPage);
3695 if( rc ) break;
3696 rc = sqlitepager_write(pPage);
3697 sqlitepager_unref(pPage);
3698 sqlitepager_dont_write(pBtTo->pPager, i);
3699 }
3700 if( !rc && nPage<nToPage ){
3701 rc = sqlitepager_truncate(pBtTo->pPager, nPage);
3702 }
drhf7c57532003-04-25 13:22:51 +00003703 if( rc ){
drh3aac2dd2004-04-26 14:10:20 +00003704 sqlite3BtreeRollback(pBtTo);
drhf7c57532003-04-25 13:22:51 +00003705 }
3706 return rc;
drh73509ee2003-04-06 20:44:45 +00003707}