blob: 218c96835229074b1f6d1a959f9043ad12141868 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
shane5eff7cf2009-08-10 03:57:58 +000012** $Id: btree.c,v 1.705 2009/08/10 03:57:58 shane Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of file format and an overview of operation.
drha059ad02001-04-17 20:09:11 +000017*/
drha3152892007-05-05 11:48:52 +000018#include "btreeInt.h"
paulb95a8862003-04-01 21:16:41 +000019
drh8c42ca92001-06-22 19:15:00 +000020/*
drha3152892007-05-05 11:48:52 +000021** The header string that appears at the beginning of every
22** SQLite database.
drh556b2a22005-06-14 16:04:05 +000023*/
drh556b2a22005-06-14 16:04:05 +000024static const char zMagicHeader[] = SQLITE_FILE_HEADER;
drh08ed44e2001-04-29 23:32:55 +000025
/*
** Debug tracing support.
**
** Change the "#if 0" below to "#if 1" to compile in the global flag
** sqlite3BtreeTrace and a TRACE() macro.  When the flag is non-zero,
** TRACE((fmt, ...)) passes its parenthesized argument list to printf()
** and then flushes stdout.  In the default configuration TRACE()
** expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
drh615ae552005-01-16 23:21:00 +000036
drh86f8c192007-08-22 00:39:19 +000037
38
drhe53831d2007-08-17 01:14:38 +000039#ifndef SQLITE_OMIT_SHARED_CACHE
40/*
danielk1977502b4e02008-09-02 14:07:24 +000041** A list of BtShared objects that are eligible for participation
42** in shared cache. This variable has file scope during normal builds,
43** but the test harness needs to access it so we make it global for
44** test builds.
drh7555d8e2009-03-20 13:15:30 +000045**
46** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
drhe53831d2007-08-17 01:14:38 +000047*/
48#ifdef SQLITE_TEST
drh78f82d12008-09-02 00:52:52 +000049BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000050#else
drh78f82d12008-09-02 00:52:52 +000051static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000052#endif
drhe53831d2007-08-17 01:14:38 +000053#endif /* SQLITE_OMIT_SHARED_CACHE */
54
55#ifndef SQLITE_OMIT_SHARED_CACHE
56/*
57** Enable or disable the shared pager and schema features.
58**
59** This routine has no effect on existing database connections.
60** The shared cache setting effects only future calls to
61** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
62*/
63int sqlite3_enable_shared_cache(int enable){
danielk1977502b4e02008-09-02 14:07:24 +000064 sqlite3GlobalConfig.sharedCacheEnabled = enable;
drhe53831d2007-08-17 01:14:38 +000065 return SQLITE_OK;
66}
67#endif
68
drhd677b3d2007-08-20 22:48:41 +000069
danielk1977aef0bf62005-12-30 16:28:01 +000070
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The shared-cache table-lock helpers operate on the BtShared.pLock
  ** linked list, which stores shared-cache table level locks.  When the
  ** library is built without the shared-cache feature there is only ever
  ** one user of each BtShared structure, so no such locking is needed.
  ** Each helper is therefore replaced by a no-op macro here:
  **
  **   - the lock query/set helpers always report SQLITE_OK,
  **   - the clear/downgrade helpers expand to nothing,
  **   - the assert()-only predicates report "lock held" (1) and
  **     "no read conflicts" (0) respectively.
  */
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
#endif
danielk1977aef0bf62005-12-30 16:28:01 +000088
drhe53831d2007-08-17 01:14:38 +000089#ifndef SQLITE_OMIT_SHARED_CACHE
danielk197796d48e92009-06-29 06:00:37 +000090
91#ifdef SQLITE_DEBUG
92/*
93** This function is only used as part of an assert() statement. It checks
94** that connection p holds the required locks to read or write to the
95** b-tree with root page iRoot. If so, true is returned. Otherwise, false.
96** For example, when writing to a table b-tree with root-page iRoot via
97** Btree connection pBtree:
98**
99** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
100**
101** When writing to an index b-tree that resides in a sharable database, the
102** caller should have first obtained a lock specifying the root page of
103** the corresponding table b-tree. This makes things a bit more complicated,
104** as this module treats each b-tree as a separate structure. To determine
105** the table b-tree corresponding to the index b-tree being written, this
106** function has to search through the database schema.
107**
108** Instead of a lock on the b-tree rooted at page iRoot, the caller may
109** hold a write-lock on the schema table (root page 1). This is also
110** acceptable.
111*/
112static int hasSharedCacheTableLock(
113 Btree *pBtree, /* Handle that must hold lock */
114 Pgno iRoot, /* Root page of b-tree */
115 int isIndex, /* True if iRoot is the root of an index b-tree */
116 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */
117){
118 Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
119 Pgno iTab = 0;
120 BtLock *pLock;
121
122 /* If this b-tree database is not shareable, or if the client is reading
123 ** and has the read-uncommitted flag set, then no lock is required.
124 ** In these cases return true immediately. If the client is reading
125 ** or writing an index b-tree, but the schema is not loaded, then return
126 ** true also. In this case the lock is required, but it is too difficult
127 ** to check if the client actually holds it. This doesn't happen very
128 ** often. */
129 if( (pBtree->sharable==0)
130 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
131 || (isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0 ))
132 ){
133 return 1;
134 }
135
136 /* Figure out the root-page that the lock should be held on. For table
137 ** b-trees, this is just the root page of the b-tree being read or
138 ** written. For index b-trees, it is the root page of the associated
139 ** table. */
140 if( isIndex ){
141 HashElem *p;
142 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
143 Index *pIdx = (Index *)sqliteHashData(p);
shane5eff7cf2009-08-10 03:57:58 +0000144 if( pIdx->tnum==(int)iRoot ){
145 iTab = pIdx->pTable->tnum;
danielk197796d48e92009-06-29 06:00:37 +0000146 }
147 }
148 }else{
149 iTab = iRoot;
150 }
151
152 /* Search for the required lock. Either a write-lock on root-page iTab, a
153 ** write-lock on the schema table, or (if the client is reading) a
154 ** read-lock on iTab will suffice. Return 1 if any of these are found. */
155 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
156 if( pLock->pBtree==pBtree
157 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
158 && pLock->eLock>=eLockType
159 ){
160 return 1;
161 }
162 }
163
164 /* Failed to find the required lock. */
165 return 0;
166}
167
168/*
169** This function is also used as part of assert() statements only. It
170** returns true if there exist one or more cursors open on the table
171** with root page iRoot that do not belong to either connection pBtree
172** or some other connection that has the read-uncommitted flag set.
173**
174** For example, before writing to page iRoot:
175**
176** assert( !hasReadConflicts(pBtree, iRoot) );
177*/
178static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
179 BtCursor *p;
180 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
181 if( p->pgnoRoot==iRoot
182 && p->pBtree!=pBtree
183 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
184 ){
185 return 1;
186 }
187 }
188 return 0;
189}
190#endif /* #ifdef SQLITE_DEBUG */
191
danielk1977da184232006-01-05 11:34:32 +0000192/*
danielk1977aef0bf62005-12-30 16:28:01 +0000193** Query to see if btree handle p may obtain a lock of type eLock
194** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
drhc25eabe2009-02-24 18:57:31 +0000195** SQLITE_OK if the lock may be obtained (by calling
196** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
danielk1977aef0bf62005-12-30 16:28:01 +0000197*/
drhc25eabe2009-02-24 18:57:31 +0000198static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
danielk1977aef0bf62005-12-30 16:28:01 +0000199 BtShared *pBt = p->pBt;
200 BtLock *pIter;
201
drh1fee73e2007-08-29 04:00:57 +0000202 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000203 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
204 assert( p->db!=0 );
danielk1977e0d9e6f2009-07-03 16:25:06 +0000205 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
drhd677b3d2007-08-20 22:48:41 +0000206
danielk19775b413d72009-04-01 09:41:54 +0000207 /* If requesting a write-lock, then the Btree must have an open write
208 ** transaction on this file. And, obviously, for this to be so there
209 ** must be an open write transaction on the file itself.
210 */
211 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
212 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
213
danielk1977da184232006-01-05 11:34:32 +0000214 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000215 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000216 return SQLITE_OK;
217 }
218
danielk1977641b0f42007-12-21 04:47:25 +0000219 /* If some other connection is holding an exclusive lock, the
220 ** requested lock may not be obtained.
221 */
danielk1977404ca072009-03-16 13:19:36 +0000222 if( pBt->pWriter!=p && pBt->isExclusive ){
223 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
224 return SQLITE_LOCKED_SHAREDCACHE;
danielk1977641b0f42007-12-21 04:47:25 +0000225 }
226
danielk1977e0d9e6f2009-07-03 16:25:06 +0000227 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
228 /* The condition (pIter->eLock!=eLock) in the following if(...)
229 ** statement is a simplification of:
230 **
231 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
232 **
233 ** since we know that if eLock==WRITE_LOCK, then no other connection
234 ** may hold a WRITE_LOCK on any table in this file (since there can
235 ** only be a single writer).
236 */
237 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
238 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
239 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
240 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
241 if( eLock==WRITE_LOCK ){
242 assert( p==pBt->pWriter );
243 pBt->isPending = 1;
danielk1977da184232006-01-05 11:34:32 +0000244 }
danielk1977e0d9e6f2009-07-03 16:25:06 +0000245 return SQLITE_LOCKED_SHAREDCACHE;
danielk1977aef0bf62005-12-30 16:28:01 +0000246 }
247 }
248 return SQLITE_OK;
249}
drhe53831d2007-08-17 01:14:38 +0000250#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000251
drhe53831d2007-08-17 01:14:38 +0000252#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000253/*
254** Add a lock on the table with root-page iTable to the shared-btree used
255** by Btree handle p. Parameter eLock must be either READ_LOCK or
256** WRITE_LOCK.
257**
danielk19779d104862009-07-09 08:27:14 +0000258** This function assumes the following:
259**
260** (a) The specified b-tree connection handle is connected to a sharable
261** b-tree database (one with the BtShared.sharable) flag set, and
262**
263** (b) No other b-tree connection handle holds a lock that conflicts
264** with the requested lock (i.e. querySharedCacheTableLock() has
265** already been called and returned SQLITE_OK).
266**
267** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
268** is returned if a malloc attempt fails.
danielk1977aef0bf62005-12-30 16:28:01 +0000269*/
drhc25eabe2009-02-24 18:57:31 +0000270static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
danielk1977aef0bf62005-12-30 16:28:01 +0000271 BtShared *pBt = p->pBt;
272 BtLock *pLock = 0;
273 BtLock *pIter;
274
drh1fee73e2007-08-29 04:00:57 +0000275 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000276 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
277 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000278
danielk1977e0d9e6f2009-07-03 16:25:06 +0000279 /* A connection with the read-uncommitted flag set will never try to
280 ** obtain a read-lock using this function. The only read-lock obtained
281 ** by a connection in read-uncommitted mode is on the sqlite_master
282 ** table, and that lock is obtained in BtreeBeginTrans(). */
283 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
284
danielk19779d104862009-07-09 08:27:14 +0000285 /* This function should only be called on a sharable b-tree after it
286 ** has been determined that no other b-tree holds a conflicting lock. */
287 assert( p->sharable );
drhc25eabe2009-02-24 18:57:31 +0000288 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
danielk1977aef0bf62005-12-30 16:28:01 +0000289
290 /* First search the list for an existing lock on this table. */
291 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
292 if( pIter->iTable==iTable && pIter->pBtree==p ){
293 pLock = pIter;
294 break;
295 }
296 }
297
298 /* If the above search did not find a BtLock struct associating Btree p
299 ** with table iTable, allocate one and link it into the list.
300 */
301 if( !pLock ){
drh17435752007-08-16 04:30:38 +0000302 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
danielk1977aef0bf62005-12-30 16:28:01 +0000303 if( !pLock ){
304 return SQLITE_NOMEM;
305 }
306 pLock->iTable = iTable;
307 pLock->pBtree = p;
308 pLock->pNext = pBt->pLock;
309 pBt->pLock = pLock;
310 }
311
312 /* Set the BtLock.eLock variable to the maximum of the current lock
313 ** and the requested lock. This means if a write-lock was already held
314 ** and a read-lock requested, we don't incorrectly downgrade the lock.
315 */
316 assert( WRITE_LOCK>READ_LOCK );
danielk19775118b912005-12-30 16:31:53 +0000317 if( eLock>pLock->eLock ){
318 pLock->eLock = eLock;
319 }
danielk1977aef0bf62005-12-30 16:28:01 +0000320
321 return SQLITE_OK;
322}
drhe53831d2007-08-17 01:14:38 +0000323#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000324
drhe53831d2007-08-17 01:14:38 +0000325#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000326/*
drhc25eabe2009-02-24 18:57:31 +0000327** Release all the table locks (locks obtained via calls to
328** the setSharedCacheTableLock() procedure) held by Btree handle p.
danielk1977fa542f12009-04-02 18:28:08 +0000329**
330** This function assumes that handle p has an open read or write
331** transaction. If it does not, then the BtShared.isPending variable
332** may be incorrectly cleared.
danielk1977aef0bf62005-12-30 16:28:01 +0000333*/
drhc25eabe2009-02-24 18:57:31 +0000334static void clearAllSharedCacheTableLocks(Btree *p){
danielk1977641b0f42007-12-21 04:47:25 +0000335 BtShared *pBt = p->pBt;
336 BtLock **ppIter = &pBt->pLock;
danielk1977da184232006-01-05 11:34:32 +0000337
drh1fee73e2007-08-29 04:00:57 +0000338 assert( sqlite3BtreeHoldsMutex(p) );
drhe53831d2007-08-17 01:14:38 +0000339 assert( p->sharable || 0==*ppIter );
danielk1977fa542f12009-04-02 18:28:08 +0000340 assert( p->inTrans>0 );
danielk1977da184232006-01-05 11:34:32 +0000341
danielk1977aef0bf62005-12-30 16:28:01 +0000342 while( *ppIter ){
343 BtLock *pLock = *ppIter;
danielk1977404ca072009-03-16 13:19:36 +0000344 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
danielk1977fa542f12009-04-02 18:28:08 +0000345 assert( pLock->pBtree->inTrans>=pLock->eLock );
danielk1977aef0bf62005-12-30 16:28:01 +0000346 if( pLock->pBtree==p ){
347 *ppIter = pLock->pNext;
danielk1977602b4662009-07-02 07:47:33 +0000348 assert( pLock->iTable!=1 || pLock==&p->lock );
349 if( pLock->iTable!=1 ){
350 sqlite3_free(pLock);
351 }
danielk1977aef0bf62005-12-30 16:28:01 +0000352 }else{
353 ppIter = &pLock->pNext;
354 }
355 }
danielk1977641b0f42007-12-21 04:47:25 +0000356
danielk1977404ca072009-03-16 13:19:36 +0000357 assert( pBt->isPending==0 || pBt->pWriter );
358 if( pBt->pWriter==p ){
359 pBt->pWriter = 0;
360 pBt->isExclusive = 0;
361 pBt->isPending = 0;
362 }else if( pBt->nTransaction==2 ){
363 /* This function is called when connection p is concluding its
364 ** transaction. If there currently exists a writer, and p is not
365 ** that writer, then the number of locks held by connections other
366 ** than the writer must be about to drop to zero. In this case
367 ** set the isPending flag to 0.
368 **
369 ** If there is not currently a writer, then BtShared.isPending must
370 ** be zero already. So this next line is harmless in that case.
371 */
372 pBt->isPending = 0;
danielk1977641b0f42007-12-21 04:47:25 +0000373 }
danielk1977aef0bf62005-12-30 16:28:01 +0000374}
danielk197794b30732009-07-02 17:21:57 +0000375
danielk1977e0d9e6f2009-07-03 16:25:06 +0000376/*
377** This function changes all write-locks held by connection p to read-locks.
378*/
danielk197794b30732009-07-02 17:21:57 +0000379static void downgradeAllSharedCacheTableLocks(Btree *p){
380 BtShared *pBt = p->pBt;
381 if( pBt->pWriter==p ){
382 BtLock *pLock;
383 pBt->pWriter = 0;
384 pBt->isExclusive = 0;
385 pBt->isPending = 0;
386 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
387 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
388 pLock->eLock = READ_LOCK;
389 }
390 }
391}
392
danielk1977aef0bf62005-12-30 16:28:01 +0000393#endif /* SQLITE_OMIT_SHARED_CACHE */
394
drh980b1a72006-08-16 16:42:48 +0000395static void releasePage(MemPage *pPage); /* Forward reference */
396
drh1fee73e2007-08-29 04:00:57 +0000397/*
398** Verify that the cursor holds a mutex on the BtShared
399*/
400#ifndef NDEBUG
401static int cursorHoldsMutex(BtCursor *p){
drhff0587c2007-08-29 17:43:19 +0000402 return sqlite3_mutex_held(p->pBt->mutex);
drh1fee73e2007-08-29 04:00:57 +0000403}
404#endif
405
406
danielk197792d4d7a2007-05-04 12:05:56 +0000407#ifndef SQLITE_OMIT_INCRBLOB
408/*
409** Invalidate the overflow page-list cache for cursor pCur, if any.
410*/
411static void invalidateOverflowCache(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000412 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000413 sqlite3_free(pCur->aOverflow);
danielk197792d4d7a2007-05-04 12:05:56 +0000414 pCur->aOverflow = 0;
415}
416
417/*
418** Invalidate the overflow page-list cache for all cursors opened
419** on the shared btree structure pBt.
420*/
421static void invalidateAllOverflowCache(BtShared *pBt){
422 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000423 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +0000424 for(p=pBt->pCursor; p; p=p->pNext){
425 invalidateOverflowCache(p);
426 }
427}
danielk197796d48e92009-06-29 06:00:37 +0000428
429/*
430** This function is called before modifying the contents of a table
431** b-tree to invalidate any incrblob cursors that are open on the
drheeb844a2009-08-08 18:01:07 +0000432** row or one of the rows being modified.
danielk197796d48e92009-06-29 06:00:37 +0000433**
434** If argument isClearTable is true, then the entire contents of the
435** table is about to be deleted. In this case invalidate all incrblob
436** cursors open on any row within the table with root-page pgnoRoot.
437**
438** Otherwise, if argument isClearTable is false, then the row with
439** rowid iRow is being replaced or deleted. In this case invalidate
440** only those incrblob cursors open on this specific row.
441*/
442static void invalidateIncrblobCursors(
443 Btree *pBtree, /* The database file to check */
danielk197796d48e92009-06-29 06:00:37 +0000444 i64 iRow, /* The rowid that might be changing */
445 int isClearTable /* True if all rows are being deleted */
446){
447 BtCursor *p;
448 BtShared *pBt = pBtree->pBt;
449 assert( sqlite3BtreeHoldsMutex(pBtree) );
450 for(p=pBt->pCursor; p; p=p->pNext){
451 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
452 p->eState = CURSOR_INVALID;
453 }
454 }
455}
456
danielk197792d4d7a2007-05-04 12:05:56 +0000457#else
458 #define invalidateOverflowCache(x)
459 #define invalidateAllOverflowCache(x)
drheeb844a2009-08-08 18:01:07 +0000460 #define invalidateIncrblobCursors(x,y,z)
danielk197792d4d7a2007-05-04 12:05:56 +0000461#endif
462
drh980b1a72006-08-16 16:42:48 +0000463/*
danielk1977bea2a942009-01-20 17:06:27 +0000464** Set bit pgno of the BtShared.pHasContent bitvec. This is called
465** when a page that previously contained data becomes a free-list leaf
466** page.
467**
468** The BtShared.pHasContent bitvec exists to work around an obscure
469** bug caused by the interaction of two useful IO optimizations surrounding
470** free-list leaf pages:
471**
472** 1) When all data is deleted from a page and the page becomes
473** a free-list leaf page, the page is not written to the database
474** (as free-list leaf pages contain no meaningful data). Sometimes
475** such a page is not even journalled (as it will not be modified,
476** why bother journalling it?).
477**
478** 2) When a free-list leaf page is reused, its content is not read
479** from the database or written to the journal file (why should it
480** be, if it is not at all meaningful?).
481**
482** By themselves, these optimizations work fine and provide a handy
483** performance boost to bulk delete or insert operations. However, if
484** a page is moved to the free-list and then reused within the same
485** transaction, a problem comes up. If the page is not journalled when
486** it is moved to the free-list and it is also not journalled when it
487** is extracted from the free-list and reused, then the original data
488** may be lost. In the event of a rollback, it may not be possible
489** to restore the database to its original configuration.
490**
491** The solution is the BtShared.pHasContent bitvec. Whenever a page is
492** moved to become a free-list leaf page, the corresponding bit is
493** set in the bitvec. Whenever a leaf page is extracted from the free-list,
494** optimization 2 above is ommitted if the corresponding bit is already
495** set in BtShared.pHasContent. The contents of the bitvec are cleared
496** at the end of every transaction.
497*/
498static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
499 int rc = SQLITE_OK;
500 if( !pBt->pHasContent ){
drh4c301aa2009-07-15 17:25:45 +0000501 int nPage = 100;
502 sqlite3PagerPagecount(pBt->pPager, &nPage);
503 /* If sqlite3PagerPagecount() fails there is no harm because the
504 ** nPage variable is unchanged from its default value of 100 */
505 pBt->pHasContent = sqlite3BitvecCreate((u32)nPage);
506 if( !pBt->pHasContent ){
507 rc = SQLITE_NOMEM;
danielk1977bea2a942009-01-20 17:06:27 +0000508 }
509 }
510 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
511 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
512 }
513 return rc;
514}
515
516/*
517** Query the BtShared.pHasContent vector.
518**
519** This function is called when a free-list leaf page is removed from the
520** free-list for reuse. It returns false if it is safe to retrieve the
521** page from the pager layer with the 'no-content' flag set. True otherwise.
522*/
523static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
524 Bitvec *p = pBt->pHasContent;
525 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
526}
527
528/*
529** Clear (destroy) the BtShared.pHasContent bitvec. This should be
530** invoked at the conclusion of each write-transaction.
531*/
532static void btreeClearHasContent(BtShared *pBt){
533 sqlite3BitvecDestroy(pBt->pHasContent);
534 pBt->pHasContent = 0;
535}
536
537/*
drh980b1a72006-08-16 16:42:48 +0000538** Save the current cursor position in the variables BtCursor.nKey
539** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
drhea8ffdf2009-07-22 00:35:23 +0000540**
541** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
542** prior to calling this routine.
drh980b1a72006-08-16 16:42:48 +0000543*/
544static int saveCursorPosition(BtCursor *pCur){
545 int rc;
546
547 assert( CURSOR_VALID==pCur->eState );
548 assert( 0==pCur->pKey );
drh1fee73e2007-08-29 04:00:57 +0000549 assert( cursorHoldsMutex(pCur) );
drh980b1a72006-08-16 16:42:48 +0000550
551 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
drhea8ffdf2009-07-22 00:35:23 +0000552 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */
drh980b1a72006-08-16 16:42:48 +0000553
554 /* If this is an intKey table, then the above call to BtreeKeySize()
555 ** stores the integer key in pCur->nKey. In this case this value is
556 ** all that is required. Otherwise, if pCur is not open on an intKey
557 ** table, then malloc space for and store the pCur->nKey bytes of key
558 ** data.
559 */
drh4c301aa2009-07-15 17:25:45 +0000560 if( 0==pCur->apPage[0]->intKey ){
drhf49661a2008-12-10 16:45:50 +0000561 void *pKey = sqlite3Malloc( (int)pCur->nKey );
drh980b1a72006-08-16 16:42:48 +0000562 if( pKey ){
drhf49661a2008-12-10 16:45:50 +0000563 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
drh980b1a72006-08-16 16:42:48 +0000564 if( rc==SQLITE_OK ){
565 pCur->pKey = pKey;
566 }else{
drh17435752007-08-16 04:30:38 +0000567 sqlite3_free(pKey);
drh980b1a72006-08-16 16:42:48 +0000568 }
569 }else{
570 rc = SQLITE_NOMEM;
571 }
572 }
danielk197771d5d2c2008-09-29 11:49:47 +0000573 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
drh980b1a72006-08-16 16:42:48 +0000574
575 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +0000576 int i;
577 for(i=0; i<=pCur->iPage; i++){
578 releasePage(pCur->apPage[i]);
579 pCur->apPage[i] = 0;
580 }
581 pCur->iPage = -1;
drh980b1a72006-08-16 16:42:48 +0000582 pCur->eState = CURSOR_REQUIRESEEK;
583 }
584
danielk197792d4d7a2007-05-04 12:05:56 +0000585 invalidateOverflowCache(pCur);
drh980b1a72006-08-16 16:42:48 +0000586 return rc;
587}
588
589/*
590** Save the positions of all cursors except pExcept open on the table
591** with root-page iRoot. Usually, this is called just before cursor
592** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
593*/
594static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
595 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000596 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +0000597 assert( pExcept==0 || pExcept->pBt==pBt );
drh980b1a72006-08-16 16:42:48 +0000598 for(p=pBt->pCursor; p; p=p->pNext){
599 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
600 p->eState==CURSOR_VALID ){
601 int rc = saveCursorPosition(p);
602 if( SQLITE_OK!=rc ){
603 return rc;
604 }
605 }
606 }
607 return SQLITE_OK;
608}
609
610/*
drhbf700f32007-03-31 02:36:44 +0000611** Clear the current cursor position.
612*/
danielk1977be51a652008-10-08 17:58:48 +0000613void sqlite3BtreeClearCursor(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000614 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000615 sqlite3_free(pCur->pKey);
drhbf700f32007-03-31 02:36:44 +0000616 pCur->pKey = 0;
617 pCur->eState = CURSOR_INVALID;
618}
619
620/*
danielk19773509a652009-07-06 18:56:13 +0000621** In this version of BtreeMoveto, pKey is a packed index record
622** such as is generated by the OP_MakeRecord opcode. Unpack the
623** record and then call BtreeMovetoUnpacked() to do the work.
624*/
625static int btreeMoveto(
626 BtCursor *pCur, /* Cursor open on the btree to be searched */
627 const void *pKey, /* Packed key if the btree is an index */
628 i64 nKey, /* Integer key for tables. Size of pKey for indices */
629 int bias, /* Bias search to the high end */
630 int *pRes /* Write search results here */
631){
632 int rc; /* Status code */
633 UnpackedRecord *pIdxKey; /* Unpacked index key */
634 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */
635
636 if( pKey ){
637 assert( nKey==(i64)(int)nKey );
638 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
639 aSpace, sizeof(aSpace));
640 if( pIdxKey==0 ) return SQLITE_NOMEM;
641 }else{
642 pIdxKey = 0;
643 }
644 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
645 if( pKey ){
646 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
647 }
648 return rc;
649}
650
651/*
drh980b1a72006-08-16 16:42:48 +0000652** Restore the cursor to the position it was in (or as close to as possible)
653** when saveCursorPosition() was called. Note that this call deletes the
654** saved position info stored by saveCursorPosition(), so there can be
drha3460582008-07-11 21:02:53 +0000655** at most one effective restoreCursorPosition() call after each
drh980b1a72006-08-16 16:42:48 +0000656** saveCursorPosition().
drh980b1a72006-08-16 16:42:48 +0000657*/
danielk197730548662009-07-09 05:07:37 +0000658static int btreeRestoreCursorPosition(BtCursor *pCur){
drhbf700f32007-03-31 02:36:44 +0000659 int rc;
drh1fee73e2007-08-29 04:00:57 +0000660 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +0000661 assert( pCur->eState>=CURSOR_REQUIRESEEK );
662 if( pCur->eState==CURSOR_FAULT ){
drh4c301aa2009-07-15 17:25:45 +0000663 return pCur->skipNext;
drhfb982642007-08-30 01:19:59 +0000664 }
drh980b1a72006-08-16 16:42:48 +0000665 pCur->eState = CURSOR_INVALID;
drh4c301aa2009-07-15 17:25:45 +0000666 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
drh980b1a72006-08-16 16:42:48 +0000667 if( rc==SQLITE_OK ){
drh17435752007-08-16 04:30:38 +0000668 sqlite3_free(pCur->pKey);
drh980b1a72006-08-16 16:42:48 +0000669 pCur->pKey = 0;
drhbf700f32007-03-31 02:36:44 +0000670 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
drh980b1a72006-08-16 16:42:48 +0000671 }
672 return rc;
673}
674
/*
** Restore the saved position of cursor p, but only when the cursor
** state indicates that it is needed (eState>=CURSOR_REQUIRESEEK).
** Evaluates to the result of btreeRestoreCursorPosition() in that case
** and to SQLITE_OK otherwise.
**
** The parameter is parenthesized so that any pointer-valued expression
** may be passed.  Note that p is evaluated more than once, so the
** argument must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
drh980b1a72006-08-16 16:42:48 +0000679
drha3460582008-07-11 21:02:53 +0000680/*
681** Determine whether or not a cursor has moved from the position it
drhdfe88ec2008-11-03 20:55:06 +0000682** was last placed at. Cursors can move when the row they are pointing
drha3460582008-07-11 21:02:53 +0000683** at is deleted out from under them.
684**
685** This routine returns an error code if something goes wrong. The
686** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
687*/
688int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
689 int rc;
690
691 rc = restoreCursorPosition(pCur);
692 if( rc ){
693 *pHasMoved = 1;
694 return rc;
695 }
drh4c301aa2009-07-15 17:25:45 +0000696 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
drha3460582008-07-11 21:02:53 +0000697 *pHasMoved = 1;
698 }else{
699 *pHasMoved = 0;
700 }
701 return SQLITE_OK;
702}
703
danielk1977599fcba2004-11-08 07:13:13 +0000704#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977afcdd022004-10-31 16:25:42 +0000705/*
drha3152892007-05-05 11:48:52 +0000706** Given a page number of a regular database page, return the page
707** number for the pointer-map page that contains the entry for the
708** input page number.
danielk1977afcdd022004-10-31 16:25:42 +0000709*/
danielk1977266664d2006-02-10 08:24:21 +0000710static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
danielk197789d40042008-11-17 14:20:56 +0000711 int nPagesPerMapPage;
712 Pgno iPtrMap, ret;
drh1fee73e2007-08-29 04:00:57 +0000713 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000714 nPagesPerMapPage = (pBt->usableSize/5)+1;
715 iPtrMap = (pgno-2)/nPagesPerMapPage;
716 ret = (iPtrMap*nPagesPerMapPage) + 2;
danielk1977266664d2006-02-10 08:24:21 +0000717 if( ret==PENDING_BYTE_PAGE(pBt) ){
718 ret++;
719 }
720 return ret;
721}
danielk1977a19df672004-11-03 11:37:07 +0000722
danielk1977afcdd022004-10-31 16:25:42 +0000723/*
danielk1977afcdd022004-10-31 16:25:42 +0000724** Write an entry into the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000725**
726** This routine updates the pointer map entry for page number 'key'
727** so that it maps to type 'eType' and parent page number 'pgno'.
drh98add2e2009-07-20 17:11:49 +0000728**
729** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
730** a no-op. If an error occurs, the appropriate error code is written
731** into *pRC.
danielk1977afcdd022004-10-31 16:25:42 +0000732*/
drh98add2e2009-07-20 17:11:49 +0000733static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
danielk19773b8a05f2007-03-19 17:44:26 +0000734 DbPage *pDbPage; /* The pointer map page */
735 u8 *pPtrmap; /* The pointer map data */
736 Pgno iPtrmap; /* The pointer map page number */
737 int offset; /* Offset in pointer map page */
drh98add2e2009-07-20 17:11:49 +0000738 int rc; /* Return code from subfunctions */
739
740 if( *pRC ) return;
danielk1977afcdd022004-10-31 16:25:42 +0000741
drh1fee73e2007-08-29 04:00:57 +0000742 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977266664d2006-02-10 08:24:21 +0000743 /* The master-journal page number must never be used as a pointer map page */
744 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
745
danielk1977ac11ee62005-01-15 12:45:51 +0000746 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +0000747 if( key==0 ){
drh98add2e2009-07-20 17:11:49 +0000748 *pRC = SQLITE_CORRUPT_BKPT;
749 return;
danielk1977fdb7cdb2005-01-17 02:12:18 +0000750 }
danielk1977266664d2006-02-10 08:24:21 +0000751 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000752 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977687566d2004-11-02 12:56:41 +0000753 if( rc!=SQLITE_OK ){
drh98add2e2009-07-20 17:11:49 +0000754 *pRC = rc;
755 return;
danielk1977afcdd022004-10-31 16:25:42 +0000756 }
danielk19778c666b12008-07-18 09:34:57 +0000757 offset = PTRMAP_PTROFFSET(iPtrmap, key);
drhacfc72b2009-06-05 18:44:15 +0000758 if( offset<0 ){
drh98add2e2009-07-20 17:11:49 +0000759 *pRC = SQLITE_CORRUPT_BKPT;
drh4925a552009-07-07 11:39:58 +0000760 goto ptrmap_exit;
drhacfc72b2009-06-05 18:44:15 +0000761 }
danielk19773b8a05f2007-03-19 17:44:26 +0000762 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000763
drh615ae552005-01-16 23:21:00 +0000764 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
765 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
drh98add2e2009-07-20 17:11:49 +0000766 *pRC= rc = sqlite3PagerWrite(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000767 if( rc==SQLITE_OK ){
768 pPtrmap[offset] = eType;
769 put4byte(&pPtrmap[offset+1], parent);
danielk1977afcdd022004-10-31 16:25:42 +0000770 }
danielk1977afcdd022004-10-31 16:25:42 +0000771 }
772
drh4925a552009-07-07 11:39:58 +0000773ptrmap_exit:
danielk19773b8a05f2007-03-19 17:44:26 +0000774 sqlite3PagerUnref(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000775}
776
777/*
778** Read an entry from the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000779**
780** This routine retrieves the pointer map entry for page 'key', writing
781** the type and parent page number to *pEType and *pPgno respectively.
782** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000783*/
danielk1977aef0bf62005-12-30 16:28:01 +0000784static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
danielk19773b8a05f2007-03-19 17:44:26 +0000785 DbPage *pDbPage; /* The pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000786 int iPtrmap; /* Pointer map page index */
787 u8 *pPtrmap; /* Pointer map page data */
788 int offset; /* Offset of entry in pointer map */
789 int rc;
790
drh1fee73e2007-08-29 04:00:57 +0000791 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000792
danielk1977266664d2006-02-10 08:24:21 +0000793 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000794 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000795 if( rc!=0 ){
796 return rc;
797 }
danielk19773b8a05f2007-03-19 17:44:26 +0000798 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000799
danielk19778c666b12008-07-18 09:34:57 +0000800 offset = PTRMAP_PTROFFSET(iPtrmap, key);
drh43617e92006-03-06 20:55:46 +0000801 assert( pEType!=0 );
802 *pEType = pPtrmap[offset];
danielk1977687566d2004-11-02 12:56:41 +0000803 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
danielk1977afcdd022004-10-31 16:25:42 +0000804
danielk19773b8a05f2007-03-19 17:44:26 +0000805 sqlite3PagerUnref(pDbPage);
drh49285702005-09-17 15:20:26 +0000806 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
danielk1977afcdd022004-10-31 16:25:42 +0000807 return SQLITE_OK;
808}
809
#else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Stand-ins used when the pointer-map routines are compiled out:
  ** ptrmapPut() and ptrmapPutOvflPtr() expand to nothing, and
  ** ptrmapGet() expands to the constant SQLITE_OK.  None of the
  ** macro arguments are evaluated. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
#endif
danielk1977afcdd022004-10-31 16:25:42 +0000815
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** The 2-byte entry number I of the cell pointer array, which begins at
** (P)->aData[(P)->cellOffset], is read, masked with (P)->maskPage, and
** used as an offset into (P)->aData.
**
** This routine works only for pages that do not contain overflow cells.
** P appears several times in the expansion, so arguments should be
** free of side effects.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
drh43605152004-05-29 21:46:49 +0000825
826/*
drh93a960a2008-07-10 00:32:42 +0000827** This a more complex version of findCell() that works for
drh0a45c272009-07-08 01:49:11 +0000828** pages that do contain overflow cells.
drh43605152004-05-29 21:46:49 +0000829*/
830static u8 *findOverflowCell(MemPage *pPage, int iCell){
831 int i;
drh1fee73e2007-08-29 04:00:57 +0000832 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +0000833 for(i=pPage->nOverflow-1; i>=0; i--){
drh6d08b4d2004-07-20 12:45:22 +0000834 int k;
835 struct _OvflCell *pOvfl;
836 pOvfl = &pPage->aOvfl[i];
837 k = pOvfl->idx;
838 if( k<=iCell ){
839 if( k==iCell ){
840 return pOvfl->pCell;
drh43605152004-05-29 21:46:49 +0000841 }
842 iCell--;
843 }
844 }
danielk19771cc5ed82007-05-16 17:28:43 +0000845 return findCell(pPage, iCell);
drh43605152004-05-29 21:46:49 +0000846}
847
848/*
849** Parse a cell content block and fill in the CellInfo structure. There
danielk197730548662009-07-09 05:07:37 +0000850** are two versions of this function. btreeParseCell() takes a
851** cell index as the second argument and btreeParseCellPtr()
drh16a9b832007-05-05 18:39:25 +0000852** takes a pointer to the body of the cell as its second argument.
danielk19771cc5ed82007-05-16 17:28:43 +0000853**
854** Within this file, the parseCell() macro can be called instead of
danielk197730548662009-07-09 05:07:37 +0000855** btreeParseCellPtr(). Using some compilers, this will be faster.
drh43605152004-05-29 21:46:49 +0000856*/
danielk197730548662009-07-09 05:07:37 +0000857static void btreeParseCellPtr(
drh3aac2dd2004-04-26 14:10:20 +0000858 MemPage *pPage, /* Page containing the cell */
drh43605152004-05-29 21:46:49 +0000859 u8 *pCell, /* Pointer to the cell text. */
drh6f11bef2004-05-13 01:12:56 +0000860 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000861){
drhf49661a2008-12-10 16:45:50 +0000862 u16 n; /* Number bytes in cell content header */
drh271efa52004-05-30 19:19:05 +0000863 u32 nPayload; /* Number of bytes of cell payload */
drh43605152004-05-29 21:46:49 +0000864
drh1fee73e2007-08-29 04:00:57 +0000865 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000866
drh43605152004-05-29 21:46:49 +0000867 pInfo->pCell = pCell;
drhab01f612004-05-22 02:55:23 +0000868 assert( pPage->leaf==0 || pPage->leaf==1 );
drh271efa52004-05-30 19:19:05 +0000869 n = pPage->childPtrSize;
870 assert( n==4-4*pPage->leaf );
drh504b6982006-01-22 21:52:56 +0000871 if( pPage->intKey ){
drh79df1f42008-07-18 00:57:33 +0000872 if( pPage->hasData ){
873 n += getVarint32(&pCell[n], nPayload);
874 }else{
875 nPayload = 0;
876 }
drh1bd10f82008-12-10 21:19:56 +0000877 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
drh79df1f42008-07-18 00:57:33 +0000878 pInfo->nData = nPayload;
drh504b6982006-01-22 21:52:56 +0000879 }else{
drh79df1f42008-07-18 00:57:33 +0000880 pInfo->nData = 0;
881 n += getVarint32(&pCell[n], nPayload);
882 pInfo->nKey = nPayload;
drh6f11bef2004-05-13 01:12:56 +0000883 }
drh72365832007-03-06 15:53:44 +0000884 pInfo->nPayload = nPayload;
drh504b6982006-01-22 21:52:56 +0000885 pInfo->nHeader = n;
drh0a45c272009-07-08 01:49:11 +0000886 testcase( nPayload==pPage->maxLocal );
887 testcase( nPayload==pPage->maxLocal+1 );
drh79df1f42008-07-18 00:57:33 +0000888 if( likely(nPayload<=pPage->maxLocal) ){
drh271efa52004-05-30 19:19:05 +0000889 /* This is the (easy) common case where the entire payload fits
890 ** on the local page. No overflow is required.
891 */
892 int nSize; /* Total size of cell content in bytes */
drh79df1f42008-07-18 00:57:33 +0000893 nSize = nPayload + n;
drhf49661a2008-12-10 16:45:50 +0000894 pInfo->nLocal = (u16)nPayload;
drh6f11bef2004-05-13 01:12:56 +0000895 pInfo->iOverflow = 0;
drh79df1f42008-07-18 00:57:33 +0000896 if( (nSize & ~3)==0 ){
drh271efa52004-05-30 19:19:05 +0000897 nSize = 4; /* Minimum cell size is 4 */
drh43605152004-05-29 21:46:49 +0000898 }
drh1bd10f82008-12-10 21:19:56 +0000899 pInfo->nSize = (u16)nSize;
drh6f11bef2004-05-13 01:12:56 +0000900 }else{
drh271efa52004-05-30 19:19:05 +0000901 /* If the payload will not fit completely on the local page, we have
902 ** to decide how much to store locally and how much to spill onto
903 ** overflow pages. The strategy is to minimize the amount of unused
904 ** space on overflow pages while keeping the amount of local storage
905 ** in between minLocal and maxLocal.
906 **
907 ** Warning: changing the way overflow payload is distributed in any
908 ** way will result in an incompatible file format.
909 */
910 int minLocal; /* Minimum amount of payload held locally */
911 int maxLocal; /* Maximum amount of payload held locally */
912 int surplus; /* Overflow payload available for local storage */
913
914 minLocal = pPage->minLocal;
915 maxLocal = pPage->maxLocal;
916 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
drh0a45c272009-07-08 01:49:11 +0000917 testcase( surplus==maxLocal );
918 testcase( surplus==maxLocal+1 );
drh6f11bef2004-05-13 01:12:56 +0000919 if( surplus <= maxLocal ){
drhf49661a2008-12-10 16:45:50 +0000920 pInfo->nLocal = (u16)surplus;
drh6f11bef2004-05-13 01:12:56 +0000921 }else{
drhf49661a2008-12-10 16:45:50 +0000922 pInfo->nLocal = (u16)minLocal;
drh6f11bef2004-05-13 01:12:56 +0000923 }
drhf49661a2008-12-10 16:45:50 +0000924 pInfo->iOverflow = (u16)(pInfo->nLocal + n);
drh6f11bef2004-05-13 01:12:56 +0000925 pInfo->nSize = pInfo->iOverflow + 4;
926 }
drh3aac2dd2004-04-26 14:10:20 +0000927}
danielk19771cc5ed82007-05-16 17:28:43 +0000928#define parseCell(pPage, iCell, pInfo) \
danielk197730548662009-07-09 05:07:37 +0000929 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
930static void btreeParseCell(
drh43605152004-05-29 21:46:49 +0000931 MemPage *pPage, /* Page containing the cell */
932 int iCell, /* The cell index. First cell is 0 */
933 CellInfo *pInfo /* Fill in this structure */
934){
danielk19771cc5ed82007-05-16 17:28:43 +0000935 parseCell(pPage, iCell, pInfo);
drh43605152004-05-29 21:46:49 +0000936}
drh3aac2dd2004-04-26 14:10:20 +0000937
938/*
drh43605152004-05-29 21:46:49 +0000939** Compute the total number of bytes that a Cell needs in the cell
940** data area of the btree-page. The return number includes the cell
941** data header and the local payload, but not any overflow page or
942** the space used by the cell pointer.
drh3b7511c2001-05-26 13:15:44 +0000943*/
danielk1977ae5558b2009-04-29 11:31:47 +0000944static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
945 u8 *pIter = &pCell[pPage->childPtrSize];
946 u32 nSize;
947
948#ifdef SQLITE_DEBUG
949 /* The value returned by this function should always be the same as
950 ** the (CellInfo.nSize) value found by doing a full parse of the
951 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
952 ** this function verifies that this invariant is not violated. */
953 CellInfo debuginfo;
danielk197730548662009-07-09 05:07:37 +0000954 btreeParseCellPtr(pPage, pCell, &debuginfo);
danielk1977ae5558b2009-04-29 11:31:47 +0000955#endif
956
957 if( pPage->intKey ){
958 u8 *pEnd;
959 if( pPage->hasData ){
960 pIter += getVarint32(pIter, nSize);
961 }else{
962 nSize = 0;
963 }
964
965 /* pIter now points at the 64-bit integer key value, a variable length
966 ** integer. The following block moves pIter to point at the first byte
967 ** past the end of the key value. */
968 pEnd = &pIter[9];
969 while( (*pIter++)&0x80 && pIter<pEnd );
970 }else{
971 pIter += getVarint32(pIter, nSize);
972 }
973
drh0a45c272009-07-08 01:49:11 +0000974 testcase( nSize==pPage->maxLocal );
975 testcase( nSize==pPage->maxLocal+1 );
danielk1977ae5558b2009-04-29 11:31:47 +0000976 if( nSize>pPage->maxLocal ){
977 int minLocal = pPage->minLocal;
978 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
drh0a45c272009-07-08 01:49:11 +0000979 testcase( nSize==pPage->maxLocal );
980 testcase( nSize==pPage->maxLocal+1 );
danielk1977ae5558b2009-04-29 11:31:47 +0000981 if( nSize>pPage->maxLocal ){
982 nSize = minLocal;
983 }
984 nSize += 4;
985 }
shane75ac1de2009-06-09 18:58:52 +0000986 nSize += (u32)(pIter - pCell);
danielk1977ae5558b2009-04-29 11:31:47 +0000987
988 /* The minimum size of any cell is 4 bytes. */
989 if( nSize<4 ){
990 nSize = 4;
991 }
992
993 assert( nSize==debuginfo.nSize );
shane60a4b532009-05-06 18:57:09 +0000994 return (u16)nSize;
danielk1977ae5558b2009-04-29 11:31:47 +0000995}
danielk1977bc6ada42004-06-30 08:20:16 +0000996#ifndef NDEBUG
drha9121e42008-02-19 14:59:35 +0000997static u16 cellSize(MemPage *pPage, int iCell){
danielk1977ae5558b2009-04-29 11:31:47 +0000998 return cellSizePtr(pPage, findCell(pPage, iCell));
drh43605152004-05-29 21:46:49 +0000999}
danielk1977bc6ada42004-06-30 08:20:16 +00001000#endif
drh3b7511c2001-05-26 13:15:44 +00001001
danielk197779a40da2005-01-16 08:00:01 +00001002#ifndef SQLITE_OMIT_AUTOVACUUM
drh3b7511c2001-05-26 13:15:44 +00001003/*
danielk197726836652005-01-17 01:33:13 +00001004** If the cell pCell, part of page pPage contains a pointer
danielk197779a40da2005-01-16 08:00:01 +00001005** to an overflow page, insert an entry into the pointer-map
1006** for the overflow page.
danielk1977ac11ee62005-01-15 12:45:51 +00001007*/
drh98add2e2009-07-20 17:11:49 +00001008static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
drhfa67c3c2008-07-11 02:21:40 +00001009 CellInfo info;
drh98add2e2009-07-20 17:11:49 +00001010 if( *pRC ) return;
drhfa67c3c2008-07-11 02:21:40 +00001011 assert( pCell!=0 );
danielk197730548662009-07-09 05:07:37 +00001012 btreeParseCellPtr(pPage, pCell, &info);
drhfa67c3c2008-07-11 02:21:40 +00001013 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
danielk19774dbaa892009-06-16 16:50:22 +00001014 if( info.iOverflow ){
drhfa67c3c2008-07-11 02:21:40 +00001015 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
drh98add2e2009-07-20 17:11:49 +00001016 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
danielk1977ac11ee62005-01-15 12:45:51 +00001017 }
danielk1977ac11ee62005-01-15 12:45:51 +00001018}
danielk197779a40da2005-01-16 08:00:01 +00001019#endif
1020
danielk1977ac11ee62005-01-15 12:45:51 +00001021
drhda200cc2004-05-09 11:51:38 +00001022/*
drh72f82862001-05-24 21:06:34 +00001023** Defragment the page given. All Cells are moved to the
drh3a4a2d42005-11-24 14:24:28 +00001024** end of the page and all free space is collected into one
1025** big FreeBlk that occurs in between the header and cell
drh31beae92005-11-24 14:34:36 +00001026** pointer array and the cell content area.
drh365d68f2001-05-11 11:02:46 +00001027*/
shane0af3f892008-11-12 04:55:34 +00001028static int defragmentPage(MemPage *pPage){
drh43605152004-05-29 21:46:49 +00001029 int i; /* Loop counter */
1030 int pc; /* Address of a i-th cell */
drh43605152004-05-29 21:46:49 +00001031 int hdr; /* Offset to the page header */
1032 int size; /* Size of a cell */
1033 int usableSize; /* Number of usable bytes on a page */
1034 int cellOffset; /* Offset to the cell pointer array */
drh281b21d2008-08-22 12:57:08 +00001035 int cbrk; /* Offset to the cell content area */
drh43605152004-05-29 21:46:49 +00001036 int nCell; /* Number of cells on the page */
drh2e38c322004-09-03 18:38:44 +00001037 unsigned char *data; /* The page data */
1038 unsigned char *temp; /* Temp area for cell content */
drh17146622009-07-07 17:38:38 +00001039 int iCellFirst; /* First allowable cell index */
1040 int iCellLast; /* Last possible cell index */
1041
drh2af926b2001-05-15 00:39:25 +00001042
danielk19773b8a05f2007-03-19 17:44:26 +00001043 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +00001044 assert( pPage->pBt!=0 );
drh90f5ecb2004-07-22 01:19:35 +00001045 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
drh43605152004-05-29 21:46:49 +00001046 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +00001047 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh26b79942007-11-28 16:19:56 +00001048 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
drh43605152004-05-29 21:46:49 +00001049 data = pPage->aData;
drh9e572e62004-04-23 23:43:10 +00001050 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00001051 cellOffset = pPage->cellOffset;
1052 nCell = pPage->nCell;
1053 assert( nCell==get2byte(&data[hdr+3]) );
1054 usableSize = pPage->pBt->usableSize;
drh281b21d2008-08-22 12:57:08 +00001055 cbrk = get2byte(&data[hdr+5]);
1056 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1057 cbrk = usableSize;
drh17146622009-07-07 17:38:38 +00001058 iCellFirst = cellOffset + 2*nCell;
1059 iCellLast = usableSize - 4;
drh43605152004-05-29 21:46:49 +00001060 for(i=0; i<nCell; i++){
1061 u8 *pAddr; /* The i-th cell pointer */
1062 pAddr = &data[cellOffset + i*2];
1063 pc = get2byte(pAddr);
drh0a45c272009-07-08 01:49:11 +00001064 testcase( pc==iCellFirst );
1065 testcase( pc==iCellLast );
drh17146622009-07-07 17:38:38 +00001066#if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
danielk197730548662009-07-09 05:07:37 +00001067 /* These conditions have already been verified in btreeInitPage()
drh17146622009-07-07 17:38:38 +00001068 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1069 */
1070 if( pc<iCellFirst || pc>iCellLast ){
shane0af3f892008-11-12 04:55:34 +00001071 return SQLITE_CORRUPT_BKPT;
1072 }
drh17146622009-07-07 17:38:38 +00001073#endif
1074 assert( pc>=iCellFirst && pc<=iCellLast );
drh43605152004-05-29 21:46:49 +00001075 size = cellSizePtr(pPage, &temp[pc]);
drh281b21d2008-08-22 12:57:08 +00001076 cbrk -= size;
drh17146622009-07-07 17:38:38 +00001077#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1078 if( cbrk<iCellFirst ){
shane0af3f892008-11-12 04:55:34 +00001079 return SQLITE_CORRUPT_BKPT;
1080 }
drh17146622009-07-07 17:38:38 +00001081#else
1082 if( cbrk<iCellFirst || pc+size>usableSize ){
1083 return SQLITE_CORRUPT_BKPT;
1084 }
1085#endif
drh7157e1d2009-07-09 13:25:32 +00001086 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
drh0a45c272009-07-08 01:49:11 +00001087 testcase( cbrk+size==usableSize );
drh0a45c272009-07-08 01:49:11 +00001088 testcase( pc+size==usableSize );
drh281b21d2008-08-22 12:57:08 +00001089 memcpy(&data[cbrk], &temp[pc], size);
1090 put2byte(pAddr, cbrk);
drh2af926b2001-05-15 00:39:25 +00001091 }
drh17146622009-07-07 17:38:38 +00001092 assert( cbrk>=iCellFirst );
drh281b21d2008-08-22 12:57:08 +00001093 put2byte(&data[hdr+5], cbrk);
drh43605152004-05-29 21:46:49 +00001094 data[hdr+1] = 0;
1095 data[hdr+2] = 0;
1096 data[hdr+7] = 0;
drh17146622009-07-07 17:38:38 +00001097 memset(&data[iCellFirst], 0, cbrk-iCellFirst);
drhc5053fb2008-11-27 02:22:10 +00001098 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh17146622009-07-07 17:38:38 +00001099 if( cbrk-iCellFirst!=pPage->nFree ){
danielk1977360e6342008-11-12 08:49:51 +00001100 return SQLITE_CORRUPT_BKPT;
1101 }
shane0af3f892008-11-12 04:55:34 +00001102 return SQLITE_OK;
drh365d68f2001-05-11 11:02:46 +00001103}
1104
drha059ad02001-04-17 20:09:11 +00001105/*
danielk19776011a752009-04-01 16:25:32 +00001106** Allocate nByte bytes of space from within the B-Tree page passed
drh0a45c272009-07-08 01:49:11 +00001107** as the first argument. Write into *pIdx the index into pPage->aData[]
1108** of the first byte of allocated space. Return either SQLITE_OK or
1109** an error code (usually SQLITE_CORRUPT).
drhbd03cae2001-06-02 02:40:57 +00001110**
drh0a45c272009-07-08 01:49:11 +00001111** The caller guarantees that there is sufficient space to make the
1112** allocation. This routine might need to defragment in order to bring
1113** all the space together, however. This routine will avoid using
1114** the first two bytes past the cell pointer area since presumably this
1115** allocation is being made in order to insert a new cell, so we will
1116** also end up needing a new cell pointer.
drh7e3b0a02001-04-28 16:52:40 +00001117*/
drh0a45c272009-07-08 01:49:11 +00001118static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
danielk19776011a752009-04-01 16:25:32 +00001119 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
1120 u8 * const data = pPage->aData; /* Local cache of pPage->aData */
1121 int nFrag; /* Number of fragmented bytes on pPage */
drh0a45c272009-07-08 01:49:11 +00001122 int top; /* First byte of cell content area */
1123 int gap; /* First byte of gap between cell pointers and cell content */
1124 int rc; /* Integer return code */
drh43605152004-05-29 21:46:49 +00001125
danielk19773b8a05f2007-03-19 17:44:26 +00001126 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +00001127 assert( pPage->pBt );
drh1fee73e2007-08-29 04:00:57 +00001128 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa67c3c2008-07-11 02:21:40 +00001129 assert( nByte>=0 ); /* Minimum cell size is 4 */
1130 assert( pPage->nFree>=nByte );
1131 assert( pPage->nOverflow==0 );
drhc314dc72009-07-21 11:52:34 +00001132 assert( nByte<pPage->pBt->usableSize-8 );
drh43605152004-05-29 21:46:49 +00001133
1134 nFrag = data[hdr+7];
drh0a45c272009-07-08 01:49:11 +00001135 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1136 gap = pPage->cellOffset + 2*pPage->nCell;
1137 top = get2byte(&data[hdr+5]);
drh7157e1d2009-07-09 13:25:32 +00001138 if( gap>top ) return SQLITE_CORRUPT_BKPT;
drh0a45c272009-07-08 01:49:11 +00001139 testcase( gap+2==top );
1140 testcase( gap+1==top );
1141 testcase( gap==top );
1142
danielk19776011a752009-04-01 16:25:32 +00001143 if( nFrag>=60 ){
drh0a45c272009-07-08 01:49:11 +00001144 /* Always defragment highly fragmented pages */
1145 rc = defragmentPage(pPage);
1146 if( rc ) return rc;
1147 top = get2byte(&data[hdr+5]);
1148 }else if( gap+2<=top ){
danielk19776011a752009-04-01 16:25:32 +00001149 /* Search the freelist looking for a free slot big enough to satisfy
1150 ** the request. The allocation is made from the first free slot in
1151 ** the list that is large enough to accomadate it.
1152 */
1153 int pc, addr;
1154 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1155 int size = get2byte(&data[pc+2]); /* Size of free slot */
drh43605152004-05-29 21:46:49 +00001156 if( size>=nByte ){
drhf49661a2008-12-10 16:45:50 +00001157 int x = size - nByte;
drh0a45c272009-07-08 01:49:11 +00001158 testcase( x==4 );
1159 testcase( x==3 );
danielk19776011a752009-04-01 16:25:32 +00001160 if( x<4 ){
danielk1977fad91942009-04-29 17:49:59 +00001161 /* Remove the slot from the free-list. Update the number of
1162 ** fragmented bytes within the page. */
drh43605152004-05-29 21:46:49 +00001163 memcpy(&data[addr], &data[pc], 2);
drhf49661a2008-12-10 16:45:50 +00001164 data[hdr+7] = (u8)(nFrag + x);
drh43605152004-05-29 21:46:49 +00001165 }else{
danielk1977fad91942009-04-29 17:49:59 +00001166 /* The slot remains on the free-list. Reduce its size to account
1167 ** for the portion used by the new allocation. */
drhf49661a2008-12-10 16:45:50 +00001168 put2byte(&data[pc+2], x);
drh43605152004-05-29 21:46:49 +00001169 }
drh0a45c272009-07-08 01:49:11 +00001170 *pIdx = pc + x;
1171 return SQLITE_OK;
drh43605152004-05-29 21:46:49 +00001172 }
drh9e572e62004-04-23 23:43:10 +00001173 }
1174 }
drh43605152004-05-29 21:46:49 +00001175
drh0a45c272009-07-08 01:49:11 +00001176 /* Check to make sure there is enough space in the gap to satisfy
1177 ** the allocation. If not, defragment.
1178 */
1179 testcase( gap+2+nByte==top );
1180 if( gap+2+nByte>top ){
1181 rc = defragmentPage(pPage);
1182 if( rc ) return rc;
1183 top = get2byte(&data[hdr+5]);
1184 assert( gap+nByte<=top );
1185 }
1186
1187
drh43605152004-05-29 21:46:49 +00001188 /* Allocate memory from the gap in between the cell pointer array
drhc314dc72009-07-21 11:52:34 +00001189 ** and the cell content area. The btreeInitPage() call has already
1190 ** validated the freelist. Given that the freelist is valid, there
1191 ** is no way that the allocation can extend off the end of the page.
1192 ** The assert() below verifies the previous sentence.
drh43605152004-05-29 21:46:49 +00001193 */
drh0a45c272009-07-08 01:49:11 +00001194 top -= nByte;
drh43605152004-05-29 21:46:49 +00001195 put2byte(&data[hdr+5], top);
drhc314dc72009-07-21 11:52:34 +00001196 assert( top+nByte <= pPage->pBt->usableSize );
drh0a45c272009-07-08 01:49:11 +00001197 *pIdx = top;
1198 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001199}
1200
1201/*
drh9e572e62004-04-23 23:43:10 +00001202** Return a section of the pPage->aData to the freelist.
1203** The first byte of the new free block is pPage->aDisk[start]
1204** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +00001205**
1206** Most of the effort here is involved in coalesing adjacent
1207** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +00001208*/
shanedcc50b72008-11-13 18:29:50 +00001209static int freeSpace(MemPage *pPage, int start, int size){
drh43605152004-05-29 21:46:49 +00001210 int addr, pbegin, hdr;
drh0a45c272009-07-08 01:49:11 +00001211 int iLast; /* Largest possible freeblock offset */
drh9e572e62004-04-23 23:43:10 +00001212 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +00001213
drh9e572e62004-04-23 23:43:10 +00001214 assert( pPage->pBt!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00001215 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drhc046e3e2009-07-15 11:26:44 +00001216 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
danielk1977bc6ada42004-06-30 08:20:16 +00001217 assert( (start + size)<=pPage->pBt->usableSize );
drh1fee73e2007-08-29 04:00:57 +00001218 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh34004ce2008-07-11 16:15:17 +00001219 assert( size>=0 ); /* Minimum cell size is 4 */
drh9e572e62004-04-23 23:43:10 +00001220
drhfcce93f2006-02-22 03:08:32 +00001221#ifdef SQLITE_SECURE_DELETE
1222 /* Overwrite deleted information with zeros when the SECURE_DELETE
1223 ** option is enabled at compile-time */
1224 memset(&data[start], 0, size);
1225#endif
1226
drh0a45c272009-07-08 01:49:11 +00001227 /* Add the space back into the linked list of freeblocks. Note that
danielk197730548662009-07-09 05:07:37 +00001228 ** even though the freeblock list was checked by btreeInitPage(),
1229 ** btreeInitPage() did not detect overlapping cells or
drhb908d762009-07-08 16:54:40 +00001230 ** freeblocks that overlapped cells. Nor does it detect when the
1231 ** cell content area exceeds the value in the page header. If these
1232 ** situations arise, then subsequent insert operations might corrupt
1233 ** the freelist. So we do need to check for corruption while scanning
1234 ** the freelist.
drh0a45c272009-07-08 01:49:11 +00001235 */
drh43605152004-05-29 21:46:49 +00001236 hdr = pPage->hdrOffset;
1237 addr = hdr + 1;
drh0a45c272009-07-08 01:49:11 +00001238 iLast = pPage->pBt->usableSize - 4;
drh35a25da2009-07-08 15:14:50 +00001239 assert( start<=iLast );
drh3aac2dd2004-04-26 14:10:20 +00001240 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drh35a25da2009-07-08 15:14:50 +00001241 if( pbegin<addr+4 ){
shanedcc50b72008-11-13 18:29:50 +00001242 return SQLITE_CORRUPT_BKPT;
1243 }
drh3aac2dd2004-04-26 14:10:20 +00001244 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +00001245 }
drh0a45c272009-07-08 01:49:11 +00001246 if( pbegin>iLast ){
shanedcc50b72008-11-13 18:29:50 +00001247 return SQLITE_CORRUPT_BKPT;
1248 }
drh3aac2dd2004-04-26 14:10:20 +00001249 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +00001250 put2byte(&data[addr], start);
1251 put2byte(&data[start], pbegin);
1252 put2byte(&data[start+2], size);
shane36840fd2009-06-26 16:32:13 +00001253 pPage->nFree = pPage->nFree + (u16)size;
drh9e572e62004-04-23 23:43:10 +00001254
1255 /* Coalesce adjacent free blocks */
drh0a45c272009-07-08 01:49:11 +00001256 addr = hdr + 1;
drh3aac2dd2004-04-26 14:10:20 +00001257 while( (pbegin = get2byte(&data[addr]))>0 ){
drhf49661a2008-12-10 16:45:50 +00001258 int pnext, psize, x;
drh3aac2dd2004-04-26 14:10:20 +00001259 assert( pbegin>addr );
drh43605152004-05-29 21:46:49 +00001260 assert( pbegin<=pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +00001261 pnext = get2byte(&data[pbegin]);
1262 psize = get2byte(&data[pbegin+2]);
1263 if( pbegin + psize + 3 >= pnext && pnext>0 ){
1264 int frag = pnext - (pbegin+psize);
drh0a45c272009-07-08 01:49:11 +00001265 if( (frag<0) || (frag>(int)data[hdr+7]) ){
shanedcc50b72008-11-13 18:29:50 +00001266 return SQLITE_CORRUPT_BKPT;
1267 }
drh0a45c272009-07-08 01:49:11 +00001268 data[hdr+7] -= (u8)frag;
drhf49661a2008-12-10 16:45:50 +00001269 x = get2byte(&data[pnext]);
1270 put2byte(&data[pbegin], x);
1271 x = pnext + get2byte(&data[pnext+2]) - pbegin;
1272 put2byte(&data[pbegin+2], x);
drh9e572e62004-04-23 23:43:10 +00001273 }else{
drh3aac2dd2004-04-26 14:10:20 +00001274 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +00001275 }
1276 }
drh7e3b0a02001-04-28 16:52:40 +00001277
drh43605152004-05-29 21:46:49 +00001278 /* If the cell content area begins with a freeblock, remove it. */
1279 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1280 int top;
1281 pbegin = get2byte(&data[hdr+1]);
1282 memcpy(&data[hdr+1], &data[pbegin], 2);
drhf49661a2008-12-10 16:45:50 +00001283 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1284 put2byte(&data[hdr+5], top);
drh4b70f112004-05-02 21:12:19 +00001285 }
drhc5053fb2008-11-27 02:22:10 +00001286 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
shanedcc50b72008-11-13 18:29:50 +00001287 return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +00001288}
1289
1290/*
drh271efa52004-05-30 19:19:05 +00001291** Decode the flags byte (the first byte of the header) for a page
1292** and initialize fields of the MemPage structure accordingly.
drh44845222008-07-17 18:39:57 +00001293**
1294** Only the following combinations are supported. Anything different
1295** indicates a corrupt database files:
1296**
1297** PTF_ZERODATA
1298** PTF_ZERODATA | PTF_LEAF
1299** PTF_LEAFDATA | PTF_INTKEY
1300** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
drh271efa52004-05-30 19:19:05 +00001301*/
drh44845222008-07-17 18:39:57 +00001302static int decodeFlags(MemPage *pPage, int flagByte){
danielk1977aef0bf62005-12-30 16:28:01 +00001303 BtShared *pBt; /* A copy of pPage->pBt */
drh271efa52004-05-30 19:19:05 +00001304
1305 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
drh1fee73e2007-08-29 04:00:57 +00001306 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf49661a2008-12-10 16:45:50 +00001307 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );
drh44845222008-07-17 18:39:57 +00001308 flagByte &= ~PTF_LEAF;
1309 pPage->childPtrSize = 4-4*pPage->leaf;
drh271efa52004-05-30 19:19:05 +00001310 pBt = pPage->pBt;
drh44845222008-07-17 18:39:57 +00001311 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1312 pPage->intKey = 1;
1313 pPage->hasData = pPage->leaf;
drh271efa52004-05-30 19:19:05 +00001314 pPage->maxLocal = pBt->maxLeaf;
1315 pPage->minLocal = pBt->minLeaf;
drh44845222008-07-17 18:39:57 +00001316 }else if( flagByte==PTF_ZERODATA ){
1317 pPage->intKey = 0;
1318 pPage->hasData = 0;
drh271efa52004-05-30 19:19:05 +00001319 pPage->maxLocal = pBt->maxLocal;
1320 pPage->minLocal = pBt->minLocal;
drh44845222008-07-17 18:39:57 +00001321 }else{
1322 return SQLITE_CORRUPT_BKPT;
drh271efa52004-05-30 19:19:05 +00001323 }
drh44845222008-07-17 18:39:57 +00001324 return SQLITE_OK;
drh271efa52004-05-30 19:19:05 +00001325}
1326
1327/*
drh7e3b0a02001-04-28 16:52:40 +00001328** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +00001329**
1330** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +00001331** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +00001332** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1333** guarantee that the page is well-formed. It only shows that
1334** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +00001335*/
danielk197730548662009-07-09 05:07:37 +00001336static int btreeInitPage(MemPage *pPage){
drh2af926b2001-05-15 00:39:25 +00001337
danielk197771d5d2c2008-09-29 11:49:47 +00001338 assert( pPage->pBt!=0 );
1339 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +00001340 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
drhbf4bca52007-09-06 22:19:14 +00001341 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1342 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +00001343
1344 if( !pPage->isInit ){
drhf49661a2008-12-10 16:45:50 +00001345 u16 pc; /* Address of a freeblock within pPage->aData[] */
1346 u8 hdr; /* Offset to beginning of page header */
danielk197771d5d2c2008-09-29 11:49:47 +00001347 u8 *data; /* Equal to pPage->aData */
1348 BtShared *pBt; /* The main btree structure */
drhf49661a2008-12-10 16:45:50 +00001349 u16 usableSize; /* Amount of usable space on each page */
1350 u16 cellOffset; /* Offset from start of page to first cell pointer */
1351 u16 nFree; /* Number of unused bytes on the page */
1352 u16 top; /* First byte of the cell content area */
drh0a45c272009-07-08 01:49:11 +00001353 int iCellFirst; /* First allowable cell or freeblock offset */
1354 int iCellLast; /* Last possible cell or freeblock offset */
danielk197771d5d2c2008-09-29 11:49:47 +00001355
1356 pBt = pPage->pBt;
1357
danielk1977eaa06f62008-09-18 17:34:44 +00001358 hdr = pPage->hdrOffset;
1359 data = pPage->aData;
1360 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1361 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1362 pPage->maskPage = pBt->pageSize - 1;
1363 pPage->nOverflow = 0;
danielk1977eaa06f62008-09-18 17:34:44 +00001364 usableSize = pBt->usableSize;
1365 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1366 top = get2byte(&data[hdr+5]);
1367 pPage->nCell = get2byte(&data[hdr+3]);
1368 if( pPage->nCell>MX_CELL(pBt) ){
1369 /* To many cells for a single page. The page must be corrupt */
1370 return SQLITE_CORRUPT_BKPT;
1371 }
drhb908d762009-07-08 16:54:40 +00001372 testcase( pPage->nCell==MX_CELL(pBt) );
drh69e931e2009-06-03 21:04:35 +00001373
shane5eff7cf2009-08-10 03:57:58 +00001374 /* A malformed database page might cause us to read past the end
drh69e931e2009-06-03 21:04:35 +00001375 ** of page when parsing a cell.
1376 **
1377 ** The following block of code checks early to see if a cell extends
1378 ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1379 ** returned if it does.
1380 */
drh0a45c272009-07-08 01:49:11 +00001381 iCellFirst = cellOffset + 2*pPage->nCell;
1382 iCellLast = usableSize - 4;
drh3b2a3fa2009-06-09 13:42:24 +00001383#if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
drh69e931e2009-06-03 21:04:35 +00001384 {
drh69e931e2009-06-03 21:04:35 +00001385 int i; /* Index into the cell pointer array */
1386 int sz; /* Size of a cell */
1387
drh69e931e2009-06-03 21:04:35 +00001388 if( !pPage->leaf ) iCellLast--;
1389 for(i=0; i<pPage->nCell; i++){
1390 pc = get2byte(&data[cellOffset+i*2]);
drh0a45c272009-07-08 01:49:11 +00001391 testcase( pc==iCellFirst );
1392 testcase( pc==iCellLast );
drh69e931e2009-06-03 21:04:35 +00001393 if( pc<iCellFirst || pc>iCellLast ){
1394 return SQLITE_CORRUPT_BKPT;
1395 }
1396 sz = cellSizePtr(pPage, &data[pc]);
drh0a45c272009-07-08 01:49:11 +00001397 testcase( pc+sz==usableSize );
drh69e931e2009-06-03 21:04:35 +00001398 if( pc+sz>usableSize ){
1399 return SQLITE_CORRUPT_BKPT;
1400 }
1401 }
drh0a45c272009-07-08 01:49:11 +00001402 if( !pPage->leaf ) iCellLast++;
drh69e931e2009-06-03 21:04:35 +00001403 }
1404#endif
1405
danielk1977eaa06f62008-09-18 17:34:44 +00001406 /* Compute the total free space on the page */
1407 pc = get2byte(&data[hdr+1]);
danielk197793c829c2009-06-03 17:26:17 +00001408 nFree = data[hdr+7] + top;
danielk1977eaa06f62008-09-18 17:34:44 +00001409 while( pc>0 ){
drh1bd10f82008-12-10 21:19:56 +00001410 u16 next, size;
drh0a45c272009-07-08 01:49:11 +00001411 if( pc<iCellFirst || pc>iCellLast ){
dan4361e792009-08-14 17:01:22 +00001412 /* Start of free block is off the page */
danielk1977eaa06f62008-09-18 17:34:44 +00001413 return SQLITE_CORRUPT_BKPT;
1414 }
1415 next = get2byte(&data[pc]);
1416 size = get2byte(&data[pc+2]);
dan4361e792009-08-14 17:01:22 +00001417 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1418 /* Free blocks must be in ascending order. And the last byte of
1419 ** the free-block must lie on the database page. */
danielk1977eaa06f62008-09-18 17:34:44 +00001420 return SQLITE_CORRUPT_BKPT;
1421 }
shane85095702009-06-15 16:27:08 +00001422 nFree = nFree + size;
danielk1977eaa06f62008-09-18 17:34:44 +00001423 pc = next;
1424 }
danielk197793c829c2009-06-03 17:26:17 +00001425
1426 /* At this point, nFree contains the sum of the offset to the start
1427 ** of the cell-content area plus the number of free bytes within
1428 ** the cell-content area. If this is greater than the usable-size
1429 ** of the page, then the page must be corrupted. This check also
1430 ** serves to verify that the offset to the start of the cell-content
1431 ** area, according to the page header, lies within the page.
1432 */
1433 if( nFree>usableSize ){
drh49285702005-09-17 15:20:26 +00001434 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001435 }
shane5eff7cf2009-08-10 03:57:58 +00001436 pPage->nFree = (u16)(nFree - iCellFirst);
danielk197771d5d2c2008-09-29 11:49:47 +00001437 pPage->isInit = 1;
1438 }
drh9e572e62004-04-23 23:43:10 +00001439 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001440}
1441
1442/*
drh8b2f49b2001-06-08 00:21:52 +00001443** Set up a raw page so that it looks like a database page holding
1444** no entries.
drhbd03cae2001-06-02 02:40:57 +00001445*/
drh9e572e62004-04-23 23:43:10 +00001446static void zeroPage(MemPage *pPage, int flags){
1447 unsigned char *data = pPage->aData;
danielk1977aef0bf62005-12-30 16:28:01 +00001448 BtShared *pBt = pPage->pBt;
drhf49661a2008-12-10 16:45:50 +00001449 u8 hdr = pPage->hdrOffset;
1450 u16 first;
drh9e572e62004-04-23 23:43:10 +00001451
danielk19773b8a05f2007-03-19 17:44:26 +00001452 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
drhbf4bca52007-09-06 22:19:14 +00001453 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1454 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
danielk19773b8a05f2007-03-19 17:44:26 +00001455 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00001456 assert( sqlite3_mutex_held(pBt->mutex) );
drh1af4a6e2008-07-18 03:32:51 +00001457 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
drh1bd10f82008-12-10 21:19:56 +00001458 data[hdr] = (char)flags;
1459 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
drh43605152004-05-29 21:46:49 +00001460 memset(&data[hdr+1], 0, 4);
1461 data[hdr+7] = 0;
1462 put2byte(&data[hdr+5], pBt->usableSize);
drhb6f41482004-05-14 01:58:11 +00001463 pPage->nFree = pBt->usableSize - first;
drh271efa52004-05-30 19:19:05 +00001464 decodeFlags(pPage, flags);
drh9e572e62004-04-23 23:43:10 +00001465 pPage->hdrOffset = hdr;
drh43605152004-05-29 21:46:49 +00001466 pPage->cellOffset = first;
1467 pPage->nOverflow = 0;
drh1688c862008-07-18 02:44:17 +00001468 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1469 pPage->maskPage = pBt->pageSize - 1;
drh43605152004-05-29 21:46:49 +00001470 pPage->nCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001471 pPage->isInit = 1;
drhbd03cae2001-06-02 02:40:57 +00001472}
1473
drh897a8202008-09-18 01:08:15 +00001474
1475/*
1476** Convert a DbPage obtained from the pager into a MemPage used by
1477** the btree layer.
1478*/
1479static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1480 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1481 pPage->aData = sqlite3PagerGetData(pDbPage);
1482 pPage->pDbPage = pDbPage;
1483 pPage->pBt = pBt;
1484 pPage->pgno = pgno;
1485 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1486 return pPage;
1487}
1488
drhbd03cae2001-06-02 02:40:57 +00001489/*
drh3aac2dd2004-04-26 14:10:20 +00001490** Get a page from the pager. Initialize the MemPage.pBt and
1491** MemPage.aData elements if needed.
drh538f5702007-04-13 02:14:30 +00001492**
1493** If the noContent flag is set, it means that we do not care about
1494** the content of the page at this time. So do not go to the disk
1495** to fetch the content. Just fill in the content with zeros for now.
1496** If in the future we call sqlite3PagerWrite() on this page, that
1497** means we have started to be concerned about content and the disk
1498** read should occur at that point.
drh3aac2dd2004-04-26 14:10:20 +00001499*/
danielk197730548662009-07-09 05:07:37 +00001500static int btreeGetPage(
drh16a9b832007-05-05 18:39:25 +00001501 BtShared *pBt, /* The btree */
1502 Pgno pgno, /* Number of the page to fetch */
1503 MemPage **ppPage, /* Return the page in this parameter */
1504 int noContent /* Do not load page content if true */
1505){
drh3aac2dd2004-04-26 14:10:20 +00001506 int rc;
danielk19773b8a05f2007-03-19 17:44:26 +00001507 DbPage *pDbPage;
1508
drh1fee73e2007-08-29 04:00:57 +00001509 assert( sqlite3_mutex_held(pBt->mutex) );
drh538f5702007-04-13 02:14:30 +00001510 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
drh3aac2dd2004-04-26 14:10:20 +00001511 if( rc ) return rc;
drh897a8202008-09-18 01:08:15 +00001512 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
drh3aac2dd2004-04-26 14:10:20 +00001513 return SQLITE_OK;
1514}
1515
1516/*
danielk1977bea2a942009-01-20 17:06:27 +00001517** Retrieve a page from the pager cache. If the requested page is not
1518** already in the pager cache return NULL. Initialize the MemPage.pBt and
1519** MemPage.aData elements if needed.
1520*/
1521static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1522 DbPage *pDbPage;
1523 assert( sqlite3_mutex_held(pBt->mutex) );
1524 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1525 if( pDbPage ){
1526 return btreePageFromDbPage(pDbPage, pgno, pBt);
1527 }
1528 return 0;
1529}
1530
1531/*
danielk197789d40042008-11-17 14:20:56 +00001532** Return the size of the database file in pages. If there is any kind of
1533** error, return ((unsigned int)-1).
danielk197767fd7a92008-09-10 17:53:35 +00001534*/
danielk197789d40042008-11-17 14:20:56 +00001535static Pgno pagerPagecount(BtShared *pBt){
1536 int nPage = -1;
danielk197767fd7a92008-09-10 17:53:35 +00001537 int rc;
danielk197789d40042008-11-17 14:20:56 +00001538 assert( pBt->pPage1 );
1539 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1540 assert( rc==SQLITE_OK || nPage==-1 );
1541 return (Pgno)nPage;
danielk197767fd7a92008-09-10 17:53:35 +00001542}
1543
1544/*
danielk197789bc4bc2009-07-21 19:25:24 +00001545** Get a page from the pager and initialize it. This routine is just a
1546** convenience wrapper around separate calls to btreeGetPage() and
1547** btreeInitPage().
1548**
1549** If an error occurs, then the value *ppPage is set to is undefined. It
1550** may remain unchanged, or it may be set to an invalid value.
drhde647132004-05-07 17:57:49 +00001551*/
1552static int getAndInitPage(
danielk1977aef0bf62005-12-30 16:28:01 +00001553 BtShared *pBt, /* The database file */
drhde647132004-05-07 17:57:49 +00001554 Pgno pgno, /* Number of the page to get */
danielk197771d5d2c2008-09-29 11:49:47 +00001555 MemPage **ppPage /* Write the page pointer here */
drhde647132004-05-07 17:57:49 +00001556){
1557 int rc;
danielk197789bc4bc2009-07-21 19:25:24 +00001558 TESTONLY( Pgno iLastPg = pagerPagecount(pBt); )
drh1fee73e2007-08-29 04:00:57 +00001559 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197789bc4bc2009-07-21 19:25:24 +00001560
1561 rc = btreeGetPage(pBt, pgno, ppPage, 0);
1562 if( rc==SQLITE_OK ){
1563 rc = btreeInitPage(*ppPage);
1564 if( rc!=SQLITE_OK ){
1565 releasePage(*ppPage);
1566 }
drhee696e22004-08-30 16:52:17 +00001567 }
danielk19779f580ad2008-09-10 14:45:57 +00001568
danielk197789bc4bc2009-07-21 19:25:24 +00001569 /* If the requested page number was either 0 or greater than the page
1570 ** number of the last page in the database, this function should return
1571 ** SQLITE_CORRUPT or some other error (i.e. SQLITE_FULL). Check that this
1572 ** is the case. */
1573 assert( (pgno>0 && pgno<=iLastPg) || rc!=SQLITE_OK );
1574 testcase( pgno==0 );
1575 testcase( pgno==iLastPg );
1576
drhde647132004-05-07 17:57:49 +00001577 return rc;
1578}
1579
1580/*
drh3aac2dd2004-04-26 14:10:20 +00001581** Release a MemPage. This should be called once for each prior
danielk197730548662009-07-09 05:07:37 +00001582** call to btreeGetPage.
drh3aac2dd2004-04-26 14:10:20 +00001583*/
drh4b70f112004-05-02 21:12:19 +00001584static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +00001585 if( pPage ){
drh30df0092008-12-23 15:58:06 +00001586 assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 );
drh3aac2dd2004-04-26 14:10:20 +00001587 assert( pPage->aData );
1588 assert( pPage->pBt );
drhbf4bca52007-09-06 22:19:14 +00001589 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1590 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
drh1fee73e2007-08-29 04:00:57 +00001591 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +00001592 sqlite3PagerUnref(pPage->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00001593 }
1594}
1595
1596/*
drha6abd042004-06-09 17:37:22 +00001597** During a rollback, when the pager reloads information into the cache
1598** so that the cache is restored to its original state at the start of
1599** the transaction, for each page restored this routine is called.
1600**
1601** This routine needs to reset the extra data section at the end of the
1602** page to agree with the restored data.
1603*/
danielk1977eaa06f62008-09-18 17:34:44 +00001604static void pageReinit(DbPage *pData){
drh07d183d2005-05-01 22:52:42 +00001605 MemPage *pPage;
danielk19773b8a05f2007-03-19 17:44:26 +00001606 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
danielk1977d217e6f2009-04-01 17:13:51 +00001607 assert( sqlite3PagerPageRefcount(pData)>0 );
danielk197771d5d2c2008-09-29 11:49:47 +00001608 if( pPage->isInit ){
drh1fee73e2007-08-29 04:00:57 +00001609 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drha6abd042004-06-09 17:37:22 +00001610 pPage->isInit = 0;
danielk1977d217e6f2009-04-01 17:13:51 +00001611 if( sqlite3PagerPageRefcount(pData)>1 ){
drh5e8d8872009-03-30 17:19:48 +00001612 /* pPage might not be a btree page; it might be an overflow page
1613 ** or ptrmap page or a free page. In those cases, the following
danielk197730548662009-07-09 05:07:37 +00001614 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
drh5e8d8872009-03-30 17:19:48 +00001615 ** But no harm is done by this. And it is very important that
danielk197730548662009-07-09 05:07:37 +00001616 ** btreeInitPage() be called on every btree page so we make
drh5e8d8872009-03-30 17:19:48 +00001617 ** the call for every page that comes in for re-initing. */
danielk197730548662009-07-09 05:07:37 +00001618 btreeInitPage(pPage);
danielk197771d5d2c2008-09-29 11:49:47 +00001619 }
drha6abd042004-06-09 17:37:22 +00001620 }
1621}
1622
1623/*
drhe5fe6902007-12-07 18:55:28 +00001624** Invoke the busy handler for a btree.
1625*/
danielk19771ceedd32008-11-19 10:22:33 +00001626static int btreeInvokeBusyHandler(void *pArg){
drhe5fe6902007-12-07 18:55:28 +00001627 BtShared *pBt = (BtShared*)pArg;
1628 assert( pBt->db );
1629 assert( sqlite3_mutex_held(pBt->db->mutex) );
1630 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1631}
1632
1633/*
drhad3e0102004-09-03 23:32:18 +00001634** Open a database file.
1635**
drh382c0242001-10-06 16:33:02 +00001636** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +00001637** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +00001638** database file will be deleted when sqlite3BtreeClose() is called.
drhe53831d2007-08-17 01:14:38 +00001639** If zFilename is ":memory:" then an in-memory database is created
1640** that is automatically destroyed when it is closed.
drhc47fd8e2009-04-30 13:30:32 +00001641**
1642** If the database is already opened in the same database connection
1643** and we are in shared cache mode, then the open will fail with an
1644** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
1645** objects in the same database connection since doing so will lead
1646** to problems with locking.
drha059ad02001-04-17 20:09:11 +00001647*/
drh23e11ca2004-05-04 17:27:28 +00001648int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +00001649 const char *zFilename, /* Name of the file containing the BTree database */
drhe5fe6902007-12-07 18:55:28 +00001650 sqlite3 *db, /* Associated database handle */
drh3aac2dd2004-04-26 14:10:20 +00001651 Btree **ppBtree, /* Pointer to new Btree object written here */
drh33f4e022007-09-03 15:19:34 +00001652 int flags, /* Options */
1653 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
drh6019e162001-07-02 17:51:45 +00001654){
drh7555d8e2009-03-20 13:15:30 +00001655 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1656 BtShared *pBt = 0; /* Shared part of btree structure */
1657 Btree *p; /* Handle to return */
1658 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
1659 int rc = SQLITE_OK; /* Result code from this function */
1660 u8 nReserve; /* Byte of unused space on each page */
1661 unsigned char zDbHeader[100]; /* Database header content */
danielk1977aef0bf62005-12-30 16:28:01 +00001662
1663 /* Set the variable isMemdb to true for an in-memory database, or
1664 ** false for a file-based database. This symbol is only required if
1665 ** either of the shared-data or autovacuum features are compiled
1666 ** into the library.
1667 */
1668#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1669 #ifdef SQLITE_OMIT_MEMORYDB
drh980b1a72006-08-16 16:42:48 +00001670 const int isMemdb = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00001671 #else
drh980b1a72006-08-16 16:42:48 +00001672 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
danielk1977aef0bf62005-12-30 16:28:01 +00001673 #endif
1674#endif
1675
drhe5fe6902007-12-07 18:55:28 +00001676 assert( db!=0 );
1677 assert( sqlite3_mutex_held(db->mutex) );
drh153c62c2007-08-24 03:51:33 +00001678
drhe5fe6902007-12-07 18:55:28 +00001679 pVfs = db->pVfs;
drh17435752007-08-16 04:30:38 +00001680 p = sqlite3MallocZero(sizeof(Btree));
danielk1977aef0bf62005-12-30 16:28:01 +00001681 if( !p ){
1682 return SQLITE_NOMEM;
1683 }
1684 p->inTrans = TRANS_NONE;
drhe5fe6902007-12-07 18:55:28 +00001685 p->db = db;
danielk1977602b4662009-07-02 07:47:33 +00001686#ifndef SQLITE_OMIT_SHARED_CACHE
1687 p->lock.pBtree = p;
1688 p->lock.iTable = 1;
1689#endif
danielk1977aef0bf62005-12-30 16:28:01 +00001690
drh198bf392006-01-06 21:52:49 +00001691#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001692 /*
1693 ** If this Btree is a candidate for shared cache, try to find an
1694 ** existing BtShared object that we can share with
1695 */
danielk197720c6cc22009-04-01 18:03:00 +00001696 if( isMemdb==0 && zFilename && zFilename[0] ){
drhf1f12682009-09-09 14:17:52 +00001697 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
danielk1977adfb9b02007-09-17 07:02:56 +00001698 int nFullPathname = pVfs->mxPathname+1;
drhe5ae5732008-06-15 02:51:47 +00001699 char *zFullPathname = sqlite3Malloc(nFullPathname);
drhff0587c2007-08-29 17:43:19 +00001700 sqlite3_mutex *mutexShared;
1701 p->sharable = 1;
drhff0587c2007-08-29 17:43:19 +00001702 if( !zFullPathname ){
1703 sqlite3_free(p);
1704 return SQLITE_NOMEM;
1705 }
danielk1977adfb9b02007-09-17 07:02:56 +00001706 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
drh7555d8e2009-03-20 13:15:30 +00001707 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1708 sqlite3_mutex_enter(mutexOpen);
danielk197759f8c082008-06-18 17:09:10 +00001709 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhff0587c2007-08-29 17:43:19 +00001710 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001711 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
drhff0587c2007-08-29 17:43:19 +00001712 assert( pBt->nRef>0 );
1713 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1714 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
drhc47fd8e2009-04-30 13:30:32 +00001715 int iDb;
1716 for(iDb=db->nDb-1; iDb>=0; iDb--){
1717 Btree *pExisting = db->aDb[iDb].pBt;
1718 if( pExisting && pExisting->pBt==pBt ){
1719 sqlite3_mutex_leave(mutexShared);
1720 sqlite3_mutex_leave(mutexOpen);
1721 sqlite3_free(zFullPathname);
1722 sqlite3_free(p);
1723 return SQLITE_CONSTRAINT;
1724 }
1725 }
drhff0587c2007-08-29 17:43:19 +00001726 p->pBt = pBt;
1727 pBt->nRef++;
1728 break;
1729 }
1730 }
1731 sqlite3_mutex_leave(mutexShared);
1732 sqlite3_free(zFullPathname);
danielk1977aef0bf62005-12-30 16:28:01 +00001733 }
drhff0587c2007-08-29 17:43:19 +00001734#ifdef SQLITE_DEBUG
1735 else{
1736 /* In debug mode, we mark all persistent databases as sharable
1737 ** even when they are not. This exercises the locking code and
1738 ** gives more opportunity for asserts(sqlite3_mutex_held())
1739 ** statements to find locking problems.
1740 */
1741 p->sharable = 1;
1742 }
1743#endif
danielk1977aef0bf62005-12-30 16:28:01 +00001744 }
1745#endif
drha059ad02001-04-17 20:09:11 +00001746 if( pBt==0 ){
drhe53831d2007-08-17 01:14:38 +00001747 /*
1748 ** The following asserts make sure that structures used by the btree are
1749 ** the right size. This is to guard against size changes that result
1750 ** when compiling on a different architecture.
danielk197703aded42004-11-22 05:26:27 +00001751 */
drhe53831d2007-08-17 01:14:38 +00001752 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1753 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1754 assert( sizeof(u32)==4 );
1755 assert( sizeof(u16)==2 );
1756 assert( sizeof(Pgno)==4 );
1757
1758 pBt = sqlite3MallocZero( sizeof(*pBt) );
1759 if( pBt==0 ){
1760 rc = SQLITE_NOMEM;
1761 goto btree_open_out;
1762 }
danielk197771d5d2c2008-09-29 11:49:47 +00001763 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
drh4775ecd2009-07-24 19:01:19 +00001764 EXTRA_SIZE, flags, vfsFlags, pageReinit);
drhe53831d2007-08-17 01:14:38 +00001765 if( rc==SQLITE_OK ){
1766 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1767 }
1768 if( rc!=SQLITE_OK ){
1769 goto btree_open_out;
1770 }
danielk19772a50ff02009-04-10 09:47:06 +00001771 pBt->db = db;
danielk19771ceedd32008-11-19 10:22:33 +00001772 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
drhe53831d2007-08-17 01:14:38 +00001773 p->pBt = pBt;
1774
drhe53831d2007-08-17 01:14:38 +00001775 pBt->pCursor = 0;
1776 pBt->pPage1 = 0;
1777 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1778 pBt->pageSize = get2byte(&zDbHeader[16]);
1779 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1780 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
danielk1977a1644fd2007-08-29 12:31:25 +00001781 pBt->pageSize = 0;
drhe53831d2007-08-17 01:14:38 +00001782#ifndef SQLITE_OMIT_AUTOVACUUM
1783 /* If the magic name ":memory:" will create an in-memory database, then
1784 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1785 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1786 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1787 ** regular file-name. In this case the auto-vacuum applies as per normal.
1788 */
1789 if( zFilename && !isMemdb ){
1790 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1791 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1792 }
1793#endif
1794 nReserve = 0;
1795 }else{
1796 nReserve = zDbHeader[20];
drhe53831d2007-08-17 01:14:38 +00001797 pBt->pageSizeFixed = 1;
1798#ifndef SQLITE_OMIT_AUTOVACUUM
1799 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1800 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1801#endif
1802 }
drhfa9601a2009-06-18 17:22:39 +00001803 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
drhc0b61812009-04-30 01:22:41 +00001804 if( rc ) goto btree_open_out;
drhe53831d2007-08-17 01:14:38 +00001805 pBt->usableSize = pBt->pageSize - nReserve;
1806 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
drhe53831d2007-08-17 01:14:38 +00001807
1808#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1809 /* Add the new BtShared object to the linked list sharable BtShareds.
1810 */
1811 if( p->sharable ){
1812 sqlite3_mutex *mutexShared;
1813 pBt->nRef = 1;
danielk197759f8c082008-06-18 17:09:10 +00001814 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
danielk1977075c23a2008-09-01 18:34:20 +00001815 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
danielk197759f8c082008-06-18 17:09:10 +00001816 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
drh3285db22007-09-03 22:00:39 +00001817 if( pBt->mutex==0 ){
1818 rc = SQLITE_NOMEM;
drhe5fe6902007-12-07 18:55:28 +00001819 db->mallocFailed = 0;
drh3285db22007-09-03 22:00:39 +00001820 goto btree_open_out;
1821 }
drhff0587c2007-08-29 17:43:19 +00001822 }
drhe53831d2007-08-17 01:14:38 +00001823 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001824 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1825 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
drhe53831d2007-08-17 01:14:38 +00001826 sqlite3_mutex_leave(mutexShared);
danielk1977951af802004-11-05 15:45:09 +00001827 }
drheee46cf2004-11-06 00:02:48 +00001828#endif
drh90f5ecb2004-07-22 01:19:35 +00001829 }
danielk1977aef0bf62005-12-30 16:28:01 +00001830
drhcfed7bc2006-03-13 14:28:05 +00001831#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001832 /* If the new Btree uses a sharable pBtShared, then link the new
1833 ** Btree into the list of all sharable Btrees for the same connection.
drhabddb0c2007-08-20 13:14:28 +00001834 ** The list is kept in ascending order by pBt address.
danielk197754f01982006-01-18 15:25:17 +00001835 */
drhe53831d2007-08-17 01:14:38 +00001836 if( p->sharable ){
1837 int i;
1838 Btree *pSib;
drhe5fe6902007-12-07 18:55:28 +00001839 for(i=0; i<db->nDb; i++){
1840 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
drhe53831d2007-08-17 01:14:38 +00001841 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1842 if( p->pBt<pSib->pBt ){
1843 p->pNext = pSib;
1844 p->pPrev = 0;
1845 pSib->pPrev = p;
1846 }else{
drhabddb0c2007-08-20 13:14:28 +00001847 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
drhe53831d2007-08-17 01:14:38 +00001848 pSib = pSib->pNext;
1849 }
1850 p->pNext = pSib->pNext;
1851 p->pPrev = pSib;
1852 if( p->pNext ){
1853 p->pNext->pPrev = p;
1854 }
1855 pSib->pNext = p;
1856 }
1857 break;
1858 }
1859 }
danielk1977aef0bf62005-12-30 16:28:01 +00001860 }
danielk1977aef0bf62005-12-30 16:28:01 +00001861#endif
1862 *ppBtree = p;
danielk1977dddbcdc2007-04-26 14:42:34 +00001863
1864btree_open_out:
1865 if( rc!=SQLITE_OK ){
1866 if( pBt && pBt->pPager ){
1867 sqlite3PagerClose(pBt->pPager);
1868 }
drh17435752007-08-16 04:30:38 +00001869 sqlite3_free(pBt);
1870 sqlite3_free(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001871 *ppBtree = 0;
1872 }
drh7555d8e2009-03-20 13:15:30 +00001873 if( mutexOpen ){
1874 assert( sqlite3_mutex_held(mutexOpen) );
1875 sqlite3_mutex_leave(mutexOpen);
1876 }
danielk1977dddbcdc2007-04-26 14:42:34 +00001877 return rc;
drha059ad02001-04-17 20:09:11 +00001878}
1879
1880/*
drhe53831d2007-08-17 01:14:38 +00001881** Decrement the BtShared.nRef counter. When it reaches zero,
1882** remove the BtShared structure from the sharing list. Return
1883** true if the BtShared.nRef counter reaches zero and return
1884** false if it is still positive.
1885*/
1886static int removeFromSharingList(BtShared *pBt){
1887#ifndef SQLITE_OMIT_SHARED_CACHE
1888 sqlite3_mutex *pMaster;
1889 BtShared *pList;
1890 int removed = 0;
1891
drhd677b3d2007-08-20 22:48:41 +00001892 assert( sqlite3_mutex_notheld(pBt->mutex) );
danielk197759f8c082008-06-18 17:09:10 +00001893 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhe53831d2007-08-17 01:14:38 +00001894 sqlite3_mutex_enter(pMaster);
1895 pBt->nRef--;
1896 if( pBt->nRef<=0 ){
drh78f82d12008-09-02 00:52:52 +00001897 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1898 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
drhe53831d2007-08-17 01:14:38 +00001899 }else{
drh78f82d12008-09-02 00:52:52 +00001900 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
drh34004ce2008-07-11 16:15:17 +00001901 while( ALWAYS(pList) && pList->pNext!=pBt ){
drhe53831d2007-08-17 01:14:38 +00001902 pList=pList->pNext;
1903 }
drh34004ce2008-07-11 16:15:17 +00001904 if( ALWAYS(pList) ){
drhe53831d2007-08-17 01:14:38 +00001905 pList->pNext = pBt->pNext;
1906 }
1907 }
drh3285db22007-09-03 22:00:39 +00001908 if( SQLITE_THREADSAFE ){
1909 sqlite3_mutex_free(pBt->mutex);
1910 }
drhe53831d2007-08-17 01:14:38 +00001911 removed = 1;
1912 }
1913 sqlite3_mutex_leave(pMaster);
1914 return removed;
1915#else
1916 return 1;
1917#endif
1918}
1919
1920/*
drhf7141992008-06-19 00:16:08 +00001921** Make sure pBt->pTmpSpace points to an allocation of
1922** MX_CELL_SIZE(pBt) bytes.
1923*/
1924static void allocateTempSpace(BtShared *pBt){
1925 if( !pBt->pTmpSpace ){
1926 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1927 }
1928}
1929
1930/*
1931** Free the pBt->pTmpSpace allocation
1932*/
1933static void freeTempSpace(BtShared *pBt){
1934 sqlite3PageFree( pBt->pTmpSpace);
1935 pBt->pTmpSpace = 0;
1936}
1937
1938/*
drha059ad02001-04-17 20:09:11 +00001939** Close an open database and invalidate all cursors.
1940*/
danielk1977aef0bf62005-12-30 16:28:01 +00001941int sqlite3BtreeClose(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00001942 BtShared *pBt = p->pBt;
1943 BtCursor *pCur;
1944
danielk1977aef0bf62005-12-30 16:28:01 +00001945 /* Close all cursors opened via this handle. */
drhe5fe6902007-12-07 18:55:28 +00001946 assert( sqlite3_mutex_held(p->db->mutex) );
drhe53831d2007-08-17 01:14:38 +00001947 sqlite3BtreeEnter(p);
danielk1977aef0bf62005-12-30 16:28:01 +00001948 pCur = pBt->pCursor;
1949 while( pCur ){
1950 BtCursor *pTmp = pCur;
1951 pCur = pCur->pNext;
1952 if( pTmp->pBtree==p ){
1953 sqlite3BtreeCloseCursor(pTmp);
1954 }
drha059ad02001-04-17 20:09:11 +00001955 }
danielk1977aef0bf62005-12-30 16:28:01 +00001956
danielk19778d34dfd2006-01-24 16:37:57 +00001957 /* Rollback any active transaction and free the handle structure.
1958 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1959 ** this handle.
1960 */
danielk1977b597f742006-01-15 11:39:18 +00001961 sqlite3BtreeRollback(p);
drhe53831d2007-08-17 01:14:38 +00001962 sqlite3BtreeLeave(p);
danielk1977aef0bf62005-12-30 16:28:01 +00001963
danielk1977aef0bf62005-12-30 16:28:01 +00001964 /* If there are still other outstanding references to the shared-btree
1965 ** structure, return now. The remainder of this procedure cleans
1966 ** up the shared-btree.
1967 */
drhe53831d2007-08-17 01:14:38 +00001968 assert( p->wantToLock==0 && p->locked==0 );
1969 if( !p->sharable || removeFromSharingList(pBt) ){
1970 /* The pBt is no longer on the sharing list, so we can access
1971 ** it without having to hold the mutex.
1972 **
1973 ** Clean out and delete the BtShared object.
1974 */
1975 assert( !pBt->pCursor );
drhe53831d2007-08-17 01:14:38 +00001976 sqlite3PagerClose(pBt->pPager);
1977 if( pBt->xFreeSchema && pBt->pSchema ){
1978 pBt->xFreeSchema(pBt->pSchema);
1979 }
1980 sqlite3_free(pBt->pSchema);
drhf7141992008-06-19 00:16:08 +00001981 freeTempSpace(pBt);
drh65bbf292008-06-19 01:03:17 +00001982 sqlite3_free(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00001983 }
1984
drhe53831d2007-08-17 01:14:38 +00001985#ifndef SQLITE_OMIT_SHARED_CACHE
drhcab5ed72007-08-22 11:41:18 +00001986 assert( p->wantToLock==0 );
1987 assert( p->locked==0 );
1988 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1989 if( p->pNext ) p->pNext->pPrev = p->pPrev;
danielk1977aef0bf62005-12-30 16:28:01 +00001990#endif
1991
drhe53831d2007-08-17 01:14:38 +00001992 sqlite3_free(p);
drha059ad02001-04-17 20:09:11 +00001993 return SQLITE_OK;
1994}
1995
1996/*
drhda47d772002-12-02 04:25:19 +00001997** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +00001998**
1999** The maximum number of cache pages is set to the absolute
2000** value of mxPage. If mxPage is negative, the pager will
2001** operate asynchronously - it will not stop to do fsync()s
2002** to insure data is written to the disk surface before
2003** continuing. Transactions still work if synchronous is off,
2004** and the database cannot be corrupted if this program
2005** crashes. But if the operating system crashes or there is
2006** an abrupt power failure when synchronous is off, the database
2007** could be left in an inconsistent and unrecoverable state.
2008** Synchronous is on by default so database corruption is not
2009** normally a worry.
drhf57b14a2001-09-14 18:54:08 +00002010*/
danielk1977aef0bf62005-12-30 16:28:01 +00002011int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2012 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00002013 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00002014 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00002015 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
drhd677b3d2007-08-20 22:48:41 +00002016 sqlite3BtreeLeave(p);
drhf57b14a2001-09-14 18:54:08 +00002017 return SQLITE_OK;
2018}
2019
2020/*
drh973b6e32003-02-12 14:09:42 +00002021** Change the way data is synced to disk in order to increase or decrease
2022** how well the database resists damage due to OS crashes and power
2023** failures. Level 1 is the same as asynchronous (no syncs() occur and
2024** there is a high probability of damage) Level 2 is the default. There
2025** is a very low but non-zero probability of damage. Level 3 reduces the
2026** probability of damage to near zero but with a write performance reduction.
2027*/
danielk197793758c82005-01-21 08:13:14 +00002028#ifndef SQLITE_OMIT_PAGER_PRAGMAS
drhac530b12006-02-11 01:25:50 +00002029int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
danielk1977aef0bf62005-12-30 16:28:01 +00002030 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00002031 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00002032 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00002033 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
drhd677b3d2007-08-20 22:48:41 +00002034 sqlite3BtreeLeave(p);
drh973b6e32003-02-12 14:09:42 +00002035 return SQLITE_OK;
2036}
danielk197793758c82005-01-21 08:13:14 +00002037#endif
drh973b6e32003-02-12 14:09:42 +00002038
drh2c8997b2005-08-27 16:36:48 +00002039/*
2040** Return TRUE if the given btree is set to safety level 1. In other
2041** words, return TRUE if no sync() occurs on the disk files.
2042*/
danielk1977aef0bf62005-12-30 16:28:01 +00002043int sqlite3BtreeSyncDisabled(Btree *p){
2044 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002045 int rc;
drhe5fe6902007-12-07 18:55:28 +00002046 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00002047 sqlite3BtreeEnter(p);
drhd0679ed2007-08-28 22:24:34 +00002048 assert( pBt && pBt->pPager );
drhd677b3d2007-08-20 22:48:41 +00002049 rc = sqlite3PagerNosync(pBt->pPager);
2050 sqlite3BtreeLeave(p);
2051 return rc;
drh2c8997b2005-08-27 16:36:48 +00002052}
2053
danielk1977576ec6b2005-01-21 11:55:25 +00002054#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
drh973b6e32003-02-12 14:09:42 +00002055/*
drh90f5ecb2004-07-22 01:19:35 +00002056** Change the default pages size and the number of reserved bytes per page.
drhce4869f2009-04-02 20:16:58 +00002057** Or, if the page size has already been fixed, return SQLITE_READONLY
2058** without changing anything.
drh06f50212004-11-02 14:24:33 +00002059**
2060** The page size must be a power of 2 between 512 and 65536. If the page
2061** size supplied does not meet this constraint then the page size is not
2062** changed.
2063**
2064** Page sizes are constrained to be a power of two so that the region
2065** of the database file used for locking (beginning at PENDING_BYTE,
2066** the first byte past the 1GB boundary, 0x40000000) needs to occur
2067** at the beginning of a page.
danielk197728129562005-01-11 10:25:06 +00002068**
2069** If parameter nReserve is less than zero, then the number of reserved
2070** bytes per page is left unchanged.
drhce4869f2009-04-02 20:16:58 +00002071**
2072** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
2073** and autovacuum mode can no longer be changed.
drh90f5ecb2004-07-22 01:19:35 +00002074*/
drhce4869f2009-04-02 20:16:58 +00002075int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
danielk1977a1644fd2007-08-29 12:31:25 +00002076 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002077 BtShared *pBt = p->pBt;
drhf49661a2008-12-10 16:45:50 +00002078 assert( nReserve>=-1 && nReserve<=255 );
drhd677b3d2007-08-20 22:48:41 +00002079 sqlite3BtreeEnter(p);
drh90f5ecb2004-07-22 01:19:35 +00002080 if( pBt->pageSizeFixed ){
drhd677b3d2007-08-20 22:48:41 +00002081 sqlite3BtreeLeave(p);
drh90f5ecb2004-07-22 01:19:35 +00002082 return SQLITE_READONLY;
2083 }
2084 if( nReserve<0 ){
2085 nReserve = pBt->pageSize - pBt->usableSize;
2086 }
drhf49661a2008-12-10 16:45:50 +00002087 assert( nReserve>=0 && nReserve<=255 );
drh06f50212004-11-02 14:24:33 +00002088 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2089 ((pageSize-1)&pageSize)==0 ){
drh07d183d2005-05-01 22:52:42 +00002090 assert( (pageSize & 7)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00002091 assert( !pBt->pPage1 && !pBt->pCursor );
drh1bd10f82008-12-10 21:19:56 +00002092 pBt->pageSize = (u16)pageSize;
drhf7141992008-06-19 00:16:08 +00002093 freeTempSpace(pBt);
drh90f5ecb2004-07-22 01:19:35 +00002094 }
drhfa9601a2009-06-18 17:22:39 +00002095 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
drhf49661a2008-12-10 16:45:50 +00002096 pBt->usableSize = pBt->pageSize - (u16)nReserve;
drhce4869f2009-04-02 20:16:58 +00002097 if( iFix ) pBt->pageSizeFixed = 1;
drhd677b3d2007-08-20 22:48:41 +00002098 sqlite3BtreeLeave(p);
danielk1977a1644fd2007-08-29 12:31:25 +00002099 return rc;
drh90f5ecb2004-07-22 01:19:35 +00002100}
2101
2102/*
2103** Return the currently defined page size
2104*/
danielk1977aef0bf62005-12-30 16:28:01 +00002105int sqlite3BtreeGetPageSize(Btree *p){
2106 return p->pBt->pageSize;
drh90f5ecb2004-07-22 01:19:35 +00002107}
drh7f751222009-03-17 22:33:00 +00002108
2109/*
2110** Return the number of bytes of space at the end of every page that
2111** are intentually left unused. This is the "reserved" space that is
2112** sometimes used by extensions.
2113*/
danielk1977aef0bf62005-12-30 16:28:01 +00002114int sqlite3BtreeGetReserve(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00002115 int n;
2116 sqlite3BtreeEnter(p);
2117 n = p->pBt->pageSize - p->pBt->usableSize;
2118 sqlite3BtreeLeave(p);
2119 return n;
drh2011d5f2004-07-22 02:40:37 +00002120}
drhf8e632b2007-05-08 14:51:36 +00002121
2122/*
2123** Set the maximum page count for a database if mxPage is positive.
2124** No changes are made if mxPage is 0 or negative.
2125** Regardless of the value of mxPage, return the maximum page count.
2126*/
2127int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
drhd677b3d2007-08-20 22:48:41 +00002128 int n;
2129 sqlite3BtreeEnter(p);
2130 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2131 sqlite3BtreeLeave(p);
2132 return n;
drhf8e632b2007-05-08 14:51:36 +00002133}
danielk1977576ec6b2005-01-21 11:55:25 +00002134#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
drh90f5ecb2004-07-22 01:19:35 +00002135
2136/*
danielk1977951af802004-11-05 15:45:09 +00002137** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2138** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2139** is disabled. The default value for the auto-vacuum property is
2140** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2141*/
danielk1977aef0bf62005-12-30 16:28:01 +00002142int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
danielk1977951af802004-11-05 15:45:09 +00002143#ifdef SQLITE_OMIT_AUTOVACUUM
drheee46cf2004-11-06 00:02:48 +00002144 return SQLITE_READONLY;
danielk1977951af802004-11-05 15:45:09 +00002145#else
danielk1977dddbcdc2007-04-26 14:42:34 +00002146 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002147 int rc = SQLITE_OK;
drh076d4662009-02-18 20:31:18 +00002148 u8 av = (u8)autoVacuum;
drhd677b3d2007-08-20 22:48:41 +00002149
2150 sqlite3BtreeEnter(p);
drh076d4662009-02-18 20:31:18 +00002151 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00002152 rc = SQLITE_READONLY;
2153 }else{
drh076d4662009-02-18 20:31:18 +00002154 pBt->autoVacuum = av ?1:0;
2155 pBt->incrVacuum = av==2 ?1:0;
danielk1977951af802004-11-05 15:45:09 +00002156 }
drhd677b3d2007-08-20 22:48:41 +00002157 sqlite3BtreeLeave(p);
2158 return rc;
danielk1977951af802004-11-05 15:45:09 +00002159#endif
2160}
2161
2162/*
2163** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2164** enabled 1 is returned. Otherwise 0.
2165*/
danielk1977aef0bf62005-12-30 16:28:01 +00002166int sqlite3BtreeGetAutoVacuum(Btree *p){
danielk1977951af802004-11-05 15:45:09 +00002167#ifdef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00002168 return BTREE_AUTOVACUUM_NONE;
danielk1977951af802004-11-05 15:45:09 +00002169#else
drhd677b3d2007-08-20 22:48:41 +00002170 int rc;
2171 sqlite3BtreeEnter(p);
2172 rc = (
danielk1977dddbcdc2007-04-26 14:42:34 +00002173 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2174 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2175 BTREE_AUTOVACUUM_INCR
2176 );
drhd677b3d2007-08-20 22:48:41 +00002177 sqlite3BtreeLeave(p);
2178 return rc;
danielk1977951af802004-11-05 15:45:09 +00002179#endif
2180}
2181
2182
2183/*
drha34b6762004-05-07 13:30:42 +00002184** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +00002185** also acquire a readlock on that file.
2186**
2187** SQLITE_OK is returned on success. If the file is not a
2188** well-formed database file, then SQLITE_CORRUPT is returned.
2189** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
drh4f0ee682007-03-30 20:43:40 +00002190** is returned if we run out of memory.
drh306dc212001-05-21 13:45:10 +00002191*/
danielk1977aef0bf62005-12-30 16:28:01 +00002192static int lockBtree(BtShared *pBt){
danielk1977f653d782008-03-20 11:04:21 +00002193 int rc;
drh3aac2dd2004-04-26 14:10:20 +00002194 MemPage *pPage1;
danielk197793f7af92008-05-09 16:57:50 +00002195 int nPage;
drhd677b3d2007-08-20 22:48:41 +00002196
drh1fee73e2007-08-29 04:00:57 +00002197 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977295dc102009-04-01 19:07:03 +00002198 assert( pBt->pPage1==0 );
danielk197789bc4bc2009-07-21 19:25:24 +00002199 rc = sqlite3PagerSharedLock(pBt->pPager);
2200 if( rc!=SQLITE_OK ) return rc;
danielk197730548662009-07-09 05:07:37 +00002201 rc = btreeGetPage(pBt, 1, &pPage1, 0);
drh306dc212001-05-21 13:45:10 +00002202 if( rc!=SQLITE_OK ) return rc;
drh306dc212001-05-21 13:45:10 +00002203
2204 /* Do some checking to help insure the file we opened really is
2205 ** a valid database file.
2206 */
danielk1977ad0132d2008-06-07 08:58:22 +00002207 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
2208 if( rc!=SQLITE_OK ){
danielk197793f7af92008-05-09 16:57:50 +00002209 goto page1_init_failed;
2210 }else if( nPage>0 ){
danielk1977f653d782008-03-20 11:04:21 +00002211 int pageSize;
2212 int usableSize;
drhb6f41482004-05-14 01:58:11 +00002213 u8 *page1 = pPage1->aData;
danielk1977ad0132d2008-06-07 08:58:22 +00002214 rc = SQLITE_NOTADB;
drhb6f41482004-05-14 01:58:11 +00002215 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00002216 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00002217 }
drh309169a2007-04-24 17:27:51 +00002218 if( page1[18]>1 ){
2219 pBt->readOnly = 1;
2220 }
2221 if( page1[19]>1 ){
drhb6f41482004-05-14 01:58:11 +00002222 goto page1_init_failed;
2223 }
drhe5ae5732008-06-15 02:51:47 +00002224
2225 /* The maximum embedded fraction must be exactly 25%. And the minimum
2226 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2227 ** The original design allowed these amounts to vary, but as of
2228 ** version 3.6.0, we require them to be fixed.
2229 */
2230 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2231 goto page1_init_failed;
2232 }
drh07d183d2005-05-01 22:52:42 +00002233 pageSize = get2byte(&page1[16]);
drh7dc385e2007-09-06 23:39:36 +00002234 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
2235 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
2236 ){
drh07d183d2005-05-01 22:52:42 +00002237 goto page1_init_failed;
2238 }
2239 assert( (pageSize & 7)==0 );
danielk1977f653d782008-03-20 11:04:21 +00002240 usableSize = pageSize - page1[20];
2241 if( pageSize!=pBt->pageSize ){
2242 /* After reading the first page of the database assuming a page size
2243 ** of BtShared.pageSize, we have discovered that the page-size is
2244 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2245 ** zero and return SQLITE_OK. The caller will call this function
2246 ** again with the correct page-size.
2247 */
2248 releasePage(pPage1);
drhf49661a2008-12-10 16:45:50 +00002249 pBt->usableSize = (u16)usableSize;
2250 pBt->pageSize = (u16)pageSize;
drhf7141992008-06-19 00:16:08 +00002251 freeTempSpace(pBt);
drhfa9601a2009-06-18 17:22:39 +00002252 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2253 pageSize-usableSize);
drh5e483932009-07-10 16:51:30 +00002254 return rc;
danielk1977f653d782008-03-20 11:04:21 +00002255 }
drhb33e1b92009-06-18 11:29:20 +00002256 if( usableSize<480 ){
drhb6f41482004-05-14 01:58:11 +00002257 goto page1_init_failed;
2258 }
drh1bd10f82008-12-10 21:19:56 +00002259 pBt->pageSize = (u16)pageSize;
2260 pBt->usableSize = (u16)usableSize;
drh057cd3a2005-02-15 16:23:02 +00002261#ifndef SQLITE_OMIT_AUTOVACUUM
2262 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
danielk197727b1f952007-06-25 08:16:58 +00002263 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
drh057cd3a2005-02-15 16:23:02 +00002264#endif
drh306dc212001-05-21 13:45:10 +00002265 }
drhb6f41482004-05-14 01:58:11 +00002266
2267 /* maxLocal is the maximum amount of payload to store locally for
2268 ** a cell. Make sure it is small enough so that at least minFanout
2269 ** cells can will fit on one page. We assume a 10-byte page header.
2270 ** Besides the payload, the cell must store:
drh43605152004-05-29 21:46:49 +00002271 ** 2-byte pointer to the cell
drhb6f41482004-05-14 01:58:11 +00002272 ** 4-byte child pointer
2273 ** 9-byte nKey value
2274 ** 4-byte nData value
2275 ** 4-byte overflow page pointer
drh43605152004-05-29 21:46:49 +00002276 ** So a cell consists of a 2-byte poiner, a header which is as much as
2277 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2278 ** page pointer.
drhb6f41482004-05-14 01:58:11 +00002279 */
drhe5ae5732008-06-15 02:51:47 +00002280 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
2281 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
drh43605152004-05-29 21:46:49 +00002282 pBt->maxLeaf = pBt->usableSize - 35;
drhe5ae5732008-06-15 02:51:47 +00002283 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
drh2e38c322004-09-03 18:38:44 +00002284 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00002285 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00002286 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00002287
drh72f82862001-05-24 21:06:34 +00002288page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00002289 releasePage(pPage1);
2290 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00002291 return rc;
drh306dc212001-05-21 13:45:10 +00002292}
2293
2294/*
drhb8ca3072001-12-05 00:21:20 +00002295** If there are no outstanding cursors and we are not in the middle
2296** of a transaction but there is a read lock on the database, then
2297** this routine unrefs the first page of the database file which
2298** has the effect of releasing the read lock.
2299**
drhb8ca3072001-12-05 00:21:20 +00002300** If there is a transaction in progress, this routine is a no-op.
2301*/
danielk1977aef0bf62005-12-30 16:28:01 +00002302static void unlockBtreeIfUnused(BtShared *pBt){
drh1fee73e2007-08-29 04:00:57 +00002303 assert( sqlite3_mutex_held(pBt->mutex) );
danielk19771bc9ee92009-07-04 15:41:02 +00002304 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2305 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
danielk1977c1761e82009-06-25 09:40:03 +00002306 assert( pBt->pPage1->aData );
2307 assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2308 assert( pBt->pPage1->aData );
2309 releasePage(pBt->pPage1);
drh3aac2dd2004-04-26 14:10:20 +00002310 pBt->pPage1 = 0;
drhb8ca3072001-12-05 00:21:20 +00002311 }
2312}
2313
2314/*
drhe39f2f92009-07-23 01:43:59 +00002315** If pBt points to an empty file then convert that empty file
2316** into a new empty database by initializing the first page of
2317** the database.
drh8b2f49b2001-06-08 00:21:52 +00002318*/
danielk1977aef0bf62005-12-30 16:28:01 +00002319static int newDatabase(BtShared *pBt){
drh9e572e62004-04-23 23:43:10 +00002320 MemPage *pP1;
2321 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00002322 int rc;
danielk1977ad0132d2008-06-07 08:58:22 +00002323 int nPage;
drhd677b3d2007-08-20 22:48:41 +00002324
drh1fee73e2007-08-29 04:00:57 +00002325 assert( sqlite3_mutex_held(pBt->mutex) );
drhe39f2f92009-07-23 01:43:59 +00002326 /* The database size has already been measured and cached, so failure
2327 ** is impossible here. If the original size measurement failed, then
2328 ** processing aborts before entering this routine. */
danielk1977ad0132d2008-06-07 08:58:22 +00002329 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
drhe39f2f92009-07-23 01:43:59 +00002330 if( NEVER(rc!=SQLITE_OK) || nPage>0 ){
danielk1977ad0132d2008-06-07 08:58:22 +00002331 return rc;
2332 }
drh3aac2dd2004-04-26 14:10:20 +00002333 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00002334 assert( pP1!=0 );
2335 data = pP1->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00002336 rc = sqlite3PagerWrite(pP1->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00002337 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00002338 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2339 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00002340 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00002341 data[18] = 1;
2342 data[19] = 1;
drhf49661a2008-12-10 16:45:50 +00002343 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2344 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
drhe5ae5732008-06-15 02:51:47 +00002345 data[21] = 64;
2346 data[22] = 32;
2347 data[23] = 32;
drhb6f41482004-05-14 01:58:11 +00002348 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00002349 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drhf2a611c2004-09-05 00:33:43 +00002350 pBt->pageSizeFixed = 1;
danielk1977003ba062004-11-04 02:57:33 +00002351#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00002352 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
danielk1977418899a2007-06-24 10:14:00 +00002353 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
danielk1977dddbcdc2007-04-26 14:42:34 +00002354 put4byte(&data[36 + 4*4], pBt->autoVacuum);
danielk1977418899a2007-06-24 10:14:00 +00002355 put4byte(&data[36 + 7*4], pBt->incrVacuum);
danielk1977003ba062004-11-04 02:57:33 +00002356#endif
drh8b2f49b2001-06-08 00:21:52 +00002357 return SQLITE_OK;
2358}
2359
2360/*
danielk1977ee5741e2004-05-31 10:01:34 +00002361** Attempt to start a new transaction. A write-transaction
drh684917c2004-10-05 02:41:42 +00002362** is started if the second argument is nonzero, otherwise a read-
2363** transaction. If the second argument is 2 or more and exclusive
2364** transaction is started, meaning that no other process is allowed
2365** to access the database. A preexisting transaction may not be
drhb8ef32c2005-03-14 02:01:49 +00002366** upgraded to exclusive by calling this routine a second time - the
drh684917c2004-10-05 02:41:42 +00002367** exclusivity flag only works for a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00002368**
danielk1977ee5741e2004-05-31 10:01:34 +00002369** A write-transaction must be started before attempting any
2370** changes to the database. None of the following routines
2371** will work unless a transaction is started first:
drh8b2f49b2001-06-08 00:21:52 +00002372**
drh23e11ca2004-05-04 17:27:28 +00002373** sqlite3BtreeCreateTable()
2374** sqlite3BtreeCreateIndex()
2375** sqlite3BtreeClearTable()
2376** sqlite3BtreeDropTable()
2377** sqlite3BtreeInsert()
2378** sqlite3BtreeDelete()
2379** sqlite3BtreeUpdateMeta()
danielk197713adf8a2004-06-03 16:08:41 +00002380**
drhb8ef32c2005-03-14 02:01:49 +00002381** If an initial attempt to acquire the lock fails because of lock contention
2382** and the database was previously unlocked, then invoke the busy handler
2383** if there is one. But if there was previously a read-lock, do not
2384** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
2385** returned when there is already a read-lock in order to avoid a deadlock.
2386**
2387** Suppose there are two processes A and B. A has a read lock and B has
2388** a reserved lock. B tries to promote to exclusive but is blocked because
2389** of A's read lock. A tries to promote to reserved but is blocked by B.
2390** One or the other of the two processes must give way or there can be
2391** no progress. By returning SQLITE_BUSY and not invoking the busy callback
2392** when A already has a read lock, we encourage A to give up and let B
2393** proceed.
drha059ad02001-04-17 20:09:11 +00002394*/
danielk1977aef0bf62005-12-30 16:28:01 +00002395int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
danielk1977404ca072009-03-16 13:19:36 +00002396 sqlite3 *pBlock = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00002397 BtShared *pBt = p->pBt;
danielk1977ee5741e2004-05-31 10:01:34 +00002398 int rc = SQLITE_OK;
2399
drhd677b3d2007-08-20 22:48:41 +00002400 sqlite3BtreeEnter(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002401 btreeIntegrity(p);
2402
danielk1977ee5741e2004-05-31 10:01:34 +00002403 /* If the btree is already in a write-transaction, or it
2404 ** is already in a read-transaction and a read-transaction
2405 ** is requested, this is a no-op.
2406 */
danielk1977aef0bf62005-12-30 16:28:01 +00002407 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
drhd677b3d2007-08-20 22:48:41 +00002408 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00002409 }
drhb8ef32c2005-03-14 02:01:49 +00002410
2411 /* Write transactions are not possible on a read-only database */
danielk1977ee5741e2004-05-31 10:01:34 +00002412 if( pBt->readOnly && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00002413 rc = SQLITE_READONLY;
2414 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00002415 }
2416
danielk1977404ca072009-03-16 13:19:36 +00002417#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +00002418 /* If another database handle has already opened a write transaction
2419 ** on this shared-btree structure and a second write transaction is
danielk1977404ca072009-03-16 13:19:36 +00002420 ** requested, return SQLITE_LOCKED.
danielk1977aef0bf62005-12-30 16:28:01 +00002421 */
danielk1977404ca072009-03-16 13:19:36 +00002422 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
2423 pBlock = pBt->pWriter->db;
2424 }else if( wrflag>1 ){
danielk1977641b0f42007-12-21 04:47:25 +00002425 BtLock *pIter;
2426 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2427 if( pIter->pBtree!=p ){
danielk1977404ca072009-03-16 13:19:36 +00002428 pBlock = pIter->pBtree->db;
2429 break;
danielk1977641b0f42007-12-21 04:47:25 +00002430 }
2431 }
2432 }
danielk1977404ca072009-03-16 13:19:36 +00002433 if( pBlock ){
2434 sqlite3ConnectionBlocked(p->db, pBlock);
2435 rc = SQLITE_LOCKED_SHAREDCACHE;
2436 goto trans_begun;
2437 }
danielk1977641b0f42007-12-21 04:47:25 +00002438#endif
2439
danielk1977602b4662009-07-02 07:47:33 +00002440 /* Any read-only or read-write transaction implies a read-lock on
2441 ** page 1. So if some other shared-cache client already has a write-lock
2442 ** on page 1, the transaction cannot be opened. */
drh4c301aa2009-07-15 17:25:45 +00002443 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2444 if( SQLITE_OK!=rc ) goto trans_begun;
danielk1977602b4662009-07-02 07:47:33 +00002445
drhb8ef32c2005-03-14 02:01:49 +00002446 do {
danielk1977295dc102009-04-01 19:07:03 +00002447 /* Call lockBtree() until either pBt->pPage1 is populated or
2448 ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2449 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2450 ** reading page 1 it discovers that the page-size of the database
2451 ** file is not pBt->pageSize. In this case lockBtree() will update
2452 ** pBt->pageSize to the page-size of the file on disk.
2453 */
2454 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
drh309169a2007-04-24 17:27:51 +00002455
drhb8ef32c2005-03-14 02:01:49 +00002456 if( rc==SQLITE_OK && wrflag ){
drh309169a2007-04-24 17:27:51 +00002457 if( pBt->readOnly ){
2458 rc = SQLITE_READONLY;
2459 }else{
danielk1977d8293352009-04-30 09:10:37 +00002460 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
drh309169a2007-04-24 17:27:51 +00002461 if( rc==SQLITE_OK ){
2462 rc = newDatabase(pBt);
2463 }
drhb8ef32c2005-03-14 02:01:49 +00002464 }
2465 }
2466
danielk1977bd434552009-03-18 10:33:00 +00002467 if( rc!=SQLITE_OK ){
drhb8ef32c2005-03-14 02:01:49 +00002468 unlockBtreeIfUnused(pBt);
2469 }
danielk1977aef0bf62005-12-30 16:28:01 +00002470 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
danielk19771ceedd32008-11-19 10:22:33 +00002471 btreeInvokeBusyHandler(pBt) );
danielk1977aef0bf62005-12-30 16:28:01 +00002472
2473 if( rc==SQLITE_OK ){
2474 if( p->inTrans==TRANS_NONE ){
2475 pBt->nTransaction++;
danielk1977602b4662009-07-02 07:47:33 +00002476#ifndef SQLITE_OMIT_SHARED_CACHE
2477 if( p->sharable ){
2478 assert( p->lock.pBtree==p && p->lock.iTable==1 );
2479 p->lock.eLock = READ_LOCK;
2480 p->lock.pNext = pBt->pLock;
2481 pBt->pLock = &p->lock;
2482 }
2483#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002484 }
2485 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2486 if( p->inTrans>pBt->inTransaction ){
2487 pBt->inTransaction = p->inTrans;
2488 }
danielk1977641b0f42007-12-21 04:47:25 +00002489#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977404ca072009-03-16 13:19:36 +00002490 if( wrflag ){
2491 assert( !pBt->pWriter );
2492 pBt->pWriter = p;
shaneca18d202009-03-23 02:34:32 +00002493 pBt->isExclusive = (u8)(wrflag>1);
danielk1977641b0f42007-12-21 04:47:25 +00002494 }
2495#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002496 }
2497
drhd677b3d2007-08-20 22:48:41 +00002498
2499trans_begun:
danielk1977fd7f0452008-12-17 17:30:26 +00002500 if( rc==SQLITE_OK && wrflag ){
danielk197712dd5492008-12-18 15:45:07 +00002501 /* This call makes sure that the pager has the correct number of
2502 ** open savepoints. If the second parameter is greater than 0 and
2503 ** the sub-journal is not already open, then it will be opened here.
2504 */
danielk1977fd7f0452008-12-17 17:30:26 +00002505 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2506 }
danielk197712dd5492008-12-18 15:45:07 +00002507
danielk1977aef0bf62005-12-30 16:28:01 +00002508 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002509 sqlite3BtreeLeave(p);
drhb8ca3072001-12-05 00:21:20 +00002510 return rc;
drha059ad02001-04-17 20:09:11 +00002511}
2512
danielk1977687566d2004-11-02 12:56:41 +00002513#ifndef SQLITE_OMIT_AUTOVACUUM
2514
2515/*
2516** Set the pointer-map entries for all children of page pPage. Also, if
2517** pPage contains cells that point to overflow pages, set the pointer
2518** map entries for the overflow pages as well.
2519*/
2520static int setChildPtrmaps(MemPage *pPage){
2521 int i; /* Counter variable */
2522 int nCell; /* Number of cells in page pPage */
danielk19772df71c72007-05-24 07:22:42 +00002523 int rc; /* Return code */
danielk1977aef0bf62005-12-30 16:28:01 +00002524 BtShared *pBt = pPage->pBt;
drhf49661a2008-12-10 16:45:50 +00002525 u8 isInitOrig = pPage->isInit;
danielk1977687566d2004-11-02 12:56:41 +00002526 Pgno pgno = pPage->pgno;
2527
drh1fee73e2007-08-29 04:00:57 +00002528 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197730548662009-07-09 05:07:37 +00002529 rc = btreeInitPage(pPage);
danielk19772df71c72007-05-24 07:22:42 +00002530 if( rc!=SQLITE_OK ){
2531 goto set_child_ptrmaps_out;
2532 }
danielk1977687566d2004-11-02 12:56:41 +00002533 nCell = pPage->nCell;
2534
2535 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002536 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002537
drh98add2e2009-07-20 17:11:49 +00002538 ptrmapPutOvflPtr(pPage, pCell, &rc);
danielk197726836652005-01-17 01:33:13 +00002539
danielk1977687566d2004-11-02 12:56:41 +00002540 if( !pPage->leaf ){
2541 Pgno childPgno = get4byte(pCell);
drh98add2e2009-07-20 17:11:49 +00002542 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
danielk1977687566d2004-11-02 12:56:41 +00002543 }
2544 }
2545
2546 if( !pPage->leaf ){
2547 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh98add2e2009-07-20 17:11:49 +00002548 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
danielk1977687566d2004-11-02 12:56:41 +00002549 }
2550
2551set_child_ptrmaps_out:
2552 pPage->isInit = isInitOrig;
2553 return rc;
2554}
2555
2556/*
drhf3aed592009-07-08 18:12:49 +00002557** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
2558** that it points to iTo. Parameter eType describes the type of pointer to
2559** be modified, as follows:
danielk1977687566d2004-11-02 12:56:41 +00002560**
2561** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2562** page of pPage.
2563**
2564** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2565** page pointed to by one of the cells on pPage.
2566**
2567** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2568** overflow page in the list.
2569*/
danielk1977fdb7cdb2005-01-17 02:12:18 +00002570static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
drh1fee73e2007-08-29 04:00:57 +00002571 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhc5053fb2008-11-27 02:22:10 +00002572 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk1977687566d2004-11-02 12:56:41 +00002573 if( eType==PTRMAP_OVERFLOW2 ){
danielk1977f78fc082004-11-02 14:40:32 +00002574 /* The pointer is always the first 4 bytes of the page in this case. */
danielk1977fdb7cdb2005-01-17 02:12:18 +00002575 if( get4byte(pPage->aData)!=iFrom ){
drh49285702005-09-17 15:20:26 +00002576 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002577 }
danielk1977f78fc082004-11-02 14:40:32 +00002578 put4byte(pPage->aData, iTo);
danielk1977687566d2004-11-02 12:56:41 +00002579 }else{
drhf49661a2008-12-10 16:45:50 +00002580 u8 isInitOrig = pPage->isInit;
danielk1977687566d2004-11-02 12:56:41 +00002581 int i;
2582 int nCell;
2583
danielk197730548662009-07-09 05:07:37 +00002584 btreeInitPage(pPage);
danielk1977687566d2004-11-02 12:56:41 +00002585 nCell = pPage->nCell;
2586
danielk1977687566d2004-11-02 12:56:41 +00002587 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002588 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002589 if( eType==PTRMAP_OVERFLOW1 ){
2590 CellInfo info;
danielk197730548662009-07-09 05:07:37 +00002591 btreeParseCellPtr(pPage, pCell, &info);
danielk1977687566d2004-11-02 12:56:41 +00002592 if( info.iOverflow ){
2593 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2594 put4byte(&pCell[info.iOverflow], iTo);
2595 break;
2596 }
2597 }
2598 }else{
2599 if( get4byte(pCell)==iFrom ){
2600 put4byte(pCell, iTo);
2601 break;
2602 }
2603 }
2604 }
2605
2606 if( i==nCell ){
danielk1977fdb7cdb2005-01-17 02:12:18 +00002607 if( eType!=PTRMAP_BTREE ||
2608 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
drh49285702005-09-17 15:20:26 +00002609 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002610 }
danielk1977687566d2004-11-02 12:56:41 +00002611 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2612 }
2613
2614 pPage->isInit = isInitOrig;
2615 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002616 return SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002617}
2618
danielk1977003ba062004-11-04 02:57:33 +00002619
danielk19777701e812005-01-10 12:59:51 +00002620/*
2621** Move the open database page pDbPage to location iFreePage in the
2622** database. The pDbPage reference remains valid.
drhe64ca7b2009-07-16 18:21:17 +00002623**
2624** The isCommit flag indicates that there is no need to remember that
2625** the journal needs to be sync()ed before database page pDbPage->pgno
2626** can be written to. The caller has already promised not to write to that
2627** page.
danielk19777701e812005-01-10 12:59:51 +00002628*/
danielk1977003ba062004-11-04 02:57:33 +00002629static int relocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00002630 BtShared *pBt, /* Btree */
danielk19777701e812005-01-10 12:59:51 +00002631 MemPage *pDbPage, /* Open page to move */
2632 u8 eType, /* Pointer map 'type' entry for pDbPage */
2633 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
danielk19774c999992008-07-16 18:17:55 +00002634 Pgno iFreePage, /* The location to move pDbPage to */
drhe64ca7b2009-07-16 18:21:17 +00002635 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
danielk1977003ba062004-11-04 02:57:33 +00002636){
2637 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2638 Pgno iDbPage = pDbPage->pgno;
2639 Pager *pPager = pBt->pPager;
2640 int rc;
2641
danielk1977a0bf2652004-11-04 14:30:04 +00002642 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2643 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
drh1fee73e2007-08-29 04:00:57 +00002644 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +00002645 assert( pDbPage->pBt==pBt );
danielk1977003ba062004-11-04 02:57:33 +00002646
drh85b623f2007-12-13 21:54:09 +00002647 /* Move page iDbPage from its current location to page number iFreePage */
danielk1977003ba062004-11-04 02:57:33 +00002648 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2649 iDbPage, iFreePage, iPtrPage, eType));
danielk19774c999992008-07-16 18:17:55 +00002650 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
danielk1977003ba062004-11-04 02:57:33 +00002651 if( rc!=SQLITE_OK ){
2652 return rc;
2653 }
2654 pDbPage->pgno = iFreePage;
2655
2656 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2657 ** that point to overflow pages. The pointer map entries for all these
2658 ** pages need to be changed.
2659 **
2660 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2661 ** pointer to a subsequent overflow page. If this is the case, then
2662 ** the pointer map needs to be updated for the subsequent overflow page.
2663 */
danielk1977a0bf2652004-11-04 14:30:04 +00002664 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00002665 rc = setChildPtrmaps(pDbPage);
2666 if( rc!=SQLITE_OK ){
2667 return rc;
2668 }
2669 }else{
2670 Pgno nextOvfl = get4byte(pDbPage->aData);
2671 if( nextOvfl!=0 ){
drh98add2e2009-07-20 17:11:49 +00002672 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
danielk1977003ba062004-11-04 02:57:33 +00002673 if( rc!=SQLITE_OK ){
2674 return rc;
2675 }
2676 }
2677 }
2678
2679 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2680 ** that it points at iFreePage. Also fix the pointer map entry for
2681 ** iPtrPage.
2682 */
danielk1977a0bf2652004-11-04 14:30:04 +00002683 if( eType!=PTRMAP_ROOTPAGE ){
danielk197730548662009-07-09 05:07:37 +00002684 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00002685 if( rc!=SQLITE_OK ){
2686 return rc;
2687 }
danielk19773b8a05f2007-03-19 17:44:26 +00002688 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
danielk1977a0bf2652004-11-04 14:30:04 +00002689 if( rc!=SQLITE_OK ){
2690 releasePage(pPtrPage);
2691 return rc;
2692 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002693 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
danielk1977003ba062004-11-04 02:57:33 +00002694 releasePage(pPtrPage);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002695 if( rc==SQLITE_OK ){
drh98add2e2009-07-20 17:11:49 +00002696 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002697 }
danielk1977003ba062004-11-04 02:57:33 +00002698 }
danielk1977003ba062004-11-04 02:57:33 +00002699 return rc;
2700}
2701
danielk1977dddbcdc2007-04-26 14:42:34 +00002702/* Forward declaration required by incrVacuumStep(). */
drh4f0c5872007-03-26 22:05:01 +00002703static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
danielk1977687566d2004-11-02 12:56:41 +00002704
2705/*
danielk1977dddbcdc2007-04-26 14:42:34 +00002706** Perform a single step of an incremental-vacuum. If successful,
2707** return SQLITE_OK. If there is no work to do (and therefore no
2708** point in calling this function again), return SQLITE_DONE.
2709**
2710** More specificly, this function attempts to re-organize the
2711** database so that the last page of the file currently in use
2712** is no longer in use.
2713**
drhea8ffdf2009-07-22 00:35:23 +00002714** If the nFin parameter is non-zero, this function assumes
danielk1977dddbcdc2007-04-26 14:42:34 +00002715** that the caller will keep calling incrVacuumStep() until
2716** it returns SQLITE_DONE or an error, and that nFin is the
2717** number of pages the database file will contain after this
drhea8ffdf2009-07-22 00:35:23 +00002718** process is complete. If nFin is zero, it is assumed that
2719** incrVacuumStep() will be called a finite amount of times
2720** which may or may not empty the freelist. A full autovacuum
2721** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0.
danielk1977dddbcdc2007-04-26 14:42:34 +00002722*/
danielk19773460d192008-12-27 15:23:13 +00002723static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
danielk1977dddbcdc2007-04-26 14:42:34 +00002724 Pgno nFreeList; /* Number of pages still on the free-list */
2725
drh1fee73e2007-08-29 04:00:57 +00002726 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977fa542f12009-04-02 18:28:08 +00002727 assert( iLastPg>nFin );
danielk1977dddbcdc2007-04-26 14:42:34 +00002728
2729 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2730 int rc;
2731 u8 eType;
2732 Pgno iPtrPage;
2733
2734 nFreeList = get4byte(&pBt->pPage1->aData[36]);
danielk1977fa542f12009-04-02 18:28:08 +00002735 if( nFreeList==0 ){
danielk1977dddbcdc2007-04-26 14:42:34 +00002736 return SQLITE_DONE;
2737 }
2738
2739 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2740 if( rc!=SQLITE_OK ){
2741 return rc;
2742 }
2743 if( eType==PTRMAP_ROOTPAGE ){
2744 return SQLITE_CORRUPT_BKPT;
2745 }
2746
2747 if( eType==PTRMAP_FREEPAGE ){
2748 if( nFin==0 ){
2749 /* Remove the page from the files free-list. This is not required
danielk19774ef24492007-05-23 09:52:41 +00002750 ** if nFin is non-zero. In that case, the free-list will be
danielk1977dddbcdc2007-04-26 14:42:34 +00002751 ** truncated to zero after this function returns, so it doesn't
2752 ** matter if it still contains some garbage entries.
2753 */
2754 Pgno iFreePg;
2755 MemPage *pFreePg;
2756 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2757 if( rc!=SQLITE_OK ){
2758 return rc;
2759 }
2760 assert( iFreePg==iLastPg );
2761 releasePage(pFreePg);
2762 }
2763 } else {
2764 Pgno iFreePg; /* Index of free page to move pLastPg to */
2765 MemPage *pLastPg;
2766
danielk197730548662009-07-09 05:07:37 +00002767 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002768 if( rc!=SQLITE_OK ){
2769 return rc;
2770 }
2771
danielk1977b4626a32007-04-28 15:47:43 +00002772 /* If nFin is zero, this loop runs exactly once and page pLastPg
2773 ** is swapped with the first free page pulled off the free list.
2774 **
2775 ** On the other hand, if nFin is greater than zero, then keep
2776 ** looping until a free-page located within the first nFin pages
2777 ** of the file is found.
2778 */
danielk1977dddbcdc2007-04-26 14:42:34 +00002779 do {
2780 MemPage *pFreePg;
2781 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2782 if( rc!=SQLITE_OK ){
2783 releasePage(pLastPg);
2784 return rc;
2785 }
2786 releasePage(pFreePg);
2787 }while( nFin!=0 && iFreePg>nFin );
2788 assert( iFreePg<iLastPg );
danielk1977b4626a32007-04-28 15:47:43 +00002789
2790 rc = sqlite3PagerWrite(pLastPg->pDbPage);
danielk1977662278e2007-11-05 15:30:12 +00002791 if( rc==SQLITE_OK ){
danielk19774c999992008-07-16 18:17:55 +00002792 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
danielk1977662278e2007-11-05 15:30:12 +00002793 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002794 releasePage(pLastPg);
2795 if( rc!=SQLITE_OK ){
2796 return rc;
danielk1977662278e2007-11-05 15:30:12 +00002797 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002798 }
2799 }
2800
danielk19773460d192008-12-27 15:23:13 +00002801 if( nFin==0 ){
2802 iLastPg--;
2803 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
danielk1977f4027782009-03-30 18:50:04 +00002804 if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2805 MemPage *pPg;
danielk197730548662009-07-09 05:07:37 +00002806 int rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
danielk1977f4027782009-03-30 18:50:04 +00002807 if( rc!=SQLITE_OK ){
2808 return rc;
2809 }
2810 rc = sqlite3PagerWrite(pPg->pDbPage);
2811 releasePage(pPg);
2812 if( rc!=SQLITE_OK ){
2813 return rc;
2814 }
2815 }
danielk19773460d192008-12-27 15:23:13 +00002816 iLastPg--;
2817 }
2818 sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
danielk1977dddbcdc2007-04-26 14:42:34 +00002819 }
2820 return SQLITE_OK;
2821}
2822
2823/*
2824** A write-transaction must be opened before calling this function.
2825** It performs a single unit of work towards an incremental vacuum.
2826**
2827** If the incremental vacuum is finished after this function has run,
shanebe217792009-03-05 04:20:31 +00002828** SQLITE_DONE is returned. If it is not finished, but no error occurred,
danielk1977dddbcdc2007-04-26 14:42:34 +00002829** SQLITE_OK is returned. Otherwise an SQLite error code.
2830*/
2831int sqlite3BtreeIncrVacuum(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00002832 int rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002833 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002834
2835 sqlite3BtreeEnter(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00002836 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2837 if( !pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00002838 rc = SQLITE_DONE;
2839 }else{
2840 invalidateAllOverflowCache(pBt);
danielk1977bea2a942009-01-20 17:06:27 +00002841 rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt));
danielk1977dddbcdc2007-04-26 14:42:34 +00002842 }
drhd677b3d2007-08-20 22:48:41 +00002843 sqlite3BtreeLeave(p);
2844 return rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002845}
2846
2847/*
danielk19773b8a05f2007-03-19 17:44:26 +00002848** This routine is called prior to sqlite3PagerCommit when a transaction
danielk1977687566d2004-11-02 12:56:41 +00002849** is commited for an auto-vacuum database.
danielk197724168722007-04-02 05:07:47 +00002850**
2851** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2852** the database file should be truncated to during the commit process.
2853** i.e. the database has been reorganized so that only the first *pnTrunc
2854** pages are in use.
danielk1977687566d2004-11-02 12:56:41 +00002855*/
danielk19773460d192008-12-27 15:23:13 +00002856static int autoVacuumCommit(BtShared *pBt){
danielk1977dddbcdc2007-04-26 14:42:34 +00002857 int rc = SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002858 Pager *pPager = pBt->pPager;
drhf94a1732008-09-30 17:18:17 +00002859 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002860
drh1fee73e2007-08-29 04:00:57 +00002861 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +00002862 invalidateAllOverflowCache(pBt);
danielk1977dddbcdc2007-04-26 14:42:34 +00002863 assert(pBt->autoVacuum);
2864 if( !pBt->incrVacuum ){
drhea8ffdf2009-07-22 00:35:23 +00002865 Pgno nFin; /* Number of pages in database after autovacuuming */
2866 Pgno nFree; /* Number of pages on the freelist initially */
drh41d628c2009-07-11 17:04:08 +00002867 Pgno nPtrmap; /* Number of PtrMap pages to be freed */
2868 Pgno iFree; /* The next page to be freed */
2869 int nEntry; /* Number of entries on one ptrmap page */
2870 Pgno nOrig; /* Database size before freeing */
danielk1977687566d2004-11-02 12:56:41 +00002871
drh41d628c2009-07-11 17:04:08 +00002872 nOrig = pagerPagecount(pBt);
danielk1977ef165ce2009-04-06 17:50:03 +00002873 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
2874 /* It is not possible to create a database for which the final page
2875 ** is either a pointer-map page or the pending-byte page. If one
2876 ** is encountered, this indicates corruption.
2877 */
danielk19773460d192008-12-27 15:23:13 +00002878 return SQLITE_CORRUPT_BKPT;
2879 }
danielk1977ef165ce2009-04-06 17:50:03 +00002880
danielk19773460d192008-12-27 15:23:13 +00002881 nFree = get4byte(&pBt->pPage1->aData[36]);
drh41d628c2009-07-11 17:04:08 +00002882 nEntry = pBt->usableSize/5;
2883 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
danielk19773460d192008-12-27 15:23:13 +00002884 nFin = nOrig - nFree - nPtrmap;
danielk1977ef165ce2009-04-06 17:50:03 +00002885 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
danielk19773460d192008-12-27 15:23:13 +00002886 nFin--;
2887 }
2888 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2889 nFin--;
danielk1977dddbcdc2007-04-26 14:42:34 +00002890 }
drhc5e47ac2009-06-04 00:11:56 +00002891 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
danielk1977687566d2004-11-02 12:56:41 +00002892
danielk19773460d192008-12-27 15:23:13 +00002893 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
2894 rc = incrVacuumStep(pBt, nFin, iFree);
danielk1977dddbcdc2007-04-26 14:42:34 +00002895 }
danielk19773460d192008-12-27 15:23:13 +00002896 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
danielk1977dddbcdc2007-04-26 14:42:34 +00002897 rc = SQLITE_OK;
danielk19773460d192008-12-27 15:23:13 +00002898 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2899 put4byte(&pBt->pPage1->aData[32], 0);
2900 put4byte(&pBt->pPage1->aData[36], 0);
2901 sqlite3PagerTruncateImage(pBt->pPager, nFin);
danielk1977dddbcdc2007-04-26 14:42:34 +00002902 }
2903 if( rc!=SQLITE_OK ){
2904 sqlite3PagerRollback(pPager);
2905 }
danielk1977687566d2004-11-02 12:56:41 +00002906 }
2907
danielk19773b8a05f2007-03-19 17:44:26 +00002908 assert( nRef==sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002909 return rc;
2910}
danielk1977dddbcdc2007-04-26 14:42:34 +00002911
danielk1977a50d9aa2009-06-08 14:49:45 +00002912#else /* ifndef SQLITE_OMIT_AUTOVACUUM */
2913# define setChildPtrmaps(x) SQLITE_OK
2914#endif
danielk1977687566d2004-11-02 12:56:41 +00002915
2916/*
drh80e35f42007-03-30 14:06:34 +00002917** This routine does the first phase of a two-phase commit. This routine
2918** causes a rollback journal to be created (if it does not already exist)
2919** and populated with enough information so that if a power loss occurs
2920** the database can be restored to its original state by playing back
2921** the journal. Then the contents of the journal are flushed out to
2922** the disk. After the journal is safely on oxide, the changes to the
2923** database are written into the database file and flushed to oxide.
2924** At the end of this call, the rollback journal still exists on the
2925** disk and we are still holding all locks, so the transaction has not
drh51898cf2009-04-19 20:51:06 +00002926** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
drh80e35f42007-03-30 14:06:34 +00002927** commit process.
2928**
2929** This call is a no-op if no write-transaction is currently active on pBt.
2930**
2931** Otherwise, sync the database file for the btree pBt. zMaster points to
2932** the name of a master journal file that should be written into the
2933** individual journal file, or is NULL, indicating no master journal file
2934** (single database transaction).
2935**
2936** When this is called, the master journal should already have been
2937** created, populated with this journal pointer and synced to disk.
2938**
2939** Once this is routine has returned, the only thing required to commit
2940** the write-transaction for this database file is to delete the journal.
2941*/
2942int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2943 int rc = SQLITE_OK;
2944 if( p->inTrans==TRANS_WRITE ){
2945 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002946 sqlite3BtreeEnter(p);
drh80e35f42007-03-30 14:06:34 +00002947#ifndef SQLITE_OMIT_AUTOVACUUM
2948 if( pBt->autoVacuum ){
danielk19773460d192008-12-27 15:23:13 +00002949 rc = autoVacuumCommit(pBt);
drh80e35f42007-03-30 14:06:34 +00002950 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002951 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002952 return rc;
2953 }
2954 }
2955#endif
drh49b9d332009-01-02 18:10:42 +00002956 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
drhd677b3d2007-08-20 22:48:41 +00002957 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002958 }
2959 return rc;
2960}
2961
2962/*
danielk197794b30732009-07-02 17:21:57 +00002963** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
2964** at the conclusion of a transaction.
2965*/
2966static void btreeEndTransaction(Btree *p){
2967 BtShared *pBt = p->pBt;
danielk197794b30732009-07-02 17:21:57 +00002968 assert( sqlite3BtreeHoldsMutex(p) );
2969
danielk197794b30732009-07-02 17:21:57 +00002970 btreeClearHasContent(pBt);
danfa401de2009-10-16 14:55:03 +00002971 if( p->inTrans>TRANS_NONE && p->db->activeVdbeCnt>1 ){
2972 /* If there are other active statements that belong to this database
2973 ** handle, downgrade to a read-only transaction. The other statements
2974 ** may still be reading from the database. */
danielk197794b30732009-07-02 17:21:57 +00002975 downgradeAllSharedCacheTableLocks(p);
2976 p->inTrans = TRANS_READ;
2977 }else{
2978 /* If the handle had any kind of transaction open, decrement the
2979 ** transaction count of the shared btree. If the transaction count
2980 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
2981 ** call below will unlock the pager. */
2982 if( p->inTrans!=TRANS_NONE ){
2983 clearAllSharedCacheTableLocks(p);
2984 pBt->nTransaction--;
2985 if( 0==pBt->nTransaction ){
2986 pBt->inTransaction = TRANS_NONE;
2987 }
2988 }
2989
2990 /* Set the current transaction state to TRANS_NONE and unlock the
2991 ** pager if this call closed the only read or write transaction. */
2992 p->inTrans = TRANS_NONE;
2993 unlockBtreeIfUnused(pBt);
2994 }
2995
2996 btreeIntegrity(p);
2997}
2998
2999/*
drh2aa679f2001-06-25 02:11:07 +00003000** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00003001**
drh6e345992007-03-30 11:12:08 +00003002** This routine implements the second phase of a 2-phase commit. The
drh51898cf2009-04-19 20:51:06 +00003003** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3004** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
3005** routine did all the work of writing information out to disk and flushing the
drh6e345992007-03-30 11:12:08 +00003006** contents so that they are written onto the disk platter. All this
drh51898cf2009-04-19 20:51:06 +00003007** routine has to do is delete or truncate or zero the header in the
3008** the rollback journal (which causes the transaction to commit) and
3009** drop locks.
drh6e345992007-03-30 11:12:08 +00003010**
drh5e00f6c2001-09-13 13:46:56 +00003011** This will release the write lock on the database file. If there
3012** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00003013*/
drh80e35f42007-03-30 14:06:34 +00003014int sqlite3BtreeCommitPhaseTwo(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00003015 BtShared *pBt = p->pBt;
3016
drhd677b3d2007-08-20 22:48:41 +00003017 sqlite3BtreeEnter(p);
danielk1977aef0bf62005-12-30 16:28:01 +00003018 btreeIntegrity(p);
danielk1977aef0bf62005-12-30 16:28:01 +00003019
3020 /* If the handle has a write-transaction open, commit the shared-btrees
3021 ** transaction and set the shared state to TRANS_READ.
3022 */
3023 if( p->inTrans==TRANS_WRITE ){
danielk19777f7bc662006-01-23 13:47:47 +00003024 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00003025 assert( pBt->inTransaction==TRANS_WRITE );
3026 assert( pBt->nTransaction>0 );
drh80e35f42007-03-30 14:06:34 +00003027 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
danielk19777f7bc662006-01-23 13:47:47 +00003028 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00003029 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00003030 return rc;
3031 }
danielk1977aef0bf62005-12-30 16:28:01 +00003032 pBt->inTransaction = TRANS_READ;
danielk1977ee5741e2004-05-31 10:01:34 +00003033 }
danielk1977aef0bf62005-12-30 16:28:01 +00003034
danielk197794b30732009-07-02 17:21:57 +00003035 btreeEndTransaction(p);
drhd677b3d2007-08-20 22:48:41 +00003036 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00003037 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00003038}
3039
drh80e35f42007-03-30 14:06:34 +00003040/*
3041** Do both phases of a commit.
3042*/
3043int sqlite3BtreeCommit(Btree *p){
3044 int rc;
drhd677b3d2007-08-20 22:48:41 +00003045 sqlite3BtreeEnter(p);
drh80e35f42007-03-30 14:06:34 +00003046 rc = sqlite3BtreeCommitPhaseOne(p, 0);
3047 if( rc==SQLITE_OK ){
3048 rc = sqlite3BtreeCommitPhaseTwo(p);
3049 }
drhd677b3d2007-08-20 22:48:41 +00003050 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00003051 return rc;
3052}
3053
danielk1977fbcd5852004-06-15 02:44:18 +00003054#ifndef NDEBUG
3055/*
3056** Return the number of write-cursors open on this handle. This is for use
3057** in assert() expressions, so it is only compiled if NDEBUG is not
3058** defined.
drhfb982642007-08-30 01:19:59 +00003059**
3060** For the purposes of this routine, a write-cursor is any cursor that
3061** is capable of writing to the databse. That means the cursor was
3062** originally opened for writing and the cursor has not be disabled
3063** by having its state changed to CURSOR_FAULT.
danielk1977fbcd5852004-06-15 02:44:18 +00003064*/
danielk1977aef0bf62005-12-30 16:28:01 +00003065static int countWriteCursors(BtShared *pBt){
danielk1977fbcd5852004-06-15 02:44:18 +00003066 BtCursor *pCur;
3067 int r = 0;
3068 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drhfb982642007-08-30 01:19:59 +00003069 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
danielk1977fbcd5852004-06-15 02:44:18 +00003070 }
3071 return r;
3072}
3073#endif
3074
drhc39e0002004-05-07 23:50:57 +00003075/*
drhfb982642007-08-30 01:19:59 +00003076** This routine sets the state to CURSOR_FAULT and the error
3077** code to errCode for every cursor on BtShared that pBtree
3078** references.
3079**
3080** Every cursor is tripped, including cursors that belong
3081** to other database connections that happen to be sharing
3082** the cache with pBtree.
3083**
3084** This routine gets called when a rollback occurs.
3085** All cursors using the same cache must be tripped
3086** to prevent them from trying to use the btree after
3087** the rollback. The rollback may have deleted tables
3088** or moved root pages, so it is not sufficient to
3089** save the state of the cursor. The cursor must be
3090** invalidated.
3091*/
3092void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3093 BtCursor *p;
3094 sqlite3BtreeEnter(pBtree);
3095 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
danielk1977bc2ca9e2008-11-13 14:28:28 +00003096 int i;
danielk1977be51a652008-10-08 17:58:48 +00003097 sqlite3BtreeClearCursor(p);
drhfb982642007-08-30 01:19:59 +00003098 p->eState = CURSOR_FAULT;
drh4c301aa2009-07-15 17:25:45 +00003099 p->skipNext = errCode;
danielk1977bc2ca9e2008-11-13 14:28:28 +00003100 for(i=0; i<=p->iPage; i++){
3101 releasePage(p->apPage[i]);
3102 p->apPage[i] = 0;
3103 }
drhfb982642007-08-30 01:19:59 +00003104 }
3105 sqlite3BtreeLeave(pBtree);
3106}
3107
3108/*
drhecdc7532001-09-23 02:35:53 +00003109** Rollback the transaction in progress. All cursors will be
3110** invalided by this operation. Any attempt to use a cursor
3111** that was open at the beginning of this operation will result
3112** in an error.
drh5e00f6c2001-09-13 13:46:56 +00003113**
3114** This will release the write lock on the database file. If there
3115** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00003116*/
danielk1977aef0bf62005-12-30 16:28:01 +00003117int sqlite3BtreeRollback(Btree *p){
danielk19778d34dfd2006-01-24 16:37:57 +00003118 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00003119 BtShared *pBt = p->pBt;
drh24cd67e2004-05-10 16:18:47 +00003120 MemPage *pPage1;
danielk1977aef0bf62005-12-30 16:28:01 +00003121
drhd677b3d2007-08-20 22:48:41 +00003122 sqlite3BtreeEnter(p);
danielk19772b8c13e2006-01-24 14:21:24 +00003123 rc = saveAllCursors(pBt, 0, 0);
danielk19778d34dfd2006-01-24 16:37:57 +00003124#ifndef SQLITE_OMIT_SHARED_CACHE
danielk19772b8c13e2006-01-24 14:21:24 +00003125 if( rc!=SQLITE_OK ){
shanebe217792009-03-05 04:20:31 +00003126 /* This is a horrible situation. An IO or malloc() error occurred whilst
danielk19778d34dfd2006-01-24 16:37:57 +00003127 ** trying to save cursor positions. If this is an automatic rollback (as
3128 ** the result of a constraint, malloc() failure or IO error) then
3129 ** the cache may be internally inconsistent (not contain valid trees) so
3130 ** we cannot simply return the error to the caller. Instead, abort
3131 ** all queries that may be using any of the cursors that failed to save.
3132 */
drhfb982642007-08-30 01:19:59 +00003133 sqlite3BtreeTripAllCursors(p, rc);
danielk19772b8c13e2006-01-24 14:21:24 +00003134 }
danielk19778d34dfd2006-01-24 16:37:57 +00003135#endif
danielk1977aef0bf62005-12-30 16:28:01 +00003136 btreeIntegrity(p);
danielk1977aef0bf62005-12-30 16:28:01 +00003137
3138 if( p->inTrans==TRANS_WRITE ){
danielk19778d34dfd2006-01-24 16:37:57 +00003139 int rc2;
danielk1977aef0bf62005-12-30 16:28:01 +00003140
danielk19778d34dfd2006-01-24 16:37:57 +00003141 assert( TRANS_WRITE==pBt->inTransaction );
danielk19773b8a05f2007-03-19 17:44:26 +00003142 rc2 = sqlite3PagerRollback(pBt->pPager);
danielk19778d34dfd2006-01-24 16:37:57 +00003143 if( rc2!=SQLITE_OK ){
3144 rc = rc2;
3145 }
3146
drh24cd67e2004-05-10 16:18:47 +00003147 /* The rollback may have destroyed the pPage1->aData value. So
danielk197730548662009-07-09 05:07:37 +00003148 ** call btreeGetPage() on page 1 again to make
drh16a9b832007-05-05 18:39:25 +00003149 ** sure pPage1->aData is set correctly. */
danielk197730548662009-07-09 05:07:37 +00003150 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
drh24cd67e2004-05-10 16:18:47 +00003151 releasePage(pPage1);
3152 }
danielk1977fbcd5852004-06-15 02:44:18 +00003153 assert( countWriteCursors(pBt)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00003154 pBt->inTransaction = TRANS_READ;
drh24cd67e2004-05-10 16:18:47 +00003155 }
danielk1977aef0bf62005-12-30 16:28:01 +00003156
danielk197794b30732009-07-02 17:21:57 +00003157 btreeEndTransaction(p);
drhd677b3d2007-08-20 22:48:41 +00003158 sqlite3BtreeLeave(p);
drha059ad02001-04-17 20:09:11 +00003159 return rc;
3160}
3161
3162/*
danielk1977bd434552009-03-18 10:33:00 +00003163** Start a statement subtransaction. The subtransaction can can be rolled
3164** back independently of the main transaction. You must start a transaction
3165** before starting a subtransaction. The subtransaction is ended automatically
3166** if the main transaction commits or rolls back.
drhab01f612004-05-22 02:55:23 +00003167**
3168** Statement subtransactions are used around individual SQL statements
3169** that are contained within a BEGIN...COMMIT block. If a constraint
3170** error occurs within the statement, the effect of that one statement
3171** can be rolled back without having to rollback the entire transaction.
danielk1977bd434552009-03-18 10:33:00 +00003172**
3173** A statement sub-transaction is implemented as an anonymous savepoint. The
3174** value passed as the second parameter is the total number of savepoints,
3175** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3176** are no active savepoints and no other statement-transactions open,
3177** iStatement is 1. This anonymous savepoint can be released or rolled back
3178** using the sqlite3BtreeSavepoint() function.
drh663fc632002-02-02 18:49:19 +00003179*/
danielk1977bd434552009-03-18 10:33:00 +00003180int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
drh663fc632002-02-02 18:49:19 +00003181 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00003182 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00003183 sqlite3BtreeEnter(p);
drh64022502009-01-09 14:11:04 +00003184 assert( p->inTrans==TRANS_WRITE );
drh64022502009-01-09 14:11:04 +00003185 assert( pBt->readOnly==0 );
danielk1977bd434552009-03-18 10:33:00 +00003186 assert( iStatement>0 );
3187 assert( iStatement>p->db->nSavepoint );
3188 if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){
drh64022502009-01-09 14:11:04 +00003189 rc = SQLITE_INTERNAL;
drhd677b3d2007-08-20 22:48:41 +00003190 }else{
3191 assert( pBt->inTransaction==TRANS_WRITE );
drh64022502009-01-09 14:11:04 +00003192 /* At the pager level, a statement transaction is a savepoint with
3193 ** an index greater than all savepoints created explicitly using
3194 ** SQL statements. It is illegal to open, release or rollback any
3195 ** such savepoints while the statement transaction savepoint is active.
3196 */
danielk1977bd434552009-03-18 10:33:00 +00003197 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
danielk197797a227c2006-01-20 16:32:04 +00003198 }
drhd677b3d2007-08-20 22:48:41 +00003199 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00003200 return rc;
3201}
3202
3203/*
danielk1977fd7f0452008-12-17 17:30:26 +00003204** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3205** or SAVEPOINT_RELEASE. This function either releases or rolls back the
danielk197712dd5492008-12-18 15:45:07 +00003206** savepoint identified by parameter iSavepoint, depending on the value
3207** of op.
3208**
3209** Normally, iSavepoint is greater than or equal to zero. However, if op is
3210** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3211** contents of the entire transaction are rolled back. This is different
3212** from a normal transaction rollback, as no locks are released and the
3213** transaction remains open.
danielk1977fd7f0452008-12-17 17:30:26 +00003214*/
3215int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3216 int rc = SQLITE_OK;
3217 if( p && p->inTrans==TRANS_WRITE ){
3218 BtShared *pBt = p->pBt;
danielk1977fd7f0452008-12-17 17:30:26 +00003219 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3220 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3221 sqlite3BtreeEnter(p);
danielk1977fd7f0452008-12-17 17:30:26 +00003222 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
drh9f0bbf92009-01-02 21:08:09 +00003223 if( rc==SQLITE_OK ){
3224 rc = newDatabase(pBt);
3225 }
danielk1977fd7f0452008-12-17 17:30:26 +00003226 sqlite3BtreeLeave(p);
3227 }
3228 return rc;
3229}
3230
3231/*
drh8b2f49b2001-06-08 00:21:52 +00003232** Create a new cursor for the BTree whose root is on the page
danielk19773e8add92009-07-04 17:16:00 +00003233** iTable. If a read-only cursor is requested, it is assumed that
3234** the caller already has at least a read-only transaction open
3235** on the database already. If a write-cursor is requested, then
3236** the caller is assumed to have an open write transaction.
drh1bee3d72001-10-15 00:44:35 +00003237**
3238** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00003239** If wrFlag==1, then the cursor can be used for reading or for
3240** writing if other conditions for writing are also met. These
3241** are the conditions that must be met in order for writing to
3242** be allowed:
drh6446c4d2001-12-15 14:22:18 +00003243**
drhf74b8d92002-09-01 23:20:45 +00003244** 1: The cursor must have been opened with wrFlag==1
3245**
drhfe5d71d2007-03-19 11:54:10 +00003246** 2: Other database connections that share the same pager cache
3247** but which are not in the READ_UNCOMMITTED state may not have
3248** cursors open with wrFlag==0 on the same table. Otherwise
3249** the changes made by this write cursor would be visible to
3250** the read cursors in the other database connection.
drhf74b8d92002-09-01 23:20:45 +00003251**
3252** 3: The database must be writable (not on read-only media)
3253**
3254** 4: There must be an active transaction.
3255**
drh6446c4d2001-12-15 14:22:18 +00003256** No checking is done to make sure that page iTable really is the
3257** root page of a b-tree. If it is not, then the cursor acquired
3258** will not work correctly.
danielk197771d5d2c2008-09-29 11:49:47 +00003259**
3260** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
3261** pointed to by pCur have been zeroed by the caller.
drha059ad02001-04-17 20:09:11 +00003262*/
drhd677b3d2007-08-20 22:48:41 +00003263static int btreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00003264 Btree *p, /* The btree */
3265 int iTable, /* Root page of table to open */
3266 int wrFlag, /* 1 to write. 0 read-only */
3267 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
3268 BtCursor *pCur /* Space for new cursor */
drh3aac2dd2004-04-26 14:10:20 +00003269){
danielk19773e8add92009-07-04 17:16:00 +00003270 BtShared *pBt = p->pBt; /* Shared b-tree handle */
drhecdc7532001-09-23 02:35:53 +00003271
drh1fee73e2007-08-29 04:00:57 +00003272 assert( sqlite3BtreeHoldsMutex(p) );
drhf49661a2008-12-10 16:45:50 +00003273 assert( wrFlag==0 || wrFlag==1 );
danielk197796d48e92009-06-29 06:00:37 +00003274
danielk1977602b4662009-07-02 07:47:33 +00003275 /* The following assert statements verify that if this is a sharable
3276 ** b-tree database, the connection is holding the required table locks,
3277 ** and that no other connection has any open cursor that conflicts with
3278 ** this lock. */
3279 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
danielk197796d48e92009-06-29 06:00:37 +00003280 assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3281
danielk19773e8add92009-07-04 17:16:00 +00003282 /* Assert that the caller has opened the required transaction. */
3283 assert( p->inTrans>TRANS_NONE );
3284 assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3285 assert( pBt->pPage1 && pBt->pPage1->aData );
3286
danielk197796d48e92009-06-29 06:00:37 +00003287 if( NEVER(wrFlag && pBt->readOnly) ){
3288 return SQLITE_READONLY;
drha0c9a112004-03-10 13:42:37 +00003289 }
danielk19773e8add92009-07-04 17:16:00 +00003290 if( iTable==1 && pagerPagecount(pBt)==0 ){
3291 return SQLITE_EMPTY;
3292 }
danielk1977aef0bf62005-12-30 16:28:01 +00003293
danielk1977aef0bf62005-12-30 16:28:01 +00003294 /* Now that no other errors can occur, finish filling in the BtCursor
danielk19773e8add92009-07-04 17:16:00 +00003295 ** variables and link the cursor into the BtShared list. */
danielk1977172114a2009-07-07 15:47:12 +00003296 pCur->pgnoRoot = (Pgno)iTable;
3297 pCur->iPage = -1;
drh1e968a02008-03-25 00:22:21 +00003298 pCur->pKeyInfo = pKeyInfo;
danielk1977aef0bf62005-12-30 16:28:01 +00003299 pCur->pBtree = p;
drhd0679ed2007-08-28 22:24:34 +00003300 pCur->pBt = pBt;
drhf49661a2008-12-10 16:45:50 +00003301 pCur->wrFlag = (u8)wrFlag;
drha059ad02001-04-17 20:09:11 +00003302 pCur->pNext = pBt->pCursor;
3303 if( pCur->pNext ){
3304 pCur->pNext->pPrev = pCur;
3305 }
3306 pBt->pCursor = pCur;
danielk1977da184232006-01-05 11:34:32 +00003307 pCur->eState = CURSOR_INVALID;
drh7f751222009-03-17 22:33:00 +00003308 pCur->cachedRowid = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00003309 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00003310}
drhd677b3d2007-08-20 22:48:41 +00003311int sqlite3BtreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00003312 Btree *p, /* The btree */
3313 int iTable, /* Root page of table to open */
3314 int wrFlag, /* 1 to write. 0 read-only */
3315 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
3316 BtCursor *pCur /* Write new cursor here */
drhd677b3d2007-08-20 22:48:41 +00003317){
3318 int rc;
3319 sqlite3BtreeEnter(p);
danielk1977cd3e8f72008-03-25 09:47:35 +00003320 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
drhd677b3d2007-08-20 22:48:41 +00003321 sqlite3BtreeLeave(p);
3322 return rc;
3323}
drh7f751222009-03-17 22:33:00 +00003324
3325/*
3326** Return the size of a BtCursor object in bytes.
3327**
3328** This interfaces is needed so that users of cursors can preallocate
3329** sufficient storage to hold a cursor. The BtCursor object is opaque
3330** to users so they cannot do the sizeof() themselves - they must call
3331** this routine.
3332*/
3333int sqlite3BtreeCursorSize(void){
danielk1977cd3e8f72008-03-25 09:47:35 +00003334 return sizeof(BtCursor);
3335}
3336
drh7f751222009-03-17 22:33:00 +00003337/*
3338** Set the cached rowid value of every cursor in the same database file
3339** as pCur and having the same root page number as pCur. The value is
3340** set to iRowid.
3341**
3342** Only positive rowid values are considered valid for this cache.
3343** The cache is initialized to zero, indicating an invalid cache.
3344** A btree will work fine with zero or negative rowids. We just cannot
3345** cache zero or negative rowids, which means tables that use zero or
3346** negative rowids might run a little slower. But in practice, zero
3347** or negative rowids are very uncommon so this should not be a problem.
3348*/
3349void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3350 BtCursor *p;
3351 for(p=pCur->pBt->pCursor; p; p=p->pNext){
3352 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3353 }
3354 assert( pCur->cachedRowid==iRowid );
3355}
drhd677b3d2007-08-20 22:48:41 +00003356
drh7f751222009-03-17 22:33:00 +00003357/*
3358** Return the cached rowid for the given cursor. A negative or zero
3359** return value indicates that the rowid cache is invalid and should be
3360** ignored. If the rowid cache has never before been set, then a
3361** zero is returned.
3362*/
3363sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3364 return pCur->cachedRowid;
3365}
drha059ad02001-04-17 20:09:11 +00003366
3367/*
drh5e00f6c2001-09-13 13:46:56 +00003368** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00003369** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00003370*/
drh3aac2dd2004-04-26 14:10:20 +00003371int sqlite3BtreeCloseCursor(BtCursor *pCur){
drhff0587c2007-08-29 17:43:19 +00003372 Btree *pBtree = pCur->pBtree;
danielk1977cd3e8f72008-03-25 09:47:35 +00003373 if( pBtree ){
danielk197771d5d2c2008-09-29 11:49:47 +00003374 int i;
danielk1977cd3e8f72008-03-25 09:47:35 +00003375 BtShared *pBt = pCur->pBt;
3376 sqlite3BtreeEnter(pBtree);
danielk1977be51a652008-10-08 17:58:48 +00003377 sqlite3BtreeClearCursor(pCur);
danielk1977cd3e8f72008-03-25 09:47:35 +00003378 if( pCur->pPrev ){
3379 pCur->pPrev->pNext = pCur->pNext;
3380 }else{
3381 pBt->pCursor = pCur->pNext;
3382 }
3383 if( pCur->pNext ){
3384 pCur->pNext->pPrev = pCur->pPrev;
3385 }
danielk197771d5d2c2008-09-29 11:49:47 +00003386 for(i=0; i<=pCur->iPage; i++){
3387 releasePage(pCur->apPage[i]);
3388 }
danielk1977cd3e8f72008-03-25 09:47:35 +00003389 unlockBtreeIfUnused(pBt);
3390 invalidateOverflowCache(pCur);
3391 /* sqlite3_free(pCur); */
3392 sqlite3BtreeLeave(pBtree);
drha059ad02001-04-17 20:09:11 +00003393 }
drh8c42ca92001-06-22 19:15:00 +00003394 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00003395}
3396
drh5e2f8b92001-05-28 00:41:15 +00003397/*
drh86057612007-06-26 01:04:48 +00003398** Make sure the BtCursor* given in the argument has a valid
3399** BtCursor.info structure. If it is not already valid, call
danielk197730548662009-07-09 05:07:37 +00003400** btreeParseCell() to fill it in.
drhab01f612004-05-22 02:55:23 +00003401**
3402** BtCursor.info is a cache of the information in the current cell.
danielk197730548662009-07-09 05:07:37 +00003403** Using this cache reduces the number of calls to btreeParseCell().
drh86057612007-06-26 01:04:48 +00003404**
3405** 2007-06-25: There is a bug in some versions of MSVC that cause the
3406** compiler to crash when getCellInfo() is implemented as a macro.
3407** But there is a measureable speed advantage to using the macro on gcc
3408** (when less compiler optimizations like -Os or -O0 are used and the
3409** compiler is not doing agressive inlining.) So we use a real function
3410** for MSVC and a macro for everything else. Ticket #2457.
drh9188b382004-05-14 21:12:22 +00003411*/
drh9188b382004-05-14 21:12:22 +00003412#ifndef NDEBUG
danielk19771cc5ed82007-05-16 17:28:43 +00003413 static void assertCellInfo(BtCursor *pCur){
drh9188b382004-05-14 21:12:22 +00003414 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00003415 int iPage = pCur->iPage;
drh51c6d962004-06-06 00:42:25 +00003416 memset(&info, 0, sizeof(info));
danielk197730548662009-07-09 05:07:37 +00003417 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
drh9188b382004-05-14 21:12:22 +00003418 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
drh9188b382004-05-14 21:12:22 +00003419 }
danielk19771cc5ed82007-05-16 17:28:43 +00003420#else
3421 #define assertCellInfo(x)
3422#endif
drh86057612007-06-26 01:04:48 +00003423#ifdef _MSC_VER
3424 /* Use a real function in MSVC to work around bugs in that compiler. */
3425 static void getCellInfo(BtCursor *pCur){
3426 if( pCur->info.nSize==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003427 int iPage = pCur->iPage;
danielk197730548662009-07-09 05:07:37 +00003428 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
drha2c20e42008-03-29 16:01:04 +00003429 pCur->validNKey = 1;
drh86057612007-06-26 01:04:48 +00003430 }else{
3431 assertCellInfo(pCur);
3432 }
3433 }
3434#else /* if not _MSC_VER */
3435 /* Use a macro in all other compilers so that the function is inlined */
danielk197771d5d2c2008-09-29 11:49:47 +00003436#define getCellInfo(pCur) \
3437 if( pCur->info.nSize==0 ){ \
3438 int iPage = pCur->iPage; \
danielk197730548662009-07-09 05:07:37 +00003439 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
danielk197771d5d2c2008-09-29 11:49:47 +00003440 pCur->validNKey = 1; \
3441 }else{ \
3442 assertCellInfo(pCur); \
drh86057612007-06-26 01:04:48 +00003443 }
3444#endif /* _MSC_VER */
drh9188b382004-05-14 21:12:22 +00003445
drhea8ffdf2009-07-22 00:35:23 +00003446#ifndef NDEBUG /* The next routine used only within assert() statements */
3447/*
3448** Return true if the given BtCursor is valid. A valid cursor is one
3449** that is currently pointing to a row in a (non-empty) table.
3450** This is a verification routine is used only within assert() statements.
3451*/
3452int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3453 return pCur && pCur->eState==CURSOR_VALID;
3454}
3455#endif /* NDEBUG */
3456
drh9188b382004-05-14 21:12:22 +00003457/*
drh3aac2dd2004-04-26 14:10:20 +00003458** Set *pSize to the size of the buffer needed to hold the value of
3459** the key for the current entry. If the cursor is not pointing
3460** to a valid entry, *pSize is set to 0.
3461**
drh4b70f112004-05-02 21:12:19 +00003462** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00003463** itself, not the number of bytes in the key.
drhea8ffdf2009-07-22 00:35:23 +00003464**
3465** The caller must position the cursor prior to invoking this routine.
3466**
3467** This routine cannot fail. It always returns SQLITE_OK.
drh7e3b0a02001-04-28 16:52:40 +00003468*/
drh4a1c3802004-05-12 15:15:47 +00003469int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
drh1fee73e2007-08-29 04:00:57 +00003470 assert( cursorHoldsMutex(pCur) );
drhea8ffdf2009-07-22 00:35:23 +00003471 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3472 if( pCur->eState!=CURSOR_VALID ){
3473 *pSize = 0;
3474 }else{
3475 getCellInfo(pCur);
3476 *pSize = pCur->info.nKey;
drh72f82862001-05-24 21:06:34 +00003477 }
drhea8ffdf2009-07-22 00:35:23 +00003478 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00003479}
drh2af926b2001-05-15 00:39:25 +00003480
drh72f82862001-05-24 21:06:34 +00003481/*
drh0e1c19e2004-05-11 00:58:56 +00003482** Set *pSize to the number of bytes of data in the entry the
drhea8ffdf2009-07-22 00:35:23 +00003483** cursor currently points to.
3484**
3485** The caller must guarantee that the cursor is pointing to a non-NULL
3486** valid entry. In other words, the calling procedure must guarantee
3487** that the cursor has Cursor.eState==CURSOR_VALID.
3488**
3489** Failure is not possible. This function always returns SQLITE_OK.
3490** It might just as well be a procedure (returning void) but we continue
3491** to return an integer result code for historical reasons.
drh0e1c19e2004-05-11 00:58:56 +00003492*/
3493int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drh1fee73e2007-08-29 04:00:57 +00003494 assert( cursorHoldsMutex(pCur) );
drhea8ffdf2009-07-22 00:35:23 +00003495 assert( pCur->eState==CURSOR_VALID );
3496 getCellInfo(pCur);
3497 *pSize = pCur->info.nData;
3498 return SQLITE_OK;
drh0e1c19e2004-05-11 00:58:56 +00003499}
3500
3501/*
danielk1977d04417962007-05-02 13:16:30 +00003502** Given the page number of an overflow page in the database (parameter
3503** ovfl), this function finds the page number of the next page in the
3504** linked list of overflow pages. If possible, it uses the auto-vacuum
3505** pointer-map data instead of reading the content of page ovfl to do so.
3506**
3507** If an error occurs an SQLite error code is returned. Otherwise:
3508**
danielk1977bea2a942009-01-20 17:06:27 +00003509** The page number of the next overflow page in the linked list is
3510** written to *pPgnoNext. If page ovfl is the last page in its linked
3511** list, *pPgnoNext is set to zero.
danielk1977d04417962007-05-02 13:16:30 +00003512**
danielk1977bea2a942009-01-20 17:06:27 +00003513** If ppPage is not NULL, and a reference to the MemPage object corresponding
3514** to page number pOvfl was obtained, then *ppPage is set to point to that
3515** reference. It is the responsibility of the caller to call releasePage()
3516** on *ppPage to free the reference. In no reference was obtained (because
3517** the pointer-map was used to obtain the value for *pPgnoNext), then
3518** *ppPage is set to zero.
danielk1977d04417962007-05-02 13:16:30 +00003519*/
3520static int getOverflowPage(
drhfa3be902009-07-07 02:44:07 +00003521 BtShared *pBt, /* The database file */
3522 Pgno ovfl, /* Current overflow page number */
danielk1977bea2a942009-01-20 17:06:27 +00003523 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
danielk1977d04417962007-05-02 13:16:30 +00003524 Pgno *pPgnoNext /* OUT: Next overflow page number */
3525){
3526 Pgno next = 0;
danielk1977bea2a942009-01-20 17:06:27 +00003527 MemPage *pPage = 0;
drh1bd10f82008-12-10 21:19:56 +00003528 int rc = SQLITE_OK;
danielk1977d04417962007-05-02 13:16:30 +00003529
drh1fee73e2007-08-29 04:00:57 +00003530 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977bea2a942009-01-20 17:06:27 +00003531 assert(pPgnoNext);
danielk1977d04417962007-05-02 13:16:30 +00003532
3533#ifndef SQLITE_OMIT_AUTOVACUUM
3534 /* Try to find the next page in the overflow list using the
3535 ** autovacuum pointer-map pages. Guess that the next page in
3536 ** the overflow list is page number (ovfl+1). If that guess turns
3537 ** out to be wrong, fall back to loading the data of page
3538 ** number ovfl to determine the next page number.
3539 */
3540 if( pBt->autoVacuum ){
3541 Pgno pgno;
3542 Pgno iGuess = ovfl+1;
3543 u8 eType;
3544
3545 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3546 iGuess++;
3547 }
3548
danielk197789d40042008-11-17 14:20:56 +00003549 if( iGuess<=pagerPagecount(pBt) ){
danielk1977d04417962007-05-02 13:16:30 +00003550 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
danielk1977bea2a942009-01-20 17:06:27 +00003551 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
danielk1977d04417962007-05-02 13:16:30 +00003552 next = iGuess;
danielk1977bea2a942009-01-20 17:06:27 +00003553 rc = SQLITE_DONE;
danielk1977d04417962007-05-02 13:16:30 +00003554 }
3555 }
3556 }
3557#endif
3558
danielk1977d8a3f3d2009-07-11 11:45:23 +00003559 assert( next==0 || rc==SQLITE_DONE );
danielk1977bea2a942009-01-20 17:06:27 +00003560 if( rc==SQLITE_OK ){
danielk197730548662009-07-09 05:07:37 +00003561 rc = btreeGetPage(pBt, ovfl, &pPage, 0);
danielk1977d8a3f3d2009-07-11 11:45:23 +00003562 assert( rc==SQLITE_OK || pPage==0 );
3563 if( rc==SQLITE_OK ){
danielk1977d04417962007-05-02 13:16:30 +00003564 next = get4byte(pPage->aData);
3565 }
danielk1977443c0592009-01-16 15:21:05 +00003566 }
danielk197745d68822009-01-16 16:23:38 +00003567
danielk1977bea2a942009-01-20 17:06:27 +00003568 *pPgnoNext = next;
3569 if( ppPage ){
3570 *ppPage = pPage;
3571 }else{
3572 releasePage(pPage);
3573 }
3574 return (rc==SQLITE_DONE ? SQLITE_OK : rc);
danielk1977d04417962007-05-02 13:16:30 +00003575}
3576
danielk1977da107192007-05-04 08:32:13 +00003577/*
3578** Copy data from a buffer to a page, or from a page to a buffer.
3579**
3580** pPayload is a pointer to data stored on database page pDbPage.
3581** If argument eOp is false, then nByte bytes of data are copied
3582** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3583** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3584** of data are copied from the buffer pBuf to pPayload.
3585**
3586** SQLITE_OK is returned on success, otherwise an error code.
3587*/
3588static int copyPayload(
3589 void *pPayload, /* Pointer to page data */
3590 void *pBuf, /* Pointer to buffer */
3591 int nByte, /* Number of bytes to copy */
3592 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3593 DbPage *pDbPage /* Page containing pPayload */
3594){
3595 if( eOp ){
3596 /* Copy data from buffer to page (a write operation) */
3597 int rc = sqlite3PagerWrite(pDbPage);
3598 if( rc!=SQLITE_OK ){
3599 return rc;
3600 }
3601 memcpy(pPayload, pBuf, nByte);
3602 }else{
3603 /* Copy data from page to buffer (a read operation) */
3604 memcpy(pBuf, pPayload, nByte);
3605 }
3606 return SQLITE_OK;
3607}
danielk1977d04417962007-05-02 13:16:30 +00003608
3609/*
danielk19779f8d6402007-05-02 17:48:45 +00003610** This function is used to read or overwrite payload information
3611** for the entry that the pCur cursor is pointing to. If the eOp
3612** parameter is 0, this is a read operation (data copied into
3613** buffer pBuf). If it is non-zero, a write (data copied from
3614** buffer pBuf).
3615**
3616** A total of "amt" bytes are read or written beginning at "offset".
3617** Data is read to or from the buffer pBuf.
drh72f82862001-05-24 21:06:34 +00003618**
drh3bcdfd22009-07-12 02:32:21 +00003619** The content being read or written might appear on the main page
3620** or be scattered out on multiple overflow pages.
danielk1977da107192007-05-04 08:32:13 +00003621**
danielk1977dcbb5d32007-05-04 18:36:44 +00003622** If the BtCursor.isIncrblobHandle flag is set, and the current
danielk1977da107192007-05-04 08:32:13 +00003623** cursor entry uses one or more overflow pages, this function
3624** allocates space for and lazily popluates the overflow page-list
3625** cache array (BtCursor.aOverflow). Subsequent calls use this
3626** cache to make seeking to the supplied offset more efficient.
3627**
3628** Once an overflow page-list cache has been allocated, it may be
3629** invalidated if some other cursor writes to the same table, or if
3630** the cursor is moved to a different row. Additionally, in auto-vacuum
3631** mode, the following events may invalidate an overflow page-list cache.
3632**
3633** * An incremental vacuum,
3634** * A commit in auto_vacuum="full" mode,
3635** * Creating a table (may require moving an overflow page).
drh72f82862001-05-24 21:06:34 +00003636*/
danielk19779f8d6402007-05-02 17:48:45 +00003637static int accessPayload(
drh3aac2dd2004-04-26 14:10:20 +00003638 BtCursor *pCur, /* Cursor pointing to entry to read from */
danielk197789d40042008-11-17 14:20:56 +00003639 u32 offset, /* Begin reading this far into payload */
3640 u32 amt, /* Read this many bytes */
drh3aac2dd2004-04-26 14:10:20 +00003641 unsigned char *pBuf, /* Write the bytes into this buffer */
danielk19779f8d6402007-05-02 17:48:45 +00003642 int eOp /* zero to read. non-zero to write. */
drh3aac2dd2004-04-26 14:10:20 +00003643){
3644 unsigned char *aPayload;
danielk1977da107192007-05-04 08:32:13 +00003645 int rc = SQLITE_OK;
drhfa1a98a2004-05-14 19:08:17 +00003646 u32 nKey;
danielk19772dec9702007-05-02 16:48:37 +00003647 int iIdx = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003648 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
danielk19770d065412008-11-12 18:21:36 +00003649 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
drh3aac2dd2004-04-26 14:10:20 +00003650
danielk1977da107192007-05-04 08:32:13 +00003651 assert( pPage );
danielk1977da184232006-01-05 11:34:32 +00003652 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003653 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drh1fee73e2007-08-29 04:00:57 +00003654 assert( cursorHoldsMutex(pCur) );
danielk1977da107192007-05-04 08:32:13 +00003655
drh86057612007-06-26 01:04:48 +00003656 getCellInfo(pCur);
drh366fda62006-01-13 02:35:09 +00003657 aPayload = pCur->info.pCell + pCur->info.nHeader;
drhf49661a2008-12-10 16:45:50 +00003658 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
danielk1977da107192007-05-04 08:32:13 +00003659
drh3bcdfd22009-07-12 02:32:21 +00003660 if( NEVER(offset+amt > nKey+pCur->info.nData)
danielk19770d065412008-11-12 18:21:36 +00003661 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3662 ){
danielk1977da107192007-05-04 08:32:13 +00003663 /* Trying to read or write past the end of the data is an error */
danielk197767fd7a92008-09-10 17:53:35 +00003664 return SQLITE_CORRUPT_BKPT;
drh3aac2dd2004-04-26 14:10:20 +00003665 }
danielk1977da107192007-05-04 08:32:13 +00003666
3667 /* Check if data must be read/written to/from the btree page itself. */
drhfa1a98a2004-05-14 19:08:17 +00003668 if( offset<pCur->info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00003669 int a = amt;
drhfa1a98a2004-05-14 19:08:17 +00003670 if( a+offset>pCur->info.nLocal ){
3671 a = pCur->info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00003672 }
danielk1977da107192007-05-04 08:32:13 +00003673 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
drh2aa679f2001-06-25 02:11:07 +00003674 offset = 0;
drha34b6762004-05-07 13:30:42 +00003675 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00003676 amt -= a;
drhdd793422001-06-28 01:54:48 +00003677 }else{
drhfa1a98a2004-05-14 19:08:17 +00003678 offset -= pCur->info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00003679 }
danielk1977da107192007-05-04 08:32:13 +00003680
3681 if( rc==SQLITE_OK && amt>0 ){
danielk197789d40042008-11-17 14:20:56 +00003682 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
danielk1977da107192007-05-04 08:32:13 +00003683 Pgno nextPage;
3684
drhfa1a98a2004-05-14 19:08:17 +00003685 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
danielk1977da107192007-05-04 08:32:13 +00003686
danielk19772dec9702007-05-02 16:48:37 +00003687#ifndef SQLITE_OMIT_INCRBLOB
danielk1977dcbb5d32007-05-04 18:36:44 +00003688 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
danielk1977da107192007-05-04 08:32:13 +00003689 ** has not been allocated, allocate it now. The array is sized at
3690 ** one entry for each overflow page in the overflow chain. The
3691 ** page number of the first overflow page is stored in aOverflow[0],
3692 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3693 ** (the cache is lazily populated).
3694 */
danielk1977dcbb5d32007-05-04 18:36:44 +00003695 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
danielk19772dec9702007-05-02 16:48:37 +00003696 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
drh17435752007-08-16 04:30:38 +00003697 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
drh3bcdfd22009-07-12 02:32:21 +00003698 /* nOvfl is always positive. If it were zero, fetchPayload would have
3699 ** been used instead of this routine. */
3700 if( ALWAYS(nOvfl) && !pCur->aOverflow ){
danielk1977da107192007-05-04 08:32:13 +00003701 rc = SQLITE_NOMEM;
danielk19772dec9702007-05-02 16:48:37 +00003702 }
3703 }
danielk1977da107192007-05-04 08:32:13 +00003704
3705 /* If the overflow page-list cache has been allocated and the
3706 ** entry for the first required overflow page is valid, skip
3707 ** directly to it.
3708 */
danielk19772dec9702007-05-02 16:48:37 +00003709 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3710 iIdx = (offset/ovflSize);
3711 nextPage = pCur->aOverflow[iIdx];
3712 offset = (offset%ovflSize);
3713 }
3714#endif
danielk1977da107192007-05-04 08:32:13 +00003715
3716 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3717
3718#ifndef SQLITE_OMIT_INCRBLOB
3719 /* If required, populate the overflow page-list cache. */
3720 if( pCur->aOverflow ){
3721 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3722 pCur->aOverflow[iIdx] = nextPage;
3723 }
3724#endif
3725
danielk1977d04417962007-05-02 13:16:30 +00003726 if( offset>=ovflSize ){
3727 /* The only reason to read this page is to obtain the page
danielk1977da107192007-05-04 08:32:13 +00003728 ** number for the next page in the overflow chain. The page
drhfd131da2007-08-07 17:13:03 +00003729 ** data is not required. So first try to lookup the overflow
3730 ** page-list cache, if any, then fall back to the getOverflowPage()
danielk1977da107192007-05-04 08:32:13 +00003731 ** function.
danielk1977d04417962007-05-02 13:16:30 +00003732 */
danielk19772dec9702007-05-02 16:48:37 +00003733#ifndef SQLITE_OMIT_INCRBLOB
danielk1977da107192007-05-04 08:32:13 +00003734 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3735 nextPage = pCur->aOverflow[iIdx+1];
3736 } else
danielk19772dec9702007-05-02 16:48:37 +00003737#endif
danielk1977da107192007-05-04 08:32:13 +00003738 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
danielk1977da107192007-05-04 08:32:13 +00003739 offset -= ovflSize;
danielk1977d04417962007-05-02 13:16:30 +00003740 }else{
danielk19779f8d6402007-05-02 17:48:45 +00003741 /* Need to read this page properly. It contains some of the
3742 ** range of data that is being read (eOp==0) or written (eOp!=0).
danielk1977d04417962007-05-02 13:16:30 +00003743 */
3744 DbPage *pDbPage;
danielk1977cfe9a692004-06-16 12:00:29 +00003745 int a = amt;
danielk1977d04417962007-05-02 13:16:30 +00003746 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
danielk1977da107192007-05-04 08:32:13 +00003747 if( rc==SQLITE_OK ){
3748 aPayload = sqlite3PagerGetData(pDbPage);
3749 nextPage = get4byte(aPayload);
3750 if( a + offset > ovflSize ){
3751 a = ovflSize - offset;
danielk19779f8d6402007-05-02 17:48:45 +00003752 }
danielk1977da107192007-05-04 08:32:13 +00003753 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3754 sqlite3PagerUnref(pDbPage);
3755 offset = 0;
3756 amt -= a;
3757 pBuf += a;
danielk19779f8d6402007-05-02 17:48:45 +00003758 }
danielk1977cfe9a692004-06-16 12:00:29 +00003759 }
drh2af926b2001-05-15 00:39:25 +00003760 }
drh2af926b2001-05-15 00:39:25 +00003761 }
danielk1977cfe9a692004-06-16 12:00:29 +00003762
danielk1977da107192007-05-04 08:32:13 +00003763 if( rc==SQLITE_OK && amt>0 ){
drh49285702005-09-17 15:20:26 +00003764 return SQLITE_CORRUPT_BKPT;
drha7fcb052001-12-14 15:09:55 +00003765 }
danielk1977da107192007-05-04 08:32:13 +00003766 return rc;
drh2af926b2001-05-15 00:39:25 +00003767}
3768
drh72f82862001-05-24 21:06:34 +00003769/*
drh3aac2dd2004-04-26 14:10:20 +00003770** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003771** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003772** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00003773**
drh5d1a8722009-07-22 18:07:40 +00003774** The caller must ensure that pCur is pointing to a valid row
3775** in the table.
3776**
drh3aac2dd2004-04-26 14:10:20 +00003777** Return SQLITE_OK on success or an error code if anything goes
3778** wrong. An error is returned if "offset+amt" is larger than
3779** the available payload.
drh72f82862001-05-24 21:06:34 +00003780*/
drha34b6762004-05-07 13:30:42 +00003781int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drh1fee73e2007-08-29 04:00:57 +00003782 assert( cursorHoldsMutex(pCur) );
drh5d1a8722009-07-22 18:07:40 +00003783 assert( pCur->eState==CURSOR_VALID );
3784 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3785 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3786 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
drh3aac2dd2004-04-26 14:10:20 +00003787}
3788
3789/*
drh3aac2dd2004-04-26 14:10:20 +00003790** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003791** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003792** begins at "offset".
3793**
3794** Return SQLITE_OK on success or an error code if anything goes
3795** wrong. An error is returned if "offset+amt" is larger than
3796** the available payload.
drh72f82862001-05-24 21:06:34 +00003797*/
drh3aac2dd2004-04-26 14:10:20 +00003798int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003799 int rc;
3800
danielk19773588ceb2008-06-10 17:30:26 +00003801#ifndef SQLITE_OMIT_INCRBLOB
3802 if ( pCur->eState==CURSOR_INVALID ){
3803 return SQLITE_ABORT;
3804 }
3805#endif
3806
drh1fee73e2007-08-29 04:00:57 +00003807 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003808 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003809 if( rc==SQLITE_OK ){
3810 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003811 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3812 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drhfb192682009-07-11 18:26:28 +00003813 rc = accessPayload(pCur, offset, amt, pBuf, 0);
danielk1977da184232006-01-05 11:34:32 +00003814 }
3815 return rc;
drh2af926b2001-05-15 00:39:25 +00003816}
3817
drh72f82862001-05-24 21:06:34 +00003818/*
drh0e1c19e2004-05-11 00:58:56 +00003819** Return a pointer to payload information from the entry that the
3820** pCur cursor is pointing to. The pointer is to the beginning of
3821** the key if skipKey==0 and it points to the beginning of data if
drhe51c44f2004-05-30 20:46:09 +00003822** skipKey==1. The number of bytes of available key/data is written
3823** into *pAmt. If *pAmt==0, then the value returned will not be
3824** a valid pointer.
drh0e1c19e2004-05-11 00:58:56 +00003825**
3826** This routine is an optimization. It is common for the entire key
3827** and data to fit on the local page and for there to be no overflow
3828** pages. When that is so, this routine can be used to access the
3829** key and data without making a copy. If the key and/or data spills
drh7f751222009-03-17 22:33:00 +00003830** onto overflow pages, then accessPayload() must be used to reassemble
drh0e1c19e2004-05-11 00:58:56 +00003831** the key/data and copy it into a preallocated buffer.
3832**
3833** The pointer returned by this routine looks directly into the cached
3834** page of the database. The data might change or move the next time
3835** any btree routine is called.
3836*/
3837static const unsigned char *fetchPayload(
3838 BtCursor *pCur, /* Cursor pointing to entry to read from */
drhe51c44f2004-05-30 20:46:09 +00003839 int *pAmt, /* Write the number of available bytes here */
drh0e1c19e2004-05-11 00:58:56 +00003840 int skipKey /* read beginning at data if this is true */
3841){
3842 unsigned char *aPayload;
3843 MemPage *pPage;
drhfa1a98a2004-05-14 19:08:17 +00003844 u32 nKey;
danielk197789d40042008-11-17 14:20:56 +00003845 u32 nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003846
danielk197771d5d2c2008-09-29 11:49:47 +00003847 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
danielk1977da184232006-01-05 11:34:32 +00003848 assert( pCur->eState==CURSOR_VALID );
drh1fee73e2007-08-29 04:00:57 +00003849 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00003850 pPage = pCur->apPage[pCur->iPage];
3851 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drhfe3313f2009-07-21 19:02:20 +00003852 if( NEVER(pCur->info.nSize==0) ){
3853 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
3854 &pCur->info);
3855 }
drh43605152004-05-29 21:46:49 +00003856 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00003857 aPayload += pCur->info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00003858 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00003859 nKey = 0;
3860 }else{
drhf49661a2008-12-10 16:45:50 +00003861 nKey = (int)pCur->info.nKey;
drh0e1c19e2004-05-11 00:58:56 +00003862 }
drh0e1c19e2004-05-11 00:58:56 +00003863 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003864 aPayload += nKey;
3865 nLocal = pCur->info.nLocal - nKey;
drh0e1c19e2004-05-11 00:58:56 +00003866 }else{
drhfa1a98a2004-05-14 19:08:17 +00003867 nLocal = pCur->info.nLocal;
drhfe3313f2009-07-21 19:02:20 +00003868 assert( nLocal<=nKey );
drh0e1c19e2004-05-11 00:58:56 +00003869 }
drhe51c44f2004-05-30 20:46:09 +00003870 *pAmt = nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003871 return aPayload;
3872}
3873
3874
3875/*
drhe51c44f2004-05-30 20:46:09 +00003876** For the entry that cursor pCur is point to, return as
3877** many bytes of the key or data as are available on the local
3878** b-tree page. Write the number of available bytes into *pAmt.
drh0e1c19e2004-05-11 00:58:56 +00003879**
3880** The pointer returned is ephemeral. The key/data may move
drhd677b3d2007-08-20 22:48:41 +00003881** or be destroyed on the next call to any Btree routine,
3882** including calls from other threads against the same cache.
3883** Hence, a mutex on the BtShared should be held prior to calling
3884** this routine.
drh0e1c19e2004-05-11 00:58:56 +00003885**
3886** These routines is used to get quick access to key and data
3887** in the common case where no overflow pages are used.
drh0e1c19e2004-05-11 00:58:56 +00003888*/
drhe51c44f2004-05-30 20:46:09 +00003889const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
drhfe3313f2009-07-21 19:02:20 +00003890 const void *p = 0;
danielk19774b0aa4c2009-05-28 11:05:57 +00003891 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh1fee73e2007-08-29 04:00:57 +00003892 assert( cursorHoldsMutex(pCur) );
drhfe3313f2009-07-21 19:02:20 +00003893 if( ALWAYS(pCur->eState==CURSOR_VALID) ){
3894 p = (const void*)fetchPayload(pCur, pAmt, 0);
danielk1977da184232006-01-05 11:34:32 +00003895 }
drhfe3313f2009-07-21 19:02:20 +00003896 return p;
drh0e1c19e2004-05-11 00:58:56 +00003897}
drhe51c44f2004-05-30 20:46:09 +00003898const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
drhfe3313f2009-07-21 19:02:20 +00003899 const void *p = 0;
danielk19774b0aa4c2009-05-28 11:05:57 +00003900 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh1fee73e2007-08-29 04:00:57 +00003901 assert( cursorHoldsMutex(pCur) );
drhfe3313f2009-07-21 19:02:20 +00003902 if( ALWAYS(pCur->eState==CURSOR_VALID) ){
3903 p = (const void*)fetchPayload(pCur, pAmt, 1);
danielk1977da184232006-01-05 11:34:32 +00003904 }
drhfe3313f2009-07-21 19:02:20 +00003905 return p;
drh0e1c19e2004-05-11 00:58:56 +00003906}
3907
3908
3909/*
drh8178a752003-01-05 21:41:40 +00003910** Move the cursor down to a new child page. The newPgno argument is the
drhab01f612004-05-22 02:55:23 +00003911** page number of the child page to move to.
danielk1977a299d612009-07-13 11:22:10 +00003912**
3913** This function returns SQLITE_CORRUPT if the page-header flags field of
3914** the new child page does not match the flags field of the parent (i.e.
3915** if an intkey page appears to be the parent of a non-intkey page, or
3916** vice-versa).
drh72f82862001-05-24 21:06:34 +00003917*/
drh3aac2dd2004-04-26 14:10:20 +00003918static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00003919 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003920 int i = pCur->iPage;
drh72f82862001-05-24 21:06:34 +00003921 MemPage *pNewPage;
drhd0679ed2007-08-28 22:24:34 +00003922 BtShared *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00003923
drh1fee73e2007-08-29 04:00:57 +00003924 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003925 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003926 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3927 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3928 return SQLITE_CORRUPT_BKPT;
3929 }
3930 rc = getAndInitPage(pBt, newPgno, &pNewPage);
drh6019e162001-07-02 17:51:45 +00003931 if( rc ) return rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003932 pCur->apPage[i+1] = pNewPage;
3933 pCur->aiIdx[i+1] = 0;
3934 pCur->iPage++;
3935
drh271efa52004-05-30 19:19:05 +00003936 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003937 pCur->validNKey = 0;
danielk1977bd5969a2009-07-11 17:39:42 +00003938 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
drh49285702005-09-17 15:20:26 +00003939 return SQLITE_CORRUPT_BKPT;
drh4be295b2003-12-16 03:44:47 +00003940 }
drh72f82862001-05-24 21:06:34 +00003941 return SQLITE_OK;
3942}
3943
danielk1977bf93c562008-09-29 15:53:25 +00003944#ifndef NDEBUG
3945/*
3946** Page pParent is an internal (non-leaf) tree page. This function
3947** asserts that page number iChild is the left-child if the iIdx'th
3948** cell in page pParent. Or, if iIdx is equal to the total number of
3949** cells in pParent, that page number iChild is the right-child of
3950** the page.
3951*/
3952static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3953 assert( iIdx<=pParent->nCell );
3954 if( iIdx==pParent->nCell ){
3955 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3956 }else{
3957 assert( get4byte(findCell(pParent, iIdx))==iChild );
3958 }
3959}
3960#else
3961# define assertParentIndex(x,y,z)
3962#endif
3963
drh72f82862001-05-24 21:06:34 +00003964/*
drh5e2f8b92001-05-28 00:41:15 +00003965** Move the cursor up to the parent page.
3966**
3967** pCur->idx is set to the cell index that contains the pointer
3968** to the page we are coming from. If we are coming from the
3969** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00003970** the largest cell index.
drh72f82862001-05-24 21:06:34 +00003971*/
danielk197730548662009-07-09 05:07:37 +00003972static void moveToParent(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00003973 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003974 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003975 assert( pCur->iPage>0 );
3976 assert( pCur->apPage[pCur->iPage] );
danielk1977bf93c562008-09-29 15:53:25 +00003977 assertParentIndex(
3978 pCur->apPage[pCur->iPage-1],
3979 pCur->aiIdx[pCur->iPage-1],
3980 pCur->apPage[pCur->iPage]->pgno
3981 );
danielk197771d5d2c2008-09-29 11:49:47 +00003982 releasePage(pCur->apPage[pCur->iPage]);
3983 pCur->iPage--;
drh271efa52004-05-30 19:19:05 +00003984 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003985 pCur->validNKey = 0;
drh72f82862001-05-24 21:06:34 +00003986}
3987
3988/*
danielk19778f880a82009-07-13 09:41:45 +00003989** Move the cursor to point to the root page of its b-tree structure.
3990**
3991** If the table has a virtual root page, then the cursor is moved to point
3992** to the virtual root page instead of the actual root page. A table has a
3993** virtual root page when the actual root page contains no cells and a
3994** single child page. This can only happen with the table rooted at page 1.
3995**
3996** If the b-tree structure is empty, the cursor state is set to
3997** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
3998** cell located on the root (or virtual root) page and the cursor state
3999** is set to CURSOR_VALID.
4000**
4001** If this function returns successfully, it may be assumed that the
4002** page-header flags indicate that the [virtual] root-page is the expected
4003** kind of b-tree page (i.e. if when opening the cursor the caller did not
4004** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4005** indicating a table b-tree, or if the caller did specify a KeyInfo
4006** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4007** b-tree).
drh72f82862001-05-24 21:06:34 +00004008*/
drh5e2f8b92001-05-28 00:41:15 +00004009static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00004010 MemPage *pRoot;
drh777e4c42006-01-13 04:31:58 +00004011 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00004012 Btree *p = pCur->pBtree;
4013 BtShared *pBt = p->pBt;
drhbd03cae2001-06-02 02:40:57 +00004014
drh1fee73e2007-08-29 04:00:57 +00004015 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +00004016 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4017 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
4018 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
4019 if( pCur->eState>=CURSOR_REQUIRESEEK ){
4020 if( pCur->eState==CURSOR_FAULT ){
drh4c301aa2009-07-15 17:25:45 +00004021 assert( pCur->skipNext!=SQLITE_OK );
4022 return pCur->skipNext;
drhfb982642007-08-30 01:19:59 +00004023 }
danielk1977be51a652008-10-08 17:58:48 +00004024 sqlite3BtreeClearCursor(pCur);
drhbf700f32007-03-31 02:36:44 +00004025 }
danielk197771d5d2c2008-09-29 11:49:47 +00004026
4027 if( pCur->iPage>=0 ){
4028 int i;
4029 for(i=1; i<=pCur->iPage; i++){
4030 releasePage(pCur->apPage[i]);
danielk1977d9f6c532008-09-19 16:39:38 +00004031 }
danielk1977172114a2009-07-07 15:47:12 +00004032 pCur->iPage = 0;
drh777e4c42006-01-13 04:31:58 +00004033 }else{
drh4c301aa2009-07-15 17:25:45 +00004034 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4035 if( rc!=SQLITE_OK ){
drh777e4c42006-01-13 04:31:58 +00004036 pCur->eState = CURSOR_INVALID;
4037 return rc;
4038 }
danielk1977172114a2009-07-07 15:47:12 +00004039 pCur->iPage = 0;
4040
4041 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4042 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4043 ** NULL, the caller expects a table b-tree. If this is not the case,
4044 ** return an SQLITE_CORRUPT error. */
4045 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4046 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4047 return SQLITE_CORRUPT_BKPT;
4048 }
drhc39e0002004-05-07 23:50:57 +00004049 }
danielk197771d5d2c2008-09-29 11:49:47 +00004050
danielk19778f880a82009-07-13 09:41:45 +00004051 /* Assert that the root page is of the correct type. This must be the
4052 ** case as the call to this function that loaded the root-page (either
4053 ** this call or a previous invocation) would have detected corruption
4054 ** if the assumption were not true, and it is not possible for the flags
4055 ** byte to have been modified while this cursor is holding a reference
4056 ** to the page. */
danielk197771d5d2c2008-09-29 11:49:47 +00004057 pRoot = pCur->apPage[0];
4058 assert( pRoot->pgno==pCur->pgnoRoot );
danielk19778f880a82009-07-13 09:41:45 +00004059 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4060
danielk197771d5d2c2008-09-29 11:49:47 +00004061 pCur->aiIdx[0] = 0;
drh271efa52004-05-30 19:19:05 +00004062 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004063 pCur->atLast = 0;
4064 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004065
drh8856d6a2004-04-29 14:42:46 +00004066 if( pRoot->nCell==0 && !pRoot->leaf ){
4067 Pgno subpage;
drhc85240d2009-06-04 16:14:33 +00004068 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
drh43605152004-05-29 21:46:49 +00004069 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
danielk1977da184232006-01-05 11:34:32 +00004070 pCur->eState = CURSOR_VALID;
drh4b70f112004-05-02 21:12:19 +00004071 rc = moveToChild(pCur, subpage);
danielk197771d5d2c2008-09-29 11:49:47 +00004072 }else{
4073 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
drh8856d6a2004-04-29 14:42:46 +00004074 }
4075 return rc;
drh72f82862001-05-24 21:06:34 +00004076}
drh2af926b2001-05-15 00:39:25 +00004077
drh5e2f8b92001-05-28 00:41:15 +00004078/*
4079** Move the cursor down to the left-most leaf entry beneath the
4080** entry to which it is currently pointing.
drh777e4c42006-01-13 04:31:58 +00004081**
4082** The left-most leaf is the one with the smallest key - the first
4083** in ascending order.
drh5e2f8b92001-05-28 00:41:15 +00004084*/
4085static int moveToLeftmost(BtCursor *pCur){
4086 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00004087 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00004088 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00004089
drh1fee73e2007-08-29 04:00:57 +00004090 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00004091 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00004092 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4093 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4094 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
drh8178a752003-01-05 21:41:40 +00004095 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00004096 }
drhd677b3d2007-08-20 22:48:41 +00004097 return rc;
drh5e2f8b92001-05-28 00:41:15 +00004098}
4099
drh2dcc9aa2002-12-04 13:40:25 +00004100/*
4101** Move the cursor down to the right-most leaf entry beneath the
4102** page to which it is currently pointing. Notice the difference
4103** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
4104** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4105** finds the right-most entry beneath the *page*.
drh777e4c42006-01-13 04:31:58 +00004106**
4107** The right-most entry is the one with the largest key - the last
4108** key in ascending order.
drh2dcc9aa2002-12-04 13:40:25 +00004109*/
4110static int moveToRightmost(BtCursor *pCur){
4111 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00004112 int rc = SQLITE_OK;
drh1bd10f82008-12-10 21:19:56 +00004113 MemPage *pPage = 0;
drh2dcc9aa2002-12-04 13:40:25 +00004114
drh1fee73e2007-08-29 04:00:57 +00004115 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00004116 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00004117 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
drh43605152004-05-29 21:46:49 +00004118 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
danielk197771d5d2c2008-09-29 11:49:47 +00004119 pCur->aiIdx[pCur->iPage] = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00004120 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00004121 }
drhd677b3d2007-08-20 22:48:41 +00004122 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00004123 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
drhd677b3d2007-08-20 22:48:41 +00004124 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004125 pCur->validNKey = 0;
drhd677b3d2007-08-20 22:48:41 +00004126 }
danielk1977518002e2008-09-05 05:02:46 +00004127 return rc;
drh2dcc9aa2002-12-04 13:40:25 +00004128}
4129
drh5e00f6c2001-09-13 13:46:56 +00004130/* Move the cursor to the first entry in the table. Return SQLITE_OK
4131** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00004132** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00004133*/
drh3aac2dd2004-04-26 14:10:20 +00004134int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00004135 int rc;
drhd677b3d2007-08-20 22:48:41 +00004136
drh1fee73e2007-08-29 04:00:57 +00004137 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00004138 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh5e00f6c2001-09-13 13:46:56 +00004139 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00004140 if( rc==SQLITE_OK ){
4141 if( pCur->eState==CURSOR_INVALID ){
danielk197771d5d2c2008-09-29 11:49:47 +00004142 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00004143 *pRes = 1;
4144 rc = SQLITE_OK;
4145 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00004146 assert( pCur->apPage[pCur->iPage]->nCell>0 );
drhd677b3d2007-08-20 22:48:41 +00004147 *pRes = 0;
4148 rc = moveToLeftmost(pCur);
4149 }
drh5e00f6c2001-09-13 13:46:56 +00004150 }
drh5e00f6c2001-09-13 13:46:56 +00004151 return rc;
4152}
drh5e2f8b92001-05-28 00:41:15 +00004153
drh9562b552002-02-19 15:00:07 +00004154/* Move the cursor to the last entry in the table. Return SQLITE_OK
4155** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00004156** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00004157*/
drh3aac2dd2004-04-26 14:10:20 +00004158int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00004159 int rc;
drhd677b3d2007-08-20 22:48:41 +00004160
drh1fee73e2007-08-29 04:00:57 +00004161 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00004162 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk19773f632d52009-05-02 10:03:09 +00004163
4164 /* If the cursor already points to the last entry, this is a no-op. */
4165 if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4166#ifdef SQLITE_DEBUG
4167 /* This block serves to assert() that the cursor really does point
4168 ** to the last entry in the b-tree. */
4169 int ii;
4170 for(ii=0; ii<pCur->iPage; ii++){
4171 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4172 }
4173 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4174 assert( pCur->apPage[pCur->iPage]->leaf );
4175#endif
4176 return SQLITE_OK;
4177 }
4178
drh9562b552002-02-19 15:00:07 +00004179 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00004180 if( rc==SQLITE_OK ){
4181 if( CURSOR_INVALID==pCur->eState ){
danielk197771d5d2c2008-09-29 11:49:47 +00004182 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00004183 *pRes = 1;
4184 }else{
4185 assert( pCur->eState==CURSOR_VALID );
4186 *pRes = 0;
4187 rc = moveToRightmost(pCur);
drhf49661a2008-12-10 16:45:50 +00004188 pCur->atLast = rc==SQLITE_OK ?1:0;
drhd677b3d2007-08-20 22:48:41 +00004189 }
drh9562b552002-02-19 15:00:07 +00004190 }
drh9562b552002-02-19 15:00:07 +00004191 return rc;
4192}
4193
drhe14006d2008-03-25 17:23:32 +00004194/* Move the cursor so that it points to an entry near the key
drhe63d9992008-08-13 19:11:48 +00004195** specified by pIdxKey or intKey. Return a success code.
drh72f82862001-05-24 21:06:34 +00004196**
drhe63d9992008-08-13 19:11:48 +00004197** For INTKEY tables, the intKey parameter is used. pIdxKey
4198** must be NULL. For index tables, pIdxKey is used and intKey
4199** is ignored.
drh3aac2dd2004-04-26 14:10:20 +00004200**
drh5e2f8b92001-05-28 00:41:15 +00004201** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00004202** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00004203** were present. The cursor might point to an entry that comes
4204** before or after the key.
4205**
drh64022502009-01-09 14:11:04 +00004206** An integer is written into *pRes which is the result of
4207** comparing the key with the entry to which the cursor is
4208** pointing. The meaning of the integer written into
4209** *pRes is as follows:
drhbd03cae2001-06-02 02:40:57 +00004210**
4211** *pRes<0 The cursor is left pointing at an entry that
drh64022502009-01-09 14:11:04 +00004212** is smaller than intKey/pIdxKey or if the table is empty
drh1a844c32002-12-04 22:29:28 +00004213** and the cursor is therefore left pointing to nothing.
drhbd03cae2001-06-02 02:40:57 +00004214**
4215** *pRes==0 The cursor is left pointing at an entry that
drh64022502009-01-09 14:11:04 +00004216** exactly matches intKey/pIdxKey.
drhbd03cae2001-06-02 02:40:57 +00004217**
4218** *pRes>0 The cursor is left pointing at an entry that
drh64022502009-01-09 14:11:04 +00004219** is larger than intKey/pIdxKey.
drhd677b3d2007-08-20 22:48:41 +00004220**
drha059ad02001-04-17 20:09:11 +00004221*/
drhe63d9992008-08-13 19:11:48 +00004222int sqlite3BtreeMovetoUnpacked(
4223 BtCursor *pCur, /* The cursor to be moved */
4224 UnpackedRecord *pIdxKey, /* Unpacked index key */
4225 i64 intKey, /* The table key */
4226 int biasRight, /* If true, bias the search to the high end */
4227 int *pRes /* Write search results here */
drhe4d90812007-03-29 05:51:49 +00004228){
drh72f82862001-05-24 21:06:34 +00004229 int rc;
drhd677b3d2007-08-20 22:48:41 +00004230
drh1fee73e2007-08-29 04:00:57 +00004231 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00004232 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk19775cb09632009-07-09 11:36:01 +00004233 assert( pRes );
danielk19773fd7cf52009-07-13 07:30:52 +00004234 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
drha2c20e42008-03-29 16:01:04 +00004235
4236 /* If the cursor is already positioned at the point we are trying
4237 ** to move to, then just return without doing any work */
danielk197771d5d2c2008-09-29 11:49:47 +00004238 if( pCur->eState==CURSOR_VALID && pCur->validNKey
4239 && pCur->apPage[0]->intKey
4240 ){
drhe63d9992008-08-13 19:11:48 +00004241 if( pCur->info.nKey==intKey ){
drha2c20e42008-03-29 16:01:04 +00004242 *pRes = 0;
4243 return SQLITE_OK;
4244 }
drhe63d9992008-08-13 19:11:48 +00004245 if( pCur->atLast && pCur->info.nKey<intKey ){
drha2c20e42008-03-29 16:01:04 +00004246 *pRes = -1;
4247 return SQLITE_OK;
4248 }
4249 }
4250
drh5e2f8b92001-05-28 00:41:15 +00004251 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00004252 if( rc ){
4253 return rc;
4254 }
danielk197771d5d2c2008-09-29 11:49:47 +00004255 assert( pCur->apPage[pCur->iPage] );
4256 assert( pCur->apPage[pCur->iPage]->isInit );
danielk1977171fff32009-07-11 05:06:51 +00004257 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID );
danielk1977da184232006-01-05 11:34:32 +00004258 if( pCur->eState==CURSOR_INVALID ){
drhf328bc82004-05-10 23:29:49 +00004259 *pRes = -1;
danielk197771d5d2c2008-09-29 11:49:47 +00004260 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhc39e0002004-05-07 23:50:57 +00004261 return SQLITE_OK;
4262 }
danielk197771d5d2c2008-09-29 11:49:47 +00004263 assert( pCur->apPage[0]->intKey || pIdxKey );
drh14684382006-11-30 13:05:29 +00004264 for(;;){
drh72f82862001-05-24 21:06:34 +00004265 int lwr, upr;
4266 Pgno chldPg;
danielk197771d5d2c2008-09-29 11:49:47 +00004267 MemPage *pPage = pCur->apPage[pCur->iPage];
danielk1977171fff32009-07-11 05:06:51 +00004268 int c;
4269
4270 /* pPage->nCell must be greater than zero. If this is the root-page
4271 ** the cursor would have been INVALID above and this for(;;) loop
4272 ** not run. If this is not the root-page, then the moveToChild() routine
danielk19773fd7cf52009-07-13 07:30:52 +00004273 ** would have already detected db corruption. Similarly, pPage must
4274 ** be the right kind (index or table) of b-tree page. Otherwise
4275 ** a moveToChild() or moveToRoot() call would have detected corruption. */
danielk1977171fff32009-07-11 05:06:51 +00004276 assert( pPage->nCell>0 );
danielk19773fd7cf52009-07-13 07:30:52 +00004277 assert( pPage->intKey==(pIdxKey==0) );
drh72f82862001-05-24 21:06:34 +00004278 lwr = 0;
4279 upr = pPage->nCell-1;
drhe4d90812007-03-29 05:51:49 +00004280 if( biasRight ){
drhf49661a2008-12-10 16:45:50 +00004281 pCur->aiIdx[pCur->iPage] = (u16)upr;
drhe4d90812007-03-29 05:51:49 +00004282 }else{
drhf49661a2008-12-10 16:45:50 +00004283 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
drhe4d90812007-03-29 05:51:49 +00004284 }
drh64022502009-01-09 14:11:04 +00004285 for(;;){
danielk197711c327a2009-05-04 19:01:26 +00004286 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */
4287 u8 *pCell; /* Pointer to current cell in pPage */
4288
drh366fda62006-01-13 02:35:09 +00004289 pCur->info.nSize = 0;
danielk197711c327a2009-05-04 19:01:26 +00004290 pCell = findCell(pPage, idx) + pPage->childPtrSize;
drh3aac2dd2004-04-26 14:10:20 +00004291 if( pPage->intKey ){
danielk197711c327a2009-05-04 19:01:26 +00004292 i64 nCellKey;
drhd172f862006-01-12 15:01:15 +00004293 if( pPage->hasData ){
danielk1977bab45c62006-01-16 15:14:27 +00004294 u32 dummy;
shane3f8d5cf2008-04-24 19:15:09 +00004295 pCell += getVarint32(pCell, dummy);
drhd172f862006-01-12 15:01:15 +00004296 }
drha2c20e42008-03-29 16:01:04 +00004297 getVarint(pCell, (u64*)&nCellKey);
drhe63d9992008-08-13 19:11:48 +00004298 if( nCellKey==intKey ){
drh3aac2dd2004-04-26 14:10:20 +00004299 c = 0;
drhe63d9992008-08-13 19:11:48 +00004300 }else if( nCellKey<intKey ){
drh41eb9e92008-04-02 18:33:07 +00004301 c = -1;
4302 }else{
drhe63d9992008-08-13 19:11:48 +00004303 assert( nCellKey>intKey );
drh41eb9e92008-04-02 18:33:07 +00004304 c = +1;
drh3aac2dd2004-04-26 14:10:20 +00004305 }
danielk197711c327a2009-05-04 19:01:26 +00004306 pCur->validNKey = 1;
4307 pCur->info.nKey = nCellKey;
drh3aac2dd2004-04-26 14:10:20 +00004308 }else{
danielk197711c327a2009-05-04 19:01:26 +00004309 /* The maximum supported page-size is 32768 bytes. This means that
4310 ** the maximum number of record bytes stored on an index B-Tree
4311 ** page is at most 8198 bytes, which may be stored as a 2-byte
4312 ** varint. This information is used to attempt to avoid parsing
4313 ** the entire cell by checking for the cases where the record is
4314 ** stored entirely within the b-tree page by inspecting the first
4315 ** 2 bytes of the cell.
4316 */
4317 int nCell = pCell[0];
4318 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){
4319 /* This branch runs if the record-size field of the cell is a
4320 ** single byte varint and the record fits entirely on the main
4321 ** b-tree page. */
4322 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4323 }else if( !(pCell[1] & 0x80)
4324 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4325 ){
4326 /* The record-size field is a 2 byte varint and the record
4327 ** fits entirely on the main b-tree page. */
4328 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
drhe51c44f2004-05-30 20:46:09 +00004329 }else{
danielk197711c327a2009-05-04 19:01:26 +00004330 /* The record flows over onto one or more overflow pages. In
4331 ** this case the whole cell needs to be parsed, a buffer allocated
4332 ** and accessPayload() used to retrieve the record into the
4333 ** buffer before VdbeRecordCompare() can be called. */
4334 void *pCellKey;
4335 u8 * const pCellBody = pCell - pPage->childPtrSize;
danielk197730548662009-07-09 05:07:37 +00004336 btreeParseCellPtr(pPage, pCellBody, &pCur->info);
shane60a4b532009-05-06 18:57:09 +00004337 nCell = (int)pCur->info.nKey;
danielk197711c327a2009-05-04 19:01:26 +00004338 pCellKey = sqlite3Malloc( nCell );
danielk19776507ecb2008-03-25 09:56:44 +00004339 if( pCellKey==0 ){
4340 rc = SQLITE_NOMEM;
4341 goto moveto_finish;
4342 }
drhfb192682009-07-11 18:26:28 +00004343 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
drhec9b31f2009-08-25 13:53:49 +00004344 if( rc ){
4345 sqlite3_free(pCellKey);
4346 goto moveto_finish;
4347 }
danielk197711c327a2009-05-04 19:01:26 +00004348 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
drhfacf0302008-06-17 15:12:00 +00004349 sqlite3_free(pCellKey);
drhe51c44f2004-05-30 20:46:09 +00004350 }
drh3aac2dd2004-04-26 14:10:20 +00004351 }
drh72f82862001-05-24 21:06:34 +00004352 if( c==0 ){
drh44845222008-07-17 18:39:57 +00004353 if( pPage->intKey && !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00004354 lwr = idx;
drhfc70e6f2004-05-12 21:11:27 +00004355 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00004356 break;
4357 }else{
drh64022502009-01-09 14:11:04 +00004358 *pRes = 0;
drh1e968a02008-03-25 00:22:21 +00004359 rc = SQLITE_OK;
4360 goto moveto_finish;
drh8b18dd42004-05-12 19:18:15 +00004361 }
drh72f82862001-05-24 21:06:34 +00004362 }
4363 if( c<0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00004364 lwr = idx+1;
drh72f82862001-05-24 21:06:34 +00004365 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00004366 upr = idx-1;
drh72f82862001-05-24 21:06:34 +00004367 }
drhf1d68b32007-03-29 04:43:26 +00004368 if( lwr>upr ){
4369 break;
4370 }
drhf49661a2008-12-10 16:45:50 +00004371 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
drh72f82862001-05-24 21:06:34 +00004372 }
4373 assert( lwr==upr+1 );
danielk197771d5d2c2008-09-29 11:49:47 +00004374 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00004375 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00004376 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00004377 }else if( lwr>=pPage->nCell ){
drh43605152004-05-29 21:46:49 +00004378 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh72f82862001-05-24 21:06:34 +00004379 }else{
danielk19771cc5ed82007-05-16 17:28:43 +00004380 chldPg = get4byte(findCell(pPage, lwr));
drh72f82862001-05-24 21:06:34 +00004381 }
4382 if( chldPg==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00004383 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
danielk19775cb09632009-07-09 11:36:01 +00004384 *pRes = c;
drh1e968a02008-03-25 00:22:21 +00004385 rc = SQLITE_OK;
4386 goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00004387 }
drhf49661a2008-12-10 16:45:50 +00004388 pCur->aiIdx[pCur->iPage] = (u16)lwr;
drh271efa52004-05-30 19:19:05 +00004389 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004390 pCur->validNKey = 0;
drh8178a752003-01-05 21:41:40 +00004391 rc = moveToChild(pCur, chldPg);
drh1e968a02008-03-25 00:22:21 +00004392 if( rc ) goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00004393 }
drh1e968a02008-03-25 00:22:21 +00004394moveto_finish:
drhe63d9992008-08-13 19:11:48 +00004395 return rc;
4396}
4397
drhd677b3d2007-08-20 22:48:41 +00004398
drh72f82862001-05-24 21:06:34 +00004399/*
drhc39e0002004-05-07 23:50:57 +00004400** Return TRUE if the cursor is not pointing at an entry of the table.
4401**
4402** TRUE will be returned after a call to sqlite3BtreeNext() moves
4403** past the last entry in the table or sqlite3BtreePrev() moves past
4404** the first entry. TRUE is also returned if the table is empty.
4405*/
4406int sqlite3BtreeEof(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00004407 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4408 ** have been deleted? This API will need to change to return an error code
4409 ** as well as the boolean result value.
4410 */
4411 return (CURSOR_VALID!=pCur->eState);
drhc39e0002004-05-07 23:50:57 +00004412}
4413
4414/*
drhbd03cae2001-06-02 02:40:57 +00004415** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00004416** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00004417** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00004418** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00004419*/
drhd094db12008-04-03 21:46:57 +00004420int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00004421 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00004422 int idx;
danielk197797a227c2006-01-20 16:32:04 +00004423 MemPage *pPage;
drh8b18dd42004-05-12 19:18:15 +00004424
drh1fee73e2007-08-29 04:00:57 +00004425 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00004426 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00004427 if( rc!=SQLITE_OK ){
4428 return rc;
4429 }
drh8c4d3a62007-04-06 01:03:32 +00004430 assert( pRes!=0 );
drh8c4d3a62007-04-06 01:03:32 +00004431 if( CURSOR_INVALID==pCur->eState ){
4432 *pRes = 1;
4433 return SQLITE_OK;
4434 }
drh4c301aa2009-07-15 17:25:45 +00004435 if( pCur->skipNext>0 ){
4436 pCur->skipNext = 0;
danielk1977da184232006-01-05 11:34:32 +00004437 *pRes = 0;
4438 return SQLITE_OK;
4439 }
drh4c301aa2009-07-15 17:25:45 +00004440 pCur->skipNext = 0;
danielk1977da184232006-01-05 11:34:32 +00004441
danielk197771d5d2c2008-09-29 11:49:47 +00004442 pPage = pCur->apPage[pCur->iPage];
4443 idx = ++pCur->aiIdx[pCur->iPage];
4444 assert( pPage->isInit );
4445 assert( idx<=pPage->nCell );
danielk19776a43f9b2004-11-16 04:57:24 +00004446
drh271efa52004-05-30 19:19:05 +00004447 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004448 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004449 if( idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00004450 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00004451 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
drh5e2f8b92001-05-28 00:41:15 +00004452 if( rc ) return rc;
4453 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00004454 *pRes = 0;
4455 return rc;
drh72f82862001-05-24 21:06:34 +00004456 }
drh5e2f8b92001-05-28 00:41:15 +00004457 do{
danielk197771d5d2c2008-09-29 11:49:47 +00004458 if( pCur->iPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00004459 *pRes = 1;
danielk1977da184232006-01-05 11:34:32 +00004460 pCur->eState = CURSOR_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00004461 return SQLITE_OK;
4462 }
danielk197730548662009-07-09 05:07:37 +00004463 moveToParent(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00004464 pPage = pCur->apPage[pCur->iPage];
4465 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00004466 *pRes = 0;
drh44845222008-07-17 18:39:57 +00004467 if( pPage->intKey ){
drh8b18dd42004-05-12 19:18:15 +00004468 rc = sqlite3BtreeNext(pCur, pRes);
4469 }else{
4470 rc = SQLITE_OK;
4471 }
4472 return rc;
drh8178a752003-01-05 21:41:40 +00004473 }
4474 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00004475 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00004476 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00004477 }
drh5e2f8b92001-05-28 00:41:15 +00004478 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00004479 return rc;
drh72f82862001-05-24 21:06:34 +00004480}
drhd677b3d2007-08-20 22:48:41 +00004481
drh72f82862001-05-24 21:06:34 +00004482
drh3b7511c2001-05-26 13:15:44 +00004483/*
drh2dcc9aa2002-12-04 13:40:25 +00004484** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00004485** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00004486** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00004487** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00004488*/
drhd094db12008-04-03 21:46:57 +00004489int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00004490 int rc;
drh8178a752003-01-05 21:41:40 +00004491 MemPage *pPage;
danielk1977da184232006-01-05 11:34:32 +00004492
drh1fee73e2007-08-29 04:00:57 +00004493 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00004494 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00004495 if( rc!=SQLITE_OK ){
4496 return rc;
4497 }
drha2c20e42008-03-29 16:01:04 +00004498 pCur->atLast = 0;
drh8c4d3a62007-04-06 01:03:32 +00004499 if( CURSOR_INVALID==pCur->eState ){
4500 *pRes = 1;
4501 return SQLITE_OK;
4502 }
drh4c301aa2009-07-15 17:25:45 +00004503 if( pCur->skipNext<0 ){
4504 pCur->skipNext = 0;
danielk1977da184232006-01-05 11:34:32 +00004505 *pRes = 0;
4506 return SQLITE_OK;
4507 }
drh4c301aa2009-07-15 17:25:45 +00004508 pCur->skipNext = 0;
danielk1977da184232006-01-05 11:34:32 +00004509
danielk197771d5d2c2008-09-29 11:49:47 +00004510 pPage = pCur->apPage[pCur->iPage];
4511 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00004512 if( !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00004513 int idx = pCur->aiIdx[pCur->iPage];
4514 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
drhd677b3d2007-08-20 22:48:41 +00004515 if( rc ){
4516 return rc;
4517 }
drh2dcc9aa2002-12-04 13:40:25 +00004518 rc = moveToRightmost(pCur);
4519 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00004520 while( pCur->aiIdx[pCur->iPage]==0 ){
4521 if( pCur->iPage==0 ){
danielk1977da184232006-01-05 11:34:32 +00004522 pCur->eState = CURSOR_INVALID;
drhc39e0002004-05-07 23:50:57 +00004523 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00004524 return SQLITE_OK;
4525 }
danielk197730548662009-07-09 05:07:37 +00004526 moveToParent(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00004527 }
drh271efa52004-05-30 19:19:05 +00004528 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004529 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004530
4531 pCur->aiIdx[pCur->iPage]--;
4532 pPage = pCur->apPage[pCur->iPage];
drh44845222008-07-17 18:39:57 +00004533 if( pPage->intKey && !pPage->leaf ){
drh8b18dd42004-05-12 19:18:15 +00004534 rc = sqlite3BtreePrevious(pCur, pRes);
4535 }else{
4536 rc = SQLITE_OK;
4537 }
drh2dcc9aa2002-12-04 13:40:25 +00004538 }
drh8178a752003-01-05 21:41:40 +00004539 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00004540 return rc;
4541}
4542
4543/*
drh3b7511c2001-05-26 13:15:44 +00004544** Allocate a new page from the database file.
4545**
danielk19773b8a05f2007-03-19 17:44:26 +00004546** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
drh3b7511c2001-05-26 13:15:44 +00004547** has already been called on the new page.) The new page has also
4548** been referenced and the calling routine is responsible for calling
danielk19773b8a05f2007-03-19 17:44:26 +00004549** sqlite3PagerUnref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00004550**
4551** SQLITE_OK is returned on success. Any other return value indicates
4552** an error. *ppPage and *pPgno are undefined in the event of an error.
danielk19773b8a05f2007-03-19 17:44:26 +00004553** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00004554**
drh199e3cf2002-07-18 11:01:47 +00004555** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4556** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00004557** attempt to keep related pages close to each other in the database file,
4558** which in turn can make database access faster.
danielk1977cb1a7eb2004-11-05 12:27:02 +00004559**
4560** If the "exact" parameter is not 0, and the page-number nearby exists
4561** anywhere on the free-list, then it is guarenteed to be returned. This
4562** is only used by auto-vacuum databases when allocating a new table.
drh3b7511c2001-05-26 13:15:44 +00004563*/
drh4f0c5872007-03-26 22:05:01 +00004564static int allocateBtreePage(
danielk1977aef0bf62005-12-30 16:28:01 +00004565 BtShared *pBt,
danielk1977cb1a7eb2004-11-05 12:27:02 +00004566 MemPage **ppPage,
4567 Pgno *pPgno,
4568 Pgno nearby,
4569 u8 exact
4570){
drh3aac2dd2004-04-26 14:10:20 +00004571 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00004572 int rc;
drh35cd6432009-06-05 14:17:21 +00004573 u32 n; /* Number of pages on the freelist */
drh042d6a12009-06-17 13:57:16 +00004574 u32 k; /* Number of leaves on the trunk of the freelist */
drhd3627af2006-12-18 18:34:51 +00004575 MemPage *pTrunk = 0;
4576 MemPage *pPrevTrunk = 0;
drh1662b5a2009-06-04 19:06:09 +00004577 Pgno mxPage; /* Total size of the database file */
drh30e58752002-03-02 20:41:57 +00004578
drh1fee73e2007-08-29 04:00:57 +00004579 assert( sqlite3_mutex_held(pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004580 pPage1 = pBt->pPage1;
drh1662b5a2009-06-04 19:06:09 +00004581 mxPage = pagerPagecount(pBt);
drh3aac2dd2004-04-26 14:10:20 +00004582 n = get4byte(&pPage1->aData[36]);
drhdf35a082009-07-09 02:24:35 +00004583 testcase( n==mxPage-1 );
4584 if( n>=mxPage ){
drh1662b5a2009-06-04 19:06:09 +00004585 return SQLITE_CORRUPT_BKPT;
4586 }
drh3aac2dd2004-04-26 14:10:20 +00004587 if( n>0 ){
drh91025292004-05-03 19:49:32 +00004588 /* There are pages on the freelist. Reuse one of those pages. */
danielk1977cb1a7eb2004-11-05 12:27:02 +00004589 Pgno iTrunk;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004590 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4591
4592 /* If the 'exact' parameter was true and a query of the pointer-map
4593 ** shows that the page 'nearby' is somewhere on the free-list, then
4594 ** the entire-list will be searched for that page.
4595 */
4596#ifndef SQLITE_OMIT_AUTOVACUUM
drh1662b5a2009-06-04 19:06:09 +00004597 if( exact && nearby<=mxPage ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004598 u8 eType;
4599 assert( nearby>0 );
4600 assert( pBt->autoVacuum );
4601 rc = ptrmapGet(pBt, nearby, &eType, 0);
4602 if( rc ) return rc;
4603 if( eType==PTRMAP_FREEPAGE ){
4604 searchList = 1;
4605 }
4606 *pPgno = nearby;
4607 }
4608#endif
4609
4610 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4611 ** first free-list trunk page. iPrevTrunk is initially 1.
4612 */
danielk19773b8a05f2007-03-19 17:44:26 +00004613 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3b7511c2001-05-26 13:15:44 +00004614 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004615 put4byte(&pPage1->aData[36], n-1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004616
4617 /* The code within this loop is run only once if the 'searchList' variable
4618 ** is not true. Otherwise, it runs once for each trunk-page on the
4619 ** free-list until the page 'nearby' is located.
4620 */
4621 do {
4622 pPrevTrunk = pTrunk;
4623 if( pPrevTrunk ){
4624 iTrunk = get4byte(&pPrevTrunk->aData[0]);
drhbea00b92002-07-08 10:59:50 +00004625 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00004626 iTrunk = get4byte(&pPage1->aData[32]);
drhbea00b92002-07-08 10:59:50 +00004627 }
drhdf35a082009-07-09 02:24:35 +00004628 testcase( iTrunk==mxPage );
drh1662b5a2009-06-04 19:06:09 +00004629 if( iTrunk>mxPage ){
4630 rc = SQLITE_CORRUPT_BKPT;
4631 }else{
danielk197730548662009-07-09 05:07:37 +00004632 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
drh1662b5a2009-06-04 19:06:09 +00004633 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004634 if( rc ){
drhd3627af2006-12-18 18:34:51 +00004635 pTrunk = 0;
4636 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004637 }
4638
4639 k = get4byte(&pTrunk->aData[4]);
4640 if( k==0 && !searchList ){
4641 /* The trunk has no leaves and the list is not being searched.
4642 ** So extract the trunk page itself and use it as the newly
4643 ** allocated page */
4644 assert( pPrevTrunk==0 );
danielk19773b8a05f2007-03-19 17:44:26 +00004645 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004646 if( rc ){
4647 goto end_allocate_page;
4648 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004649 *pPgno = iTrunk;
4650 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4651 *ppPage = pTrunk;
4652 pTrunk = 0;
4653 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh042d6a12009-06-17 13:57:16 +00004654 }else if( k>(u32)(pBt->usableSize/4 - 2) ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004655 /* Value of k is out of range. Database corruption */
drhd3627af2006-12-18 18:34:51 +00004656 rc = SQLITE_CORRUPT_BKPT;
4657 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004658#ifndef SQLITE_OMIT_AUTOVACUUM
4659 }else if( searchList && nearby==iTrunk ){
4660 /* The list is being searched and this trunk page is the page
4661 ** to allocate, regardless of whether it has leaves.
4662 */
4663 assert( *pPgno==iTrunk );
4664 *ppPage = pTrunk;
4665 searchList = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00004666 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004667 if( rc ){
4668 goto end_allocate_page;
4669 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004670 if( k==0 ){
4671 if( !pPrevTrunk ){
4672 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4673 }else{
4674 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4675 }
4676 }else{
4677 /* The trunk page is required by the caller but it contains
4678 ** pointers to free-list leaves. The first leaf becomes a trunk
4679 ** page in this case.
4680 */
4681 MemPage *pNewTrunk;
4682 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
drh1662b5a2009-06-04 19:06:09 +00004683 if( iNewTrunk>mxPage ){
4684 rc = SQLITE_CORRUPT_BKPT;
4685 goto end_allocate_page;
4686 }
drhdf35a082009-07-09 02:24:35 +00004687 testcase( iNewTrunk==mxPage );
danielk197730548662009-07-09 05:07:37 +00004688 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004689 if( rc!=SQLITE_OK ){
drhd3627af2006-12-18 18:34:51 +00004690 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004691 }
danielk19773b8a05f2007-03-19 17:44:26 +00004692 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004693 if( rc!=SQLITE_OK ){
4694 releasePage(pNewTrunk);
drhd3627af2006-12-18 18:34:51 +00004695 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004696 }
4697 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4698 put4byte(&pNewTrunk->aData[4], k-1);
4699 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
drhd3627af2006-12-18 18:34:51 +00004700 releasePage(pNewTrunk);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004701 if( !pPrevTrunk ){
drhc5053fb2008-11-27 02:22:10 +00004702 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
danielk1977cb1a7eb2004-11-05 12:27:02 +00004703 put4byte(&pPage1->aData[32], iNewTrunk);
4704 }else{
danielk19773b8a05f2007-03-19 17:44:26 +00004705 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004706 if( rc ){
4707 goto end_allocate_page;
4708 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004709 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4710 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004711 }
4712 pTrunk = 0;
4713 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4714#endif
danielk1977e5765212009-06-17 11:13:28 +00004715 }else if( k>0 ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004716 /* Extract a leaf from the trunk */
drh042d6a12009-06-17 13:57:16 +00004717 u32 closest;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004718 Pgno iPage;
4719 unsigned char *aData = pTrunk->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00004720 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004721 if( rc ){
4722 goto end_allocate_page;
4723 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004724 if( nearby>0 ){
drh042d6a12009-06-17 13:57:16 +00004725 u32 i;
4726 int dist;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004727 closest = 0;
4728 dist = get4byte(&aData[8]) - nearby;
4729 if( dist<0 ) dist = -dist;
4730 for(i=1; i<k; i++){
4731 int d2 = get4byte(&aData[8+i*4]) - nearby;
4732 if( d2<0 ) d2 = -d2;
4733 if( d2<dist ){
4734 closest = i;
4735 dist = d2;
4736 }
4737 }
4738 }else{
4739 closest = 0;
4740 }
4741
4742 iPage = get4byte(&aData[8+closest*4]);
drhdf35a082009-07-09 02:24:35 +00004743 testcase( iPage==mxPage );
drh1662b5a2009-06-04 19:06:09 +00004744 if( iPage>mxPage ){
4745 rc = SQLITE_CORRUPT_BKPT;
4746 goto end_allocate_page;
4747 }
drhdf35a082009-07-09 02:24:35 +00004748 testcase( iPage==mxPage );
danielk1977cb1a7eb2004-11-05 12:27:02 +00004749 if( !searchList || iPage==nearby ){
danielk1977bea2a942009-01-20 17:06:27 +00004750 int noContent;
shane1f9e6aa2008-06-09 19:27:11 +00004751 *pPgno = iPage;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004752 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4753 ": %d more free pages\n",
4754 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4755 if( closest<k-1 ){
4756 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4757 }
4758 put4byte(&aData[4], k-1);
drhc5053fb2008-11-27 02:22:10 +00004759 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
danielk1977bea2a942009-01-20 17:06:27 +00004760 noContent = !btreeGetHasContent(pBt, *pPgno);
danielk197730548662009-07-09 05:07:37 +00004761 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004762 if( rc==SQLITE_OK ){
danielk19773b8a05f2007-03-19 17:44:26 +00004763 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004764 if( rc!=SQLITE_OK ){
4765 releasePage(*ppPage);
4766 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004767 }
4768 searchList = 0;
4769 }
drhee696e22004-08-30 16:52:17 +00004770 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004771 releasePage(pPrevTrunk);
drhd3627af2006-12-18 18:34:51 +00004772 pPrevTrunk = 0;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004773 }while( searchList );
drh3b7511c2001-05-26 13:15:44 +00004774 }else{
drh3aac2dd2004-04-26 14:10:20 +00004775 /* There are no pages on the freelist, so create a new page at the
4776 ** end of the file */
danielk197789d40042008-11-17 14:20:56 +00004777 int nPage = pagerPagecount(pBt);
danielk1977ad0132d2008-06-07 08:58:22 +00004778 *pPgno = nPage + 1;
danielk1977afcdd022004-10-31 16:25:42 +00004779
danielk1977bea2a942009-01-20 17:06:27 +00004780 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4781 (*pPgno)++;
4782 }
4783
danielk1977afcdd022004-10-31 16:25:42 +00004784#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977266664d2006-02-10 08:24:21 +00004785 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
danielk1977afcdd022004-10-31 16:25:42 +00004786 /* If *pPgno refers to a pointer-map page, allocate two new pages
4787 ** at the end of the file instead of one. The first allocated page
4788 ** becomes a new pointer-map page, the second is used by the caller.
4789 */
danielk1977ac861692009-03-28 10:54:22 +00004790 MemPage *pPg = 0;
danielk1977afcdd022004-10-31 16:25:42 +00004791 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
danielk1977599fcba2004-11-08 07:13:13 +00004792 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk197730548662009-07-09 05:07:37 +00004793 rc = btreeGetPage(pBt, *pPgno, &pPg, 0);
danielk1977ac861692009-03-28 10:54:22 +00004794 if( rc==SQLITE_OK ){
4795 rc = sqlite3PagerWrite(pPg->pDbPage);
4796 releasePage(pPg);
4797 }
4798 if( rc ) return rc;
danielk1977afcdd022004-10-31 16:25:42 +00004799 (*pPgno)++;
drh72190432008-01-31 14:54:43 +00004800 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
danielk1977afcdd022004-10-31 16:25:42 +00004801 }
4802#endif
4803
danielk1977599fcba2004-11-08 07:13:13 +00004804 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk197730548662009-07-09 05:07:37 +00004805 rc = btreeGetPage(pBt, *pPgno, ppPage, 0);
drh3b7511c2001-05-26 13:15:44 +00004806 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00004807 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004808 if( rc!=SQLITE_OK ){
4809 releasePage(*ppPage);
4810 }
drh3a4c1412004-05-09 20:40:11 +00004811 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00004812 }
danielk1977599fcba2004-11-08 07:13:13 +00004813
4814 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drhd3627af2006-12-18 18:34:51 +00004815
4816end_allocate_page:
4817 releasePage(pTrunk);
4818 releasePage(pPrevTrunk);
danielk1977b247c212008-11-21 09:09:01 +00004819 if( rc==SQLITE_OK ){
4820 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4821 releasePage(*ppPage);
4822 return SQLITE_CORRUPT_BKPT;
4823 }
4824 (*ppPage)->isInit = 0;
danielk1977a50d9aa2009-06-08 14:49:45 +00004825 }else{
4826 *ppPage = 0;
danielk1977eaa06f62008-09-18 17:34:44 +00004827 }
drh3b7511c2001-05-26 13:15:44 +00004828 return rc;
4829}
4830
4831/*
danielk1977bea2a942009-01-20 17:06:27 +00004832** This function is used to add page iPage to the database file free-list.
4833** It is assumed that the page is not already a part of the free-list.
drh5e2f8b92001-05-28 00:41:15 +00004834**
danielk1977bea2a942009-01-20 17:06:27 +00004835** The value passed as the second argument to this function is optional.
4836** If the caller happens to have a pointer to the MemPage object
4837** corresponding to page iPage handy, it may pass it as the second value.
4838** Otherwise, it may pass NULL.
4839**
4840** If a pointer to a MemPage object is passed as the second argument,
4841** its reference count is not altered by this function.
drh3b7511c2001-05-26 13:15:44 +00004842*/
danielk1977bea2a942009-01-20 17:06:27 +00004843static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
4844 MemPage *pTrunk = 0; /* Free-list trunk page */
4845 Pgno iTrunk = 0; /* Page number of free-list trunk page */
4846 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
4847 MemPage *pPage; /* Page being freed. May be NULL. */
4848 int rc; /* Return Code */
4849 int nFree; /* Initial number of pages on free-list */
drh8b2f49b2001-06-08 00:21:52 +00004850
danielk1977bea2a942009-01-20 17:06:27 +00004851 assert( sqlite3_mutex_held(pBt->mutex) );
4852 assert( iPage>1 );
4853 assert( !pMemPage || pMemPage->pgno==iPage );
4854
4855 if( pMemPage ){
4856 pPage = pMemPage;
4857 sqlite3PagerRef(pPage->pDbPage);
4858 }else{
4859 pPage = btreePageLookup(pBt, iPage);
4860 }
drh3aac2dd2004-04-26 14:10:20 +00004861
drha34b6762004-05-07 13:30:42 +00004862 /* Increment the free page count on pPage1 */
danielk19773b8a05f2007-03-19 17:44:26 +00004863 rc = sqlite3PagerWrite(pPage1->pDbPage);
danielk1977bea2a942009-01-20 17:06:27 +00004864 if( rc ) goto freepage_out;
4865 nFree = get4byte(&pPage1->aData[36]);
4866 put4byte(&pPage1->aData[36], nFree+1);
drh3aac2dd2004-04-26 14:10:20 +00004867
drhfcce93f2006-02-22 03:08:32 +00004868#ifdef SQLITE_SECURE_DELETE
4869 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4870 ** always fully overwrite deleted information with zeros.
4871 */
danielk197730548662009-07-09 05:07:37 +00004872 if( (!pPage && (rc = btreeGetPage(pBt, iPage, &pPage, 0)))
danielk1977bea2a942009-01-20 17:06:27 +00004873 || (rc = sqlite3PagerWrite(pPage->pDbPage))
4874 ){
4875 goto freepage_out;
4876 }
drhfcce93f2006-02-22 03:08:32 +00004877 memset(pPage->aData, 0, pPage->pBt->pageSize);
4878#endif
4879
danielk1977687566d2004-11-02 12:56:41 +00004880 /* If the database supports auto-vacuum, write an entry in the pointer-map
danielk1977cb1a7eb2004-11-05 12:27:02 +00004881 ** to indicate that the page is free.
danielk1977687566d2004-11-02 12:56:41 +00004882 */
danielk197785d90ca2008-07-19 14:25:15 +00004883 if( ISAUTOVACUUM ){
drh98add2e2009-07-20 17:11:49 +00004884 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
danielk1977bea2a942009-01-20 17:06:27 +00004885 if( rc ) goto freepage_out;
danielk1977687566d2004-11-02 12:56:41 +00004886 }
danielk1977687566d2004-11-02 12:56:41 +00004887
danielk1977bea2a942009-01-20 17:06:27 +00004888 /* Now manipulate the actual database free-list structure. There are two
4889 ** possibilities. If the free-list is currently empty, or if the first
4890 ** trunk page in the free-list is full, then this page will become a
4891 ** new free-list trunk page. Otherwise, it will become a leaf of the
4892 ** first trunk page in the current free-list. This block tests if it
4893 ** is possible to add the page as a new free-list leaf.
4894 */
4895 if( nFree!=0 ){
drhc046e3e2009-07-15 11:26:44 +00004896 u32 nLeaf; /* Initial number of leaf cells on trunk page */
danielk1977bea2a942009-01-20 17:06:27 +00004897
4898 iTrunk = get4byte(&pPage1->aData[32]);
danielk197730548662009-07-09 05:07:37 +00004899 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
danielk1977bea2a942009-01-20 17:06:27 +00004900 if( rc!=SQLITE_OK ){
4901 goto freepage_out;
4902 }
4903
4904 nLeaf = get4byte(&pTrunk->aData[4]);
drheeb844a2009-08-08 18:01:07 +00004905 assert( pBt->usableSize>32 );
4906 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
danielk1977bea2a942009-01-20 17:06:27 +00004907 rc = SQLITE_CORRUPT_BKPT;
4908 goto freepage_out;
4909 }
drheeb844a2009-08-08 18:01:07 +00004910 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
danielk1977bea2a942009-01-20 17:06:27 +00004911 /* In this case there is room on the trunk page to insert the page
4912 ** being freed as a new leaf.
drh45b1fac2008-07-04 17:52:42 +00004913 **
4914 ** Note that the trunk page is not really full until it contains
4915 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4916 ** coded. But due to a coding error in versions of SQLite prior to
4917 ** 3.6.0, databases with freelist trunk pages holding more than
4918 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4919 ** to maintain backwards compatibility with older versions of SQLite,
drhc046e3e2009-07-15 11:26:44 +00004920 ** we will continue to restrict the number of entries to usableSize/4 - 8
drh45b1fac2008-07-04 17:52:42 +00004921 ** for now. At some point in the future (once everyone has upgraded
4922 ** to 3.6.0 or later) we should consider fixing the conditional above
4923 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4924 */
danielk19773b8a05f2007-03-19 17:44:26 +00004925 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhf5345442007-04-09 12:45:02 +00004926 if( rc==SQLITE_OK ){
danielk1977bea2a942009-01-20 17:06:27 +00004927 put4byte(&pTrunk->aData[4], nLeaf+1);
4928 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
drhfcce93f2006-02-22 03:08:32 +00004929#ifndef SQLITE_SECURE_DELETE
danielk1977bea2a942009-01-20 17:06:27 +00004930 if( pPage ){
4931 sqlite3PagerDontWrite(pPage->pDbPage);
4932 }
drhfcce93f2006-02-22 03:08:32 +00004933#endif
danielk1977bea2a942009-01-20 17:06:27 +00004934 rc = btreeSetHasContent(pBt, iPage);
drhf5345442007-04-09 12:45:02 +00004935 }
drh3a4c1412004-05-09 20:40:11 +00004936 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
danielk1977bea2a942009-01-20 17:06:27 +00004937 goto freepage_out;
drh3aac2dd2004-04-26 14:10:20 +00004938 }
drh3b7511c2001-05-26 13:15:44 +00004939 }
danielk1977bea2a942009-01-20 17:06:27 +00004940
4941 /* If control flows to this point, then it was not possible to add the
4942 ** the page being freed as a leaf page of the first trunk in the free-list.
4943 ** Possibly because the free-list is empty, or possibly because the
4944 ** first trunk in the free-list is full. Either way, the page being freed
4945 ** will become the new first trunk page in the free-list.
4946 */
drhc046e3e2009-07-15 11:26:44 +00004947 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
4948 goto freepage_out;
4949 }
4950 rc = sqlite3PagerWrite(pPage->pDbPage);
4951 if( rc!=SQLITE_OK ){
danielk1977bea2a942009-01-20 17:06:27 +00004952 goto freepage_out;
4953 }
4954 put4byte(pPage->aData, iTrunk);
4955 put4byte(&pPage->aData[4], 0);
4956 put4byte(&pPage1->aData[32], iPage);
4957 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
4958
4959freepage_out:
4960 if( pPage ){
4961 pPage->isInit = 0;
4962 }
4963 releasePage(pPage);
4964 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00004965 return rc;
4966}
drhc314dc72009-07-21 11:52:34 +00004967static void freePage(MemPage *pPage, int *pRC){
4968 if( (*pRC)==SQLITE_OK ){
4969 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
4970 }
danielk1977bea2a942009-01-20 17:06:27 +00004971}
drh3b7511c2001-05-26 13:15:44 +00004972
4973/*
drh3aac2dd2004-04-26 14:10:20 +00004974** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00004975*/
drh3aac2dd2004-04-26 14:10:20 +00004976static int clearCell(MemPage *pPage, unsigned char *pCell){
danielk1977aef0bf62005-12-30 16:28:01 +00004977 BtShared *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00004978 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00004979 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00004980 int rc;
drh94440812007-03-06 11:42:19 +00004981 int nOvfl;
shane63207ab2009-02-04 01:49:30 +00004982 u16 ovflPageSize;
drh3b7511c2001-05-26 13:15:44 +00004983
drh1fee73e2007-08-29 04:00:57 +00004984 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197730548662009-07-09 05:07:37 +00004985 btreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004986 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00004987 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00004988 }
drh6f11bef2004-05-13 01:12:56 +00004989 ovflPgno = get4byte(&pCell[info.iOverflow]);
shane63207ab2009-02-04 01:49:30 +00004990 assert( pBt->usableSize > 4 );
drh94440812007-03-06 11:42:19 +00004991 ovflPageSize = pBt->usableSize - 4;
drh72365832007-03-06 15:53:44 +00004992 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4993 assert( ovflPgno==0 || nOvfl>0 );
4994 while( nOvfl-- ){
shane63207ab2009-02-04 01:49:30 +00004995 Pgno iNext = 0;
danielk1977bea2a942009-01-20 17:06:27 +00004996 MemPage *pOvfl = 0;
danielk1977e589a672009-04-11 16:06:15 +00004997 if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){
4998 /* 0 is not a legal page number and page 1 cannot be an
4999 ** overflow page. Therefore if ovflPgno<2 or past the end of the
5000 ** file the database must be corrupt. */
drh49285702005-09-17 15:20:26 +00005001 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00005002 }
danielk1977bea2a942009-01-20 17:06:27 +00005003 if( nOvfl ){
5004 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5005 if( rc ) return rc;
5006 }
5007 rc = freePage2(pBt, pOvfl, ovflPgno);
5008 if( pOvfl ){
5009 sqlite3PagerUnref(pOvfl->pDbPage);
5010 }
drh3b7511c2001-05-26 13:15:44 +00005011 if( rc ) return rc;
danielk1977bea2a942009-01-20 17:06:27 +00005012 ovflPgno = iNext;
drh3b7511c2001-05-26 13:15:44 +00005013 }
drh5e2f8b92001-05-28 00:41:15 +00005014 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00005015}
5016
5017/*
drh91025292004-05-03 19:49:32 +00005018** Create the byte sequence used to represent a cell on page pPage
5019** and write that byte sequence into pCell[]. Overflow pages are
5020** allocated and filled in as necessary. The calling procedure
5021** is responsible for making sure sufficient space has been allocated
5022** for pCell[].
5023**
5024** Note that pCell does not necessary need to point to the pPage->aData
5025** area. pCell might point to some temporary storage. The cell will
5026** be constructed in this temporary area then copied into pPage->aData
5027** later.
drh3b7511c2001-05-26 13:15:44 +00005028*/
5029static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00005030 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00005031 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00005032 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00005033 const void *pData,int nData, /* The data */
drhb026e052007-05-02 01:34:31 +00005034 int nZero, /* Extra zero bytes to append to pData */
drh4b70f112004-05-02 21:12:19 +00005035 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00005036){
drh3b7511c2001-05-26 13:15:44 +00005037 int nPayload;
drh8c6fa9b2004-05-26 00:01:53 +00005038 const u8 *pSrc;
drha34b6762004-05-07 13:30:42 +00005039 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00005040 int spaceLeft;
5041 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00005042 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00005043 unsigned char *pPrior;
5044 unsigned char *pPayload;
danielk1977aef0bf62005-12-30 16:28:01 +00005045 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00005046 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00005047 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00005048 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00005049
drh1fee73e2007-08-29 04:00:57 +00005050 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00005051
drhc5053fb2008-11-27 02:22:10 +00005052 /* pPage is not necessarily writeable since pCell might be auxiliary
5053 ** buffer space that is separate from the pPage buffer area */
5054 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5055 || sqlite3PagerIswriteable(pPage->pDbPage) );
5056
drh91025292004-05-03 19:49:32 +00005057 /* Fill in the header. */
drh43605152004-05-29 21:46:49 +00005058 nHeader = 0;
drh91025292004-05-03 19:49:32 +00005059 if( !pPage->leaf ){
5060 nHeader += 4;
5061 }
drh8b18dd42004-05-12 19:18:15 +00005062 if( pPage->hasData ){
drhb026e052007-05-02 01:34:31 +00005063 nHeader += putVarint(&pCell[nHeader], nData+nZero);
drh6f11bef2004-05-13 01:12:56 +00005064 }else{
drhb026e052007-05-02 01:34:31 +00005065 nData = nZero = 0;
drh91025292004-05-03 19:49:32 +00005066 }
drh6f11bef2004-05-13 01:12:56 +00005067 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
danielk197730548662009-07-09 05:07:37 +00005068 btreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00005069 assert( info.nHeader==nHeader );
5070 assert( info.nKey==nKey );
danielk197789d40042008-11-17 14:20:56 +00005071 assert( info.nData==(u32)(nData+nZero) );
drh6f11bef2004-05-13 01:12:56 +00005072
5073 /* Fill in the payload */
drhb026e052007-05-02 01:34:31 +00005074 nPayload = nData + nZero;
drh3aac2dd2004-04-26 14:10:20 +00005075 if( pPage->intKey ){
5076 pSrc = pData;
5077 nSrc = nData;
drh91025292004-05-03 19:49:32 +00005078 nData = 0;
drhf49661a2008-12-10 16:45:50 +00005079 }else{
danielk197731d31b82009-07-13 13:18:07 +00005080 if( NEVER(nKey>0x7fffffff || pKey==0) ){
5081 return SQLITE_CORRUPT_BKPT;
drh20abac22009-01-28 20:21:17 +00005082 }
drhf49661a2008-12-10 16:45:50 +00005083 nPayload += (int)nKey;
drh3aac2dd2004-04-26 14:10:20 +00005084 pSrc = pKey;
drhf49661a2008-12-10 16:45:50 +00005085 nSrc = (int)nKey;
drh3aac2dd2004-04-26 14:10:20 +00005086 }
drh6f11bef2004-05-13 01:12:56 +00005087 *pnSize = info.nSize;
5088 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00005089 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00005090 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00005091
drh3b7511c2001-05-26 13:15:44 +00005092 while( nPayload>0 ){
5093 if( spaceLeft==0 ){
danielk1977afcdd022004-10-31 16:25:42 +00005094#ifndef SQLITE_OMIT_AUTOVACUUM
5095 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
danielk1977b39f70b2007-05-17 18:28:11 +00005096 if( pBt->autoVacuum ){
5097 do{
5098 pgnoOvfl++;
5099 } while(
5100 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5101 );
danielk1977b39f70b2007-05-17 18:28:11 +00005102 }
danielk1977afcdd022004-10-31 16:25:42 +00005103#endif
drhf49661a2008-12-10 16:45:50 +00005104 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
danielk1977afcdd022004-10-31 16:25:42 +00005105#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977a19df672004-11-03 11:37:07 +00005106 /* If the database supports auto-vacuum, and the second or subsequent
5107 ** overflow page is being allocated, add an entry to the pointer-map
danielk19774ef24492007-05-23 09:52:41 +00005108 ** for that page now.
5109 **
5110 ** If this is the first overflow page, then write a partial entry
5111 ** to the pointer-map. If we write nothing to this pointer-map slot,
5112 ** then the optimistic overflow chain processing in clearCell()
5113 ** may misinterpret the uninitialised values and delete the
5114 ** wrong pages from the database.
danielk1977afcdd022004-10-31 16:25:42 +00005115 */
danielk19774ef24492007-05-23 09:52:41 +00005116 if( pBt->autoVacuum && rc==SQLITE_OK ){
5117 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
drh98add2e2009-07-20 17:11:49 +00005118 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
danielk197789a4be82007-05-23 13:34:32 +00005119 if( rc ){
5120 releasePage(pOvfl);
5121 }
danielk1977afcdd022004-10-31 16:25:42 +00005122 }
5123#endif
drh3b7511c2001-05-26 13:15:44 +00005124 if( rc ){
drh9b171272004-05-08 02:03:22 +00005125 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00005126 return rc;
5127 }
drhc5053fb2008-11-27 02:22:10 +00005128
5129 /* If pToRelease is not zero than pPrior points into the data area
5130 ** of pToRelease. Make sure pToRelease is still writeable. */
5131 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5132
5133 /* If pPrior is part of the data area of pPage, then make sure pPage
5134 ** is still writeable */
5135 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5136 || sqlite3PagerIswriteable(pPage->pDbPage) );
5137
drh3aac2dd2004-04-26 14:10:20 +00005138 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00005139 releasePage(pToRelease);
5140 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00005141 pPrior = pOvfl->aData;
5142 put4byte(pPrior, 0);
5143 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00005144 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00005145 }
5146 n = nPayload;
5147 if( n>spaceLeft ) n = spaceLeft;
drhc5053fb2008-11-27 02:22:10 +00005148
5149 /* If pToRelease is not zero than pPayload points into the data area
5150 ** of pToRelease. Make sure pToRelease is still writeable. */
5151 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5152
5153 /* If pPayload is part of the data area of pPage, then make sure pPage
5154 ** is still writeable */
5155 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5156 || sqlite3PagerIswriteable(pPage->pDbPage) );
5157
drhb026e052007-05-02 01:34:31 +00005158 if( nSrc>0 ){
5159 if( n>nSrc ) n = nSrc;
5160 assert( pSrc );
5161 memcpy(pPayload, pSrc, n);
5162 }else{
5163 memset(pPayload, 0, n);
5164 }
drh3b7511c2001-05-26 13:15:44 +00005165 nPayload -= n;
drhde647132004-05-07 17:57:49 +00005166 pPayload += n;
drh9b171272004-05-08 02:03:22 +00005167 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00005168 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00005169 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00005170 if( nSrc==0 ){
5171 nSrc = nData;
5172 pSrc = pData;
5173 }
drhdd793422001-06-28 01:54:48 +00005174 }
drh9b171272004-05-08 02:03:22 +00005175 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00005176 return SQLITE_OK;
5177}
5178
drh14acc042001-06-10 19:56:58 +00005179/*
5180** Remove the i-th cell from pPage. This routine effects pPage only.
5181** The cell content is not freed or deallocated. It is assumed that
5182** the cell content has been copied someplace else. This routine just
5183** removes the reference to the cell from pPage.
5184**
5185** "sz" must be the number of bytes in the cell.
drh14acc042001-06-10 19:56:58 +00005186*/
drh98add2e2009-07-20 17:11:49 +00005187static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
drh43605152004-05-29 21:46:49 +00005188 int i; /* Loop counter */
5189 int pc; /* Offset to cell content of cell being deleted */
5190 u8 *data; /* pPage->aData */
5191 u8 *ptr; /* Used to move bytes around within data[] */
shanedcc50b72008-11-13 18:29:50 +00005192 int rc; /* The return code */
drhc314dc72009-07-21 11:52:34 +00005193 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
drh43605152004-05-29 21:46:49 +00005194
drh98add2e2009-07-20 17:11:49 +00005195 if( *pRC ) return;
5196
drh8c42ca92001-06-22 19:15:00 +00005197 assert( idx>=0 && idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00005198 assert( sz==cellSize(pPage, idx) );
danielk19773b8a05f2007-03-19 17:44:26 +00005199 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00005200 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhda200cc2004-05-09 11:51:38 +00005201 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00005202 ptr = &data[pPage->cellOffset + 2*idx];
shane0af3f892008-11-12 04:55:34 +00005203 pc = get2byte(ptr);
drhc314dc72009-07-21 11:52:34 +00005204 hdr = pPage->hdrOffset;
5205 testcase( pc==get2byte(&data[hdr+5]) );
5206 testcase( pc+sz==pPage->pBt->usableSize );
5207 if( pc < get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
drh98add2e2009-07-20 17:11:49 +00005208 *pRC = SQLITE_CORRUPT_BKPT;
5209 return;
shane0af3f892008-11-12 04:55:34 +00005210 }
shanedcc50b72008-11-13 18:29:50 +00005211 rc = freeSpace(pPage, pc, sz);
drh98add2e2009-07-20 17:11:49 +00005212 if( rc ){
5213 *pRC = rc;
5214 return;
shanedcc50b72008-11-13 18:29:50 +00005215 }
drh43605152004-05-29 21:46:49 +00005216 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
5217 ptr[0] = ptr[2];
5218 ptr[1] = ptr[3];
drh14acc042001-06-10 19:56:58 +00005219 }
5220 pPage->nCell--;
drhc314dc72009-07-21 11:52:34 +00005221 put2byte(&data[hdr+3], pPage->nCell);
drh43605152004-05-29 21:46:49 +00005222 pPage->nFree += 2;
drh14acc042001-06-10 19:56:58 +00005223}
5224
5225/*
5226** Insert a new cell on pPage at cell index "i". pCell points to the
5227** content of the cell.
5228**
5229** If the cell content will fit on the page, then put it there. If it
drh43605152004-05-29 21:46:49 +00005230** will not fit, then make a copy of the cell content into pTemp if
5231** pTemp is not null. Regardless of pTemp, allocate a new entry
5232** in pPage->aOvfl[] and make it point to the cell content (either
5233** in pTemp or the original pCell) and also record its index.
5234** Allocating a new entry in pPage->aCell[] implies that
5235** pPage->nOverflow is incremented.
danielk1977a3ad5e72005-01-07 08:56:44 +00005236**
5237** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5238** cell. The caller will overwrite them after this function returns. If
drh4b238df2005-01-08 15:43:18 +00005239** nSkip is non-zero, then pCell may not point to an invalid memory location
danielk1977a3ad5e72005-01-07 08:56:44 +00005240** (but pCell+nSkip is always valid).
drh14acc042001-06-10 19:56:58 +00005241*/
drh98add2e2009-07-20 17:11:49 +00005242static void insertCell(
drh24cd67e2004-05-10 16:18:47 +00005243 MemPage *pPage, /* Page into which we are copying */
drh43605152004-05-29 21:46:49 +00005244 int i, /* New cell becomes the i-th cell of the page */
5245 u8 *pCell, /* Content of the new cell */
5246 int sz, /* Bytes of content in pCell */
danielk1977a3ad5e72005-01-07 08:56:44 +00005247 u8 *pTemp, /* Temp storage space for pCell, if needed */
drh98add2e2009-07-20 17:11:49 +00005248 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */
5249 int *pRC /* Read and write return code from here */
drh24cd67e2004-05-10 16:18:47 +00005250){
drh43605152004-05-29 21:46:49 +00005251 int idx; /* Where to write new cell content in data[] */
5252 int j; /* Loop counter */
drh43605152004-05-29 21:46:49 +00005253 int end; /* First byte past the last cell pointer in data[] */
5254 int ins; /* Index in data[] where new cell pointer is inserted */
drh43605152004-05-29 21:46:49 +00005255 int cellOffset; /* Address of first cell pointer in data[] */
5256 u8 *data; /* The content of the whole page */
5257 u8 *ptr; /* Used for moving information around in data[] */
5258
danielk19774dbaa892009-06-16 16:50:22 +00005259 int nSkip = (iChild ? 4 : 0);
5260
drh98add2e2009-07-20 17:11:49 +00005261 if( *pRC ) return;
5262
drh43605152004-05-29 21:46:49 +00005263 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
drhf49661a2008-12-10 16:45:50 +00005264 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
5265 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
drh43605152004-05-29 21:46:49 +00005266 assert( sz==cellSizePtr(pPage, pCell) );
drh1fee73e2007-08-29 04:00:57 +00005267 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +00005268 if( pPage->nOverflow || sz+2>pPage->nFree ){
drh24cd67e2004-05-10 16:18:47 +00005269 if( pTemp ){
danielk1977a3ad5e72005-01-07 08:56:44 +00005270 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00005271 pCell = pTemp;
drh24cd67e2004-05-10 16:18:47 +00005272 }
danielk19774dbaa892009-06-16 16:50:22 +00005273 if( iChild ){
5274 put4byte(pCell, iChild);
5275 }
drh43605152004-05-29 21:46:49 +00005276 j = pPage->nOverflow++;
danielk197789d40042008-11-17 14:20:56 +00005277 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
drh43605152004-05-29 21:46:49 +00005278 pPage->aOvfl[j].pCell = pCell;
drhf49661a2008-12-10 16:45:50 +00005279 pPage->aOvfl[j].idx = (u16)i;
drh14acc042001-06-10 19:56:58 +00005280 }else{
danielk19776e465eb2007-08-21 13:11:00 +00005281 int rc = sqlite3PagerWrite(pPage->pDbPage);
5282 if( rc!=SQLITE_OK ){
drh98add2e2009-07-20 17:11:49 +00005283 *pRC = rc;
5284 return;
danielk19776e465eb2007-08-21 13:11:00 +00005285 }
5286 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +00005287 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00005288 cellOffset = pPage->cellOffset;
drh0a45c272009-07-08 01:49:11 +00005289 end = cellOffset + 2*pPage->nCell;
drh43605152004-05-29 21:46:49 +00005290 ins = cellOffset + 2*i;
drh0a45c272009-07-08 01:49:11 +00005291 rc = allocateSpace(pPage, sz, &idx);
drh98add2e2009-07-20 17:11:49 +00005292 if( rc ){ *pRC = rc; return; }
drhc314dc72009-07-21 11:52:34 +00005293 /* The allocateSpace() routine guarantees the following two properties
5294 ** if it returns success */
5295 assert( idx >= end+2 );
5296 assert( idx+sz <= pPage->pBt->usableSize );
drh43605152004-05-29 21:46:49 +00005297 pPage->nCell++;
drh0a45c272009-07-08 01:49:11 +00005298 pPage->nFree -= (u16)(2 + sz);
danielk1977a3ad5e72005-01-07 08:56:44 +00005299 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
danielk19774dbaa892009-06-16 16:50:22 +00005300 if( iChild ){
5301 put4byte(&data[idx], iChild);
5302 }
drh0a45c272009-07-08 01:49:11 +00005303 for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
drh43605152004-05-29 21:46:49 +00005304 ptr[0] = ptr[-2];
5305 ptr[1] = ptr[-1];
drhda200cc2004-05-09 11:51:38 +00005306 }
drh43605152004-05-29 21:46:49 +00005307 put2byte(&data[ins], idx);
drh0a45c272009-07-08 01:49:11 +00005308 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
danielk1977a19df672004-11-03 11:37:07 +00005309#ifndef SQLITE_OMIT_AUTOVACUUM
5310 if( pPage->pBt->autoVacuum ){
5311 /* The cell may contain a pointer to an overflow page. If so, write
5312 ** the entry for the overflow page into the pointer map.
5313 */
drh98add2e2009-07-20 17:11:49 +00005314 ptrmapPutOvflPtr(pPage, pCell, pRC);
danielk1977a19df672004-11-03 11:37:07 +00005315 }
5316#endif
drh14acc042001-06-10 19:56:58 +00005317 }
5318}
5319
5320/*
drhfa1a98a2004-05-14 19:08:17 +00005321** Add a list of cells to a page. The page should be initially empty.
5322** The cells are guaranteed to fit on the page.
5323*/
5324static void assemblePage(
5325 MemPage *pPage, /* The page to be assemblied */
5326 int nCell, /* The number of cells to add to this page */
drh43605152004-05-29 21:46:49 +00005327 u8 **apCell, /* Pointers to cell bodies */
drha9121e42008-02-19 14:59:35 +00005328 u16 *aSize /* Sizes of the cells */
drhfa1a98a2004-05-14 19:08:17 +00005329){
5330 int i; /* Loop counter */
danielk1977fad91942009-04-29 17:49:59 +00005331 u8 *pCellptr; /* Address of next cell pointer */
drh43605152004-05-29 21:46:49 +00005332 int cellbody; /* Address of next cell body */
danielk1977fad91942009-04-29 17:49:59 +00005333 u8 * const data = pPage->aData; /* Pointer to data for pPage */
5334 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
5335 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
drhfa1a98a2004-05-14 19:08:17 +00005336
drh43605152004-05-29 21:46:49 +00005337 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +00005338 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf49661a2008-12-10 16:45:50 +00005339 assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
drhc5053fb2008-11-27 02:22:10 +00005340 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk1977fad91942009-04-29 17:49:59 +00005341
5342 /* Check that the page has just been zeroed by zeroPage() */
5343 assert( pPage->nCell==0 );
5344 assert( get2byte(&data[hdr+5])==nUsable );
5345
5346 pCellptr = &data[pPage->cellOffset + nCell*2];
5347 cellbody = nUsable;
5348 for(i=nCell-1; i>=0; i--){
5349 pCellptr -= 2;
5350 cellbody -= aSize[i];
5351 put2byte(pCellptr, cellbody);
5352 memcpy(&data[cellbody], apCell[i], aSize[i]);
drhfa1a98a2004-05-14 19:08:17 +00005353 }
danielk1977fad91942009-04-29 17:49:59 +00005354 put2byte(&data[hdr+3], nCell);
5355 put2byte(&data[hdr+5], cellbody);
5356 pPage->nFree -= (nCell*2 + nUsable - cellbody);
drhf49661a2008-12-10 16:45:50 +00005357 pPage->nCell = (u16)nCell;
drhfa1a98a2004-05-14 19:08:17 +00005358}
5359
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The value of NN used here, 1, appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (2*NN+1)      /* Total pages involved in the balance */
5374
danielk1977ac245ec2005-01-14 13:50:11 +00005375
drh615ae552005-01-16 23:21:00 +00005376#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00005377/*
5378** This version of balance() handles the common special case where
5379** a new entry is being inserted on the extreme right-end of the
5380** tree, in other words, when the new entry will become the largest
5381** entry in the tree.
5382**
drhc314dc72009-07-21 11:52:34 +00005383** Instead of trying to balance the 3 right-most leaf pages, just add
drhf222e712005-01-14 22:55:49 +00005384** a new page to the right-hand side and put the one new entry in
5385** that page. This leaves the right side of the tree somewhat
5386** unbalanced. But odds are that we will be inserting new entries
5387** at the end soon afterwards so the nearly empty page will quickly
5388** fill up. On average.
5389**
5390** pPage is the leaf page which is the right-most page in the tree.
5391** pParent is its parent. pPage must have a single overflow entry
5392** which is also the right-most entry on the page.
danielk1977a50d9aa2009-06-08 14:49:45 +00005393**
5394** The pSpace buffer is used to store a temporary copy of the divider
5395** cell that will be inserted into pParent. Such a cell consists of a 4
5396** byte page number followed by a variable length integer. In other
5397** words, at most 13 bytes. Hence the pSpace buffer must be at
5398** least 13 bytes in size.
drhf222e712005-01-14 22:55:49 +00005399*/
danielk1977a50d9aa2009-06-08 14:49:45 +00005400static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5401 BtShared *const pBt = pPage->pBt; /* B-Tree Database */
danielk19774dbaa892009-06-16 16:50:22 +00005402 MemPage *pNew; /* Newly allocated page */
danielk19776f235cc2009-06-04 14:46:08 +00005403 int rc; /* Return Code */
5404 Pgno pgnoNew; /* Page number of pNew */
danielk1977ac245ec2005-01-14 13:50:11 +00005405
drh1fee73e2007-08-29 04:00:57 +00005406 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk1977a50d9aa2009-06-08 14:49:45 +00005407 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
danielk1977e56b60e2009-06-10 09:11:06 +00005408 assert( pPage->nOverflow==1 );
5409
drh5d1a8722009-07-22 18:07:40 +00005410 if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
drhd677b3d2007-08-20 22:48:41 +00005411
danielk1977a50d9aa2009-06-08 14:49:45 +00005412 /* Allocate a new page. This page will become the right-sibling of
5413 ** pPage. Make the parent page writable, so that the new divider cell
5414 ** may be inserted. If both these operations are successful, proceed.
5415 */
drh4f0c5872007-03-26 22:05:01 +00005416 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
danielk19774dbaa892009-06-16 16:50:22 +00005417
danielk1977eaa06f62008-09-18 17:34:44 +00005418 if( rc==SQLITE_OK ){
danielk1977a50d9aa2009-06-08 14:49:45 +00005419
5420 u8 *pOut = &pSpace[4];
danielk19776f235cc2009-06-04 14:46:08 +00005421 u8 *pCell = pPage->aOvfl[0].pCell;
5422 u16 szCell = cellSizePtr(pPage, pCell);
5423 u8 *pStop;
5424
drhc5053fb2008-11-27 02:22:10 +00005425 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
danielk1977e56b60e2009-06-10 09:11:06 +00005426 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5427 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
danielk1977eaa06f62008-09-18 17:34:44 +00005428 assemblePage(pNew, 1, &pCell, &szCell);
danielk19774dbaa892009-06-16 16:50:22 +00005429
5430 /* If this is an auto-vacuum database, update the pointer map
5431 ** with entries for the new page, and any pointer from the
5432 ** cell on the page to an overflow page. If either of these
5433 ** operations fails, the return code is set, but the contents
5434 ** of the parent page are still manipulated by thh code below.
5435 ** That is Ok, at this point the parent page is guaranteed to
5436 ** be marked as dirty. Returning an error code will cause a
5437 ** rollback, undoing any changes made to the parent page.
5438 */
5439 if( ISAUTOVACUUM ){
drh98add2e2009-07-20 17:11:49 +00005440 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5441 if( szCell>pNew->minLocal ){
5442 ptrmapPutOvflPtr(pNew, pCell, &rc);
danielk19774dbaa892009-06-16 16:50:22 +00005443 }
5444 }
danielk1977eaa06f62008-09-18 17:34:44 +00005445
danielk19776f235cc2009-06-04 14:46:08 +00005446 /* Create a divider cell to insert into pParent. The divider cell
5447 ** consists of a 4-byte page number (the page number of pPage) and
5448 ** a variable length key value (which must be the same value as the
5449 ** largest key on pPage).
danielk1977eaa06f62008-09-18 17:34:44 +00005450 **
danielk19776f235cc2009-06-04 14:46:08 +00005451 ** To find the largest key value on pPage, first find the right-most
5452 ** cell on pPage. The first two fields of this cell are the
5453 ** record-length (a variable length integer at most 32-bits in size)
5454 ** and the key value (a variable length integer, may have any value).
5455 ** The first of the while(...) loops below skips over the record-length
5456 ** field. The second while(...) loop copies the key value from the
danielk1977a50d9aa2009-06-08 14:49:45 +00005457 ** cell on pPage into the pSpace buffer.
danielk1977eaa06f62008-09-18 17:34:44 +00005458 */
danielk1977eaa06f62008-09-18 17:34:44 +00005459 pCell = findCell(pPage, pPage->nCell-1);
danielk19776f235cc2009-06-04 14:46:08 +00005460 pStop = &pCell[9];
5461 while( (*(pCell++)&0x80) && pCell<pStop );
5462 pStop = &pCell[9];
5463 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5464
danielk19774dbaa892009-06-16 16:50:22 +00005465 /* Insert the new divider cell into pParent. */
drh98add2e2009-07-20 17:11:49 +00005466 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5467 0, pPage->pgno, &rc);
danielk19776f235cc2009-06-04 14:46:08 +00005468
5469 /* Set the right-child pointer of pParent to point to the new page. */
danielk1977eaa06f62008-09-18 17:34:44 +00005470 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5471
danielk1977e08a3c42008-09-18 18:17:03 +00005472 /* Release the reference to the new page. */
5473 releasePage(pNew);
danielk1977ac11ee62005-01-15 12:45:51 +00005474 }
5475
danielk1977eaa06f62008-09-18 17:34:44 +00005476 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00005477}
drh615ae552005-01-16 23:21:00 +00005478#endif /* SQLITE_OMIT_QUICKBALANCE */
drh43605152004-05-29 21:46:49 +00005479
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every overflow page and child page referenced from the page
** names that page as the parent and has the expected type.  Always
** returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPage;
  for(iPage=0; iPage<nPage; iPage++){
    MemPage *pPage = apPage[iPage];
    BtShared *pBt = pPage->pBt;
    Pgno n;                /* Parent page read from the pointer-map */
    u8 e;                  /* Entry type read from the pointer-map */
    int iCell;

    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *z = findCell(pPage, iCell);

      btreeParseCellPtr(pPage, z, &info);
      if( info.iOverflow ){
        /* First overflow page of the cell */
        Pgno ovfl = get4byte(&z[info.iOverflow]);
        ptrmapGet(pBt, ovfl, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* Child page stored in the first 4 bytes of the cell */
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_BTREE );
      }
    }

    if( !pPage->leaf ){
      /* Child page stored in the page header */
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &e, &n);
      assert( n==pPage->pgno && e==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5521
danielk1977cd581a72009-06-23 15:43:39 +00005522/*
5523** This function is used to copy the contents of the b-tree node stored
5524** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5525** the pointer-map entries for each child page are updated so that the
5526** parent page stored in the pointer map is page pTo. If pFrom contained
5527** any cells with overflow page pointers, then the corresponding pointer
5528** map entries are also updated so that the parent page is page pTo.
5529**
5530** If pFrom is currently carrying any overflow cells (entries in the
5531** MemPage.aOvfl[] array), they are not copied to pTo.
5532**
danielk197730548662009-07-09 05:07:37 +00005533** Before returning, page pTo is reinitialized using btreeInitPage().
danielk1977cd581a72009-06-23 15:43:39 +00005534**
5535** The performance of this function is not critical. It is only used by
5536** the balance_shallower() and balance_deeper() procedures, neither of
5537** which are called often under normal circumstances.
5538*/
drhc314dc72009-07-21 11:52:34 +00005539static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5540 if( (*pRC)==SQLITE_OK ){
5541 BtShared * const pBt = pFrom->pBt;
5542 u8 * const aFrom = pFrom->aData;
5543 u8 * const aTo = pTo->aData;
5544 int const iFromHdr = pFrom->hdrOffset;
5545 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5546 TESTONLY(int rc;)
5547 int iData;
5548
5549
5550 assert( pFrom->isInit );
5551 assert( pFrom->nFree>=iToHdr );
5552 assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize );
5553
5554 /* Copy the b-tree node content from page pFrom to page pTo. */
5555 iData = get2byte(&aFrom[iFromHdr+5]);
5556 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5557 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5558
5559 /* Reinitialize page pTo so that the contents of the MemPage structure
5560 ** match the new data. The initialization of pTo "cannot" fail, as the
5561 ** data copied from pFrom is known to be valid. */
5562 pTo->isInit = 0;
5563 TESTONLY(rc = ) btreeInitPage(pTo);
5564 assert( rc==SQLITE_OK );
5565
5566 /* If this is an auto-vacuum database, update the pointer-map entries
5567 ** for any b-tree or overflow pages that pTo now contains the pointers to.
5568 */
5569 if( ISAUTOVACUUM ){
5570 *pRC = setChildPtrmaps(pTo);
5571 }
danielk1977cd581a72009-06-23 15:43:39 +00005572 }
danielk1977cd581a72009-06-23 15:43:39 +00005573}
5574
/*
** This routine redistributes cells on the iParentIdx'th child of pParent
** (hereafter "the page") and up to 2 siblings so that all pages have about the
** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
** side if the page is the first or last child of its parent. If the page
** has fewer than 2 siblings (something which can only happen if the page
** is a root page or a child of a root page) then all available siblings
** participate in the balancing.
**
** The number of siblings of the page might be increased or decreased by
** one or two in an effort to keep pages nearly full but not over full.
**
** Note that when this routine is called, some of the cells on the page
** might not actually be stored in MemPage.aData[]. This can happen
** if the page is overfull. This routine ensures that all cells allocated
** to the page and its siblings fit into MemPage.aData[] before returning.
**
** In the course of balancing the page and its siblings, cells may be
** inserted into or removed from the parent page (pParent). Doing so
** may cause the parent page to become overfull or underfull. If this
** happens, it is the responsibility of the caller to invoke the correct
** balancing routine to fix this problem (see the balance() routine).
**
** If this routine fails for any reason, it might leave the database
** in a corrupted state. So if this routine fails, the database should
** be rolled back.
**
** The third argument to this function, aOvflSpace, is a pointer to a
** buffer big enough to hold one page. If while inserting cells into the parent
** page (pParent) the parent page becomes overfull, this buffer is
** used to store the parent's overflow cells. Because this function inserts
** a maximum of four divider cells into the parent page, and the maximum
** size of a cell stored within an internal node is always less than 1/4
** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
** enough for all overflow cells.
**
** If aOvflSpace is set to a null pointer, this function returns
** SQLITE_NOMEM.
*/
danielk19774dbaa892009-06-16 16:50:22 +00005615static int balance_nonroot(
5616 MemPage *pParent, /* Parent page of siblings being balanced */
5617 int iParentIdx, /* Index of "the page" in pParent */
danielk1977cd581a72009-06-23 15:43:39 +00005618 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
5619 int isRoot /* True if pParent is a root-page */
danielk19774dbaa892009-06-16 16:50:22 +00005620){
drh16a9b832007-05-05 18:39:25 +00005621 BtShared *pBt; /* The whole database */
danielk1977634f2982005-03-28 08:44:07 +00005622 int nCell = 0; /* Number of cells in apCell[] */
5623 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
danielk1977a4124bd2008-12-23 10:37:47 +00005624 int nNew = 0; /* Number of pages in apNew[] */
danielk19774dbaa892009-06-16 16:50:22 +00005625 int nOld; /* Number of pages in apOld[] */
drh14acc042001-06-10 19:56:58 +00005626 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00005627 int nxDiv; /* Next divider slot in pParent->aCell[] */
shane85095702009-06-15 16:27:08 +00005628 int rc = SQLITE_OK; /* The return code */
shane36840fd2009-06-26 16:32:13 +00005629 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00005630 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00005631 int usableSpace; /* Bytes in pPage beyond the header */
5632 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00005633 int subtotal; /* Subtotal of bytes in cells on one page */
drhe5ae5732008-06-15 02:51:47 +00005634 int iSpace1 = 0; /* First unused byte of aSpace1[] */
danielk19776067a9b2009-06-09 09:41:00 +00005635 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
drhfacf0302008-06-17 15:12:00 +00005636 int szScratch; /* Size of scratch memory requested */
drhc3b70572003-01-04 19:44:07 +00005637 MemPage *apOld[NB]; /* pPage and up to two siblings */
drh4b70f112004-05-02 21:12:19 +00005638 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drha2fce642004-06-05 00:01:44 +00005639 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
danielk19774dbaa892009-06-16 16:50:22 +00005640 u8 *pRight; /* Location in parent of right-sibling pointer */
5641 u8 *apDiv[NB-1]; /* Divider cells in pParent */
drha2fce642004-06-05 00:01:44 +00005642 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
5643 int szNew[NB+2]; /* Combined size of cells place on i-th page */
danielk197750f059b2005-03-29 02:54:03 +00005644 u8 **apCell = 0; /* All cells begin balanced */
drha9121e42008-02-19 14:59:35 +00005645 u16 *szCell; /* Local size of all cells in apCell[] */
danielk19774dbaa892009-06-16 16:50:22 +00005646 u8 *aSpace1; /* Space for copies of dividers cells */
5647 Pgno pgno; /* Temp var to store a page number in */
drh8b2f49b2001-06-08 00:21:52 +00005648
danielk1977a50d9aa2009-06-08 14:49:45 +00005649 pBt = pParent->pBt;
5650 assert( sqlite3_mutex_held(pBt->mutex) );
5651 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
danielk1977474b7cc2008-07-09 11:49:46 +00005652
danielk1977e5765212009-06-17 11:13:28 +00005653#if 0
drh43605152004-05-29 21:46:49 +00005654 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
danielk1977e5765212009-06-17 11:13:28 +00005655#endif
drh2e38c322004-09-03 18:38:44 +00005656
danielk19774dbaa892009-06-16 16:50:22 +00005657 /* At this point pParent may have at most one overflow cell. And if
5658 ** this overflow cell is present, it must be the cell with
5659 ** index iParentIdx. This scenario comes about when this function
drhcd09c532009-07-20 19:30:00 +00005660 ** is called (indirectly) from sqlite3BtreeDelete().
5661 */
danielk19774dbaa892009-06-16 16:50:22 +00005662 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5663 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx );
5664
danielk197711a8a862009-06-17 11:49:52 +00005665 if( !aOvflSpace ){
5666 return SQLITE_NOMEM;
5667 }
5668
danielk1977a50d9aa2009-06-08 14:49:45 +00005669 /* Find the sibling pages to balance. Also locate the cells in pParent
5670 ** that divide the siblings. An attempt is made to find NN siblings on
5671 ** either side of pPage. More siblings are taken from one side, however,
5672 ** if there are fewer than NN siblings on the other side. If pParent
danielk19774dbaa892009-06-16 16:50:22 +00005673 ** has NB or fewer children then all children of pParent are taken.
5674 **
5675 ** This loop also drops the divider cells from the parent page. This
5676 ** way, the remainder of the function does not have to deal with any
drhcd09c532009-07-20 19:30:00 +00005677 ** overflow cells in the parent page, since if any existed they will
5678 ** have already been removed.
5679 */
danielk19774dbaa892009-06-16 16:50:22 +00005680 i = pParent->nOverflow + pParent->nCell;
5681 if( i<2 ){
drhc3b70572003-01-04 19:44:07 +00005682 nxDiv = 0;
danielk19774dbaa892009-06-16 16:50:22 +00005683 nOld = i+1;
5684 }else{
5685 nOld = 3;
5686 if( iParentIdx==0 ){
5687 nxDiv = 0;
5688 }else if( iParentIdx==i ){
5689 nxDiv = i-2;
drh14acc042001-06-10 19:56:58 +00005690 }else{
danielk19774dbaa892009-06-16 16:50:22 +00005691 nxDiv = iParentIdx-1;
drh8b2f49b2001-06-08 00:21:52 +00005692 }
danielk19774dbaa892009-06-16 16:50:22 +00005693 i = 2;
5694 }
5695 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
5696 pRight = &pParent->aData[pParent->hdrOffset+8];
5697 }else{
5698 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
5699 }
5700 pgno = get4byte(pRight);
5701 while( 1 ){
5702 rc = getAndInitPage(pBt, pgno, &apOld[i]);
5703 if( rc ){
danielk197789bc4bc2009-07-21 19:25:24 +00005704 memset(apOld, 0, (i+1)*sizeof(MemPage*));
danielk19774dbaa892009-06-16 16:50:22 +00005705 goto balance_cleanup;
5706 }
danielk1977634f2982005-03-28 08:44:07 +00005707 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
danielk19774dbaa892009-06-16 16:50:22 +00005708 if( (i--)==0 ) break;
5709
drhcd09c532009-07-20 19:30:00 +00005710 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){
danielk19774dbaa892009-06-16 16:50:22 +00005711 apDiv[i] = pParent->aOvfl[0].pCell;
5712 pgno = get4byte(apDiv[i]);
5713 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5714 pParent->nOverflow = 0;
5715 }else{
5716 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
5717 pgno = get4byte(apDiv[i]);
5718 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5719
5720 /* Drop the cell from the parent page. apDiv[i] still points to
5721 ** the cell within the parent, even though it has been dropped.
5722 ** This is safe because dropping a cell only overwrites the first
5723 ** four bytes of it, and this function does not need the first
5724 ** four bytes of the divider cell. So the pointer is safe to use
danielk197711a8a862009-06-17 11:49:52 +00005725 ** later on.
5726 **
5727 ** Unless SQLite is compiled in secure-delete mode. In this case,
5728 ** the dropCell() routine will overwrite the entire cell with zeroes.
5729 ** In this case, temporarily copy the cell into the aOvflSpace[]
5730 ** buffer. It will be copied out again as soon as the aSpace[] buffer
5731 ** is allocated. */
5732#ifdef SQLITE_SECURE_DELETE
5733 memcpy(&aOvflSpace[apDiv[i]-pParent->aData], apDiv[i], szNew[i]);
5734 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
5735#endif
drh98add2e2009-07-20 17:11:49 +00005736 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
danielk19774dbaa892009-06-16 16:50:22 +00005737 }
drh8b2f49b2001-06-08 00:21:52 +00005738 }
5739
drha9121e42008-02-19 14:59:35 +00005740 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
drh8d97f1f2005-05-05 18:14:13 +00005741 ** alignment */
drha9121e42008-02-19 14:59:35 +00005742 nMaxCells = (nMaxCells + 3)&~3;
drh8d97f1f2005-05-05 18:14:13 +00005743
drh8b2f49b2001-06-08 00:21:52 +00005744 /*
danielk1977634f2982005-03-28 08:44:07 +00005745 ** Allocate space for memory structures
5746 */
danielk19774dbaa892009-06-16 16:50:22 +00005747 k = pBt->pageSize + ROUND8(sizeof(MemPage));
drhfacf0302008-06-17 15:12:00 +00005748 szScratch =
drha9121e42008-02-19 14:59:35 +00005749 nMaxCells*sizeof(u8*) /* apCell */
5750 + nMaxCells*sizeof(u16) /* szCell */
drhe5ae5732008-06-15 02:51:47 +00005751 + pBt->pageSize /* aSpace1 */
danielk19774dbaa892009-06-16 16:50:22 +00005752 + k*nOld; /* Page copies (apCopy) */
drhfacf0302008-06-17 15:12:00 +00005753 apCell = sqlite3ScratchMalloc( szScratch );
danielk197711a8a862009-06-17 11:49:52 +00005754 if( apCell==0 ){
danielk1977634f2982005-03-28 08:44:07 +00005755 rc = SQLITE_NOMEM;
5756 goto balance_cleanup;
5757 }
drha9121e42008-02-19 14:59:35 +00005758 szCell = (u16*)&apCell[nMaxCells];
danielk19774dbaa892009-06-16 16:50:22 +00005759 aSpace1 = (u8*)&szCell[nMaxCells];
drhea598cb2009-04-05 12:22:08 +00005760 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
drh14acc042001-06-10 19:56:58 +00005761
5762 /*
5763 ** Load pointers to all cells on sibling pages and the divider cells
5764 ** into the local apCell[] array. Make copies of the divider cells
danielk19774dbaa892009-06-16 16:50:22 +00005765 ** into space obtained from aSpace1[] and remove the the divider Cells
drhb6f41482004-05-14 01:58:11 +00005766 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00005767 **
5768 ** If the siblings are on leaf pages, then the child pointers of the
5769 ** divider cells are stripped from the cells before they are copied
drhe5ae5732008-06-15 02:51:47 +00005770 ** into aSpace1[]. In this way, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00005771 ** child pointers. If siblings are not leaves, then all cell in
5772 ** apCell[] include child pointers. Either way, all cells in apCell[]
5773 ** are alike.
drh96f5b762004-05-16 16:24:36 +00005774 **
5775 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5776 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
drh8b2f49b2001-06-08 00:21:52 +00005777 */
danielk1977a50d9aa2009-06-08 14:49:45 +00005778 leafCorrection = apOld[0]->leaf*4;
5779 leafData = apOld[0]->hasData;
drh8b2f49b2001-06-08 00:21:52 +00005780 for(i=0; i<nOld; i++){
danielk19774dbaa892009-06-16 16:50:22 +00005781 int limit;
5782
5783 /* Before doing anything else, take a copy of the i'th original sibling
5784 ** The rest of this function will use data from the copies rather
5785 ** that the original pages since the original pages will be in the
5786 ** process of being overwritten. */
5787 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
5788 memcpy(pOld, apOld[i], sizeof(MemPage));
5789 pOld->aData = (void*)&pOld[1];
5790 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
5791
5792 limit = pOld->nCell+pOld->nOverflow;
drh43605152004-05-29 21:46:49 +00005793 for(j=0; j<limit; j++){
danielk1977634f2982005-03-28 08:44:07 +00005794 assert( nCell<nMaxCells );
drh43605152004-05-29 21:46:49 +00005795 apCell[nCell] = findOverflowCell(pOld, j);
5796 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
danielk19774dbaa892009-06-16 16:50:22 +00005797 nCell++;
5798 }
5799 if( i<nOld-1 && !leafData){
shane36840fd2009-06-26 16:32:13 +00005800 u16 sz = (u16)szNew[i];
danielk19774dbaa892009-06-16 16:50:22 +00005801 u8 *pTemp;
5802 assert( nCell<nMaxCells );
5803 szCell[nCell] = sz;
5804 pTemp = &aSpace1[iSpace1];
5805 iSpace1 += sz;
5806 assert( sz<=pBt->pageSize/4 );
5807 assert( iSpace1<=pBt->pageSize );
5808 memcpy(pTemp, apDiv[i], sz);
5809 apCell[nCell] = pTemp+leafCorrection;
5810 assert( leafCorrection==0 || leafCorrection==4 );
shane36840fd2009-06-26 16:32:13 +00005811 szCell[nCell] = szCell[nCell] - leafCorrection;
danielk19774dbaa892009-06-16 16:50:22 +00005812 if( !pOld->leaf ){
5813 assert( leafCorrection==0 );
5814 assert( pOld->hdrOffset==0 );
5815 /* The right pointer of the child page pOld becomes the left
5816 ** pointer of the divider cell */
5817 memcpy(apCell[nCell], &pOld->aData[8], 4);
5818 }else{
5819 assert( leafCorrection==4 );
5820 if( szCell[nCell]<4 ){
5821 /* Do not allow any cells smaller than 4 bytes. */
5822 szCell[nCell] = 4;
danielk1977ac11ee62005-01-15 12:45:51 +00005823 }
5824 }
drh14acc042001-06-10 19:56:58 +00005825 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00005826 }
drh8b2f49b2001-06-08 00:21:52 +00005827 }
5828
5829 /*
drh6019e162001-07-02 17:51:45 +00005830 ** Figure out the number of pages needed to hold all nCell cells.
5831 ** Store this number in "k". Also compute szNew[] which is the total
5832 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00005833 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00005834 ** cntNew[k] should equal nCell.
5835 **
drh96f5b762004-05-16 16:24:36 +00005836 ** Values computed by this block:
5837 **
5838 ** k: The total number of sibling pages
5839 ** szNew[i]: Spaced used on the i-th sibling page.
5840 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5841 ** the right of the i-th sibling page.
5842 ** usableSpace: Number of bytes of space available on each sibling.
5843 **
drh8b2f49b2001-06-08 00:21:52 +00005844 */
drh43605152004-05-29 21:46:49 +00005845 usableSpace = pBt->usableSize - 12 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00005846 for(subtotal=k=i=0; i<nCell; i++){
danielk1977634f2982005-03-28 08:44:07 +00005847 assert( i<nMaxCells );
drh43605152004-05-29 21:46:49 +00005848 subtotal += szCell[i] + 2;
drh4b70f112004-05-02 21:12:19 +00005849 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00005850 szNew[k] = subtotal - szCell[i];
5851 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00005852 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00005853 subtotal = 0;
5854 k++;
drheac74422009-06-14 12:47:11 +00005855 if( k>NB+1 ){ rc = SQLITE_CORRUPT; goto balance_cleanup; }
drh6019e162001-07-02 17:51:45 +00005856 }
5857 }
5858 szNew[k] = subtotal;
5859 cntNew[k] = nCell;
5860 k++;
drh96f5b762004-05-16 16:24:36 +00005861
5862 /*
5863 ** The packing computed by the previous block is biased toward the siblings
5864 ** on the left side. The left siblings are always nearly full, while the
5865 ** right-most sibling might be nearly empty. This block of code attempts
5866 ** to adjust the packing of siblings to get a better balance.
5867 **
5868 ** This adjustment is more than an optimization. The packing above might
5869 ** be so out of balance as to be illegal. For example, the right-most
5870 ** sibling might be completely empty. This adjustment is not optional.
5871 */
drh6019e162001-07-02 17:51:45 +00005872 for(i=k-1; i>0; i--){
drh96f5b762004-05-16 16:24:36 +00005873 int szRight = szNew[i]; /* Size of sibling on the right */
5874 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5875 int r; /* Index of right-most cell in left sibling */
5876 int d; /* Index of first cell to the left of right sibling */
5877
5878 r = cntNew[i-1] - 1;
5879 d = r + 1 - leafData;
danielk1977634f2982005-03-28 08:44:07 +00005880 assert( d<nMaxCells );
5881 assert( r<nMaxCells );
drh43605152004-05-29 21:46:49 +00005882 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5883 szRight += szCell[d] + 2;
5884 szLeft -= szCell[r] + 2;
drh6019e162001-07-02 17:51:45 +00005885 cntNew[i-1]--;
drh96f5b762004-05-16 16:24:36 +00005886 r = cntNew[i-1] - 1;
5887 d = r + 1 - leafData;
drh6019e162001-07-02 17:51:45 +00005888 }
drh96f5b762004-05-16 16:24:36 +00005889 szNew[i] = szRight;
5890 szNew[i-1] = szLeft;
drh6019e162001-07-02 17:51:45 +00005891 }
drh09d0deb2005-08-02 17:13:09 +00005892
danielk19776f235cc2009-06-04 14:46:08 +00005893 /* Either we found one or more cells (cntnew[0])>0) or pPage is
drh09d0deb2005-08-02 17:13:09 +00005894 ** a virtual root page. A virtual root page is when the real root
5895 ** page is page 1 and we are the only child of that page.
5896 */
5897 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
drh8b2f49b2001-06-08 00:21:52 +00005898
danielk1977e5765212009-06-17 11:13:28 +00005899 TRACE(("BALANCE: old: %d %d %d ",
5900 apOld[0]->pgno,
5901 nOld>=2 ? apOld[1]->pgno : 0,
5902 nOld>=3 ? apOld[2]->pgno : 0
5903 ));
5904
drh8b2f49b2001-06-08 00:21:52 +00005905 /*
drh6b308672002-07-08 02:16:37 +00005906 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00005907 */
drheac74422009-06-14 12:47:11 +00005908 if( apOld[0]->pgno<=1 ){
5909 rc = SQLITE_CORRUPT;
5910 goto balance_cleanup;
5911 }
danielk1977a50d9aa2009-06-08 14:49:45 +00005912 pageFlags = apOld[0]->aData[0];
drh14acc042001-06-10 19:56:58 +00005913 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00005914 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00005915 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00005916 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00005917 apOld[i] = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00005918 rc = sqlite3PagerWrite(pNew->pDbPage);
drhf5345442007-04-09 12:45:02 +00005919 nNew++;
danielk197728129562005-01-11 10:25:06 +00005920 if( rc ) goto balance_cleanup;
drh6b308672002-07-08 02:16:37 +00005921 }else{
drh7aa8f852006-03-28 00:24:44 +00005922 assert( i>0 );
danielk19774dbaa892009-06-16 16:50:22 +00005923 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
drh6b308672002-07-08 02:16:37 +00005924 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00005925 apNew[i] = pNew;
drhf5345442007-04-09 12:45:02 +00005926 nNew++;
danielk19774dbaa892009-06-16 16:50:22 +00005927
5928 /* Set the pointer-map entry for the new sibling page. */
5929 if( ISAUTOVACUUM ){
drh98add2e2009-07-20 17:11:49 +00005930 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
danielk19774dbaa892009-06-16 16:50:22 +00005931 if( rc!=SQLITE_OK ){
5932 goto balance_cleanup;
5933 }
5934 }
drh6b308672002-07-08 02:16:37 +00005935 }
drh8b2f49b2001-06-08 00:21:52 +00005936 }
5937
danielk1977299b1872004-11-22 10:02:10 +00005938 /* Free any old pages that were not reused as new pages.
5939 */
5940 while( i<nOld ){
drhc314dc72009-07-21 11:52:34 +00005941 freePage(apOld[i], &rc);
danielk1977299b1872004-11-22 10:02:10 +00005942 if( rc ) goto balance_cleanup;
5943 releasePage(apOld[i]);
5944 apOld[i] = 0;
5945 i++;
5946 }
5947
drh8b2f49b2001-06-08 00:21:52 +00005948 /*
drhf9ffac92002-03-02 19:00:31 +00005949 ** Put the new pages in accending order. This helps to
5950 ** keep entries in the disk file in order so that a scan
5951 ** of the table is a linear scan through the file. That
5952 ** in turn helps the operating system to deliver pages
5953 ** from the disk more rapidly.
5954 **
5955 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00005956 ** n is never more than NB (a small constant), that should
5957 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00005958 **
drhc3b70572003-01-04 19:44:07 +00005959 ** When NB==3, this one optimization makes the database
5960 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00005961 */
5962 for(i=0; i<k-1; i++){
danielk19774dbaa892009-06-16 16:50:22 +00005963 int minV = apNew[i]->pgno;
drhf9ffac92002-03-02 19:00:31 +00005964 int minI = i;
5965 for(j=i+1; j<k; j++){
danielk19774dbaa892009-06-16 16:50:22 +00005966 if( apNew[j]->pgno<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00005967 minI = j;
danielk19774dbaa892009-06-16 16:50:22 +00005968 minV = apNew[j]->pgno;
drhf9ffac92002-03-02 19:00:31 +00005969 }
5970 }
5971 if( minI>i ){
5972 int t;
5973 MemPage *pT;
danielk19774dbaa892009-06-16 16:50:22 +00005974 t = apNew[i]->pgno;
drhf9ffac92002-03-02 19:00:31 +00005975 pT = apNew[i];
drhf9ffac92002-03-02 19:00:31 +00005976 apNew[i] = apNew[minI];
drhf9ffac92002-03-02 19:00:31 +00005977 apNew[minI] = pT;
5978 }
5979 }
danielk1977e5765212009-06-17 11:13:28 +00005980 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
danielk19774dbaa892009-06-16 16:50:22 +00005981 apNew[0]->pgno, szNew[0],
5982 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
5983 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
5984 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
5985 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
5986
5987 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5988 put4byte(pRight, apNew[nNew-1]->pgno);
drh24cd67e2004-05-10 16:18:47 +00005989
drhf9ffac92002-03-02 19:00:31 +00005990 /*
drh14acc042001-06-10 19:56:58 +00005991 ** Evenly distribute the data in apCell[] across the new pages.
5992 ** Insert divider cells into pParent as necessary.
5993 */
5994 j = 0;
5995 for(i=0; i<nNew; i++){
danielk1977ac11ee62005-01-15 12:45:51 +00005996 /* Assemble the new sibling page. */
drh14acc042001-06-10 19:56:58 +00005997 MemPage *pNew = apNew[i];
drh19642e52005-03-29 13:17:45 +00005998 assert( j<nMaxCells );
drh10131482008-07-11 03:34:09 +00005999 zeroPage(pNew, pageFlags);
drhfa1a98a2004-05-14 19:08:17 +00006000 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
drh09d0deb2005-08-02 17:13:09 +00006001 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
drh43605152004-05-29 21:46:49 +00006002 assert( pNew->nOverflow==0 );
danielk1977ac11ee62005-01-15 12:45:51 +00006003
danielk1977ac11ee62005-01-15 12:45:51 +00006004 j = cntNew[i];
6005
6006 /* If the sibling page assembled above was not the right-most sibling,
6007 ** insert a divider cell into the parent page.
6008 */
danielk19771c3d2bf2009-06-23 16:40:17 +00006009 assert( i<nNew-1 || j==nCell );
6010 if( j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00006011 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00006012 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00006013 int sz;
danielk1977634f2982005-03-28 08:44:07 +00006014
6015 assert( j<nMaxCells );
drh8b18dd42004-05-12 19:18:15 +00006016 pCell = apCell[j];
6017 sz = szCell[j] + leafCorrection;
danielk19776067a9b2009-06-09 09:41:00 +00006018 pTemp = &aOvflSpace[iOvflSpace];
drh4b70f112004-05-02 21:12:19 +00006019 if( !pNew->leaf ){
drh43605152004-05-29 21:46:49 +00006020 memcpy(&pNew->aData[8], pCell, 4);
drh8b18dd42004-05-12 19:18:15 +00006021 }else if( leafData ){
drhfd131da2007-08-07 17:13:03 +00006022 /* If the tree is a leaf-data tree, and the siblings are leaves,
danielk1977ac11ee62005-01-15 12:45:51 +00006023 ** then there is no divider cell in apCell[]. Instead, the divider
6024 ** cell consists of the integer key for the right-most cell of
6025 ** the sibling-page assembled above only.
6026 */
drh6f11bef2004-05-13 01:12:56 +00006027 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00006028 j--;
danielk197730548662009-07-09 05:07:37 +00006029 btreeParseCellPtr(pNew, apCell[j], &info);
drhe5ae5732008-06-15 02:51:47 +00006030 pCell = pTemp;
danielk19774dbaa892009-06-16 16:50:22 +00006031 sz = 4 + putVarint(&pCell[4], info.nKey);
drh8b18dd42004-05-12 19:18:15 +00006032 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00006033 }else{
6034 pCell -= 4;
danielk19774aeff622007-05-12 09:30:47 +00006035 /* Obscure case for non-leaf-data trees: If the cell at pCell was
drh85b623f2007-12-13 21:54:09 +00006036 ** previously stored on a leaf node, and its reported size was 4
danielk19774aeff622007-05-12 09:30:47 +00006037 ** bytes, then it may actually be smaller than this
danielk197730548662009-07-09 05:07:37 +00006038 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
drh85b623f2007-12-13 21:54:09 +00006039 ** any cell). But it is important to pass the correct size to
danielk19774aeff622007-05-12 09:30:47 +00006040 ** insertCell(), so reparse the cell now.
6041 **
6042 ** Note that this can never happen in an SQLite data file, as all
6043 ** cells are at least 4 bytes. It only happens in b-trees used
6044 ** to evaluate "IN (SELECT ...)" and similar clauses.
6045 */
6046 if( szCell[j]==4 ){
6047 assert(leafCorrection==4);
6048 sz = cellSizePtr(pParent, pCell);
6049 }
drh4b70f112004-05-02 21:12:19 +00006050 }
danielk19776067a9b2009-06-09 09:41:00 +00006051 iOvflSpace += sz;
drhe5ae5732008-06-15 02:51:47 +00006052 assert( sz<=pBt->pageSize/4 );
danielk19776067a9b2009-06-09 09:41:00 +00006053 assert( iOvflSpace<=pBt->pageSize );
drh98add2e2009-07-20 17:11:49 +00006054 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
danielk1977e80463b2004-11-03 03:01:16 +00006055 if( rc!=SQLITE_OK ) goto balance_cleanup;
drhc5053fb2008-11-27 02:22:10 +00006056 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
danielk197785d90ca2008-07-19 14:25:15 +00006057
drh14acc042001-06-10 19:56:58 +00006058 j++;
6059 nxDiv++;
6060 }
6061 }
drh6019e162001-07-02 17:51:45 +00006062 assert( j==nCell );
drh7aa8f852006-03-28 00:24:44 +00006063 assert( nOld>0 );
6064 assert( nNew>0 );
drh4b70f112004-05-02 21:12:19 +00006065 if( (pageFlags & PTF_LEAF)==0 ){
danielk197787c52b52008-07-19 11:49:07 +00006066 u8 *zChild = &apCopy[nOld-1]->aData[8];
6067 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
drh14acc042001-06-10 19:56:58 +00006068 }
6069
danielk197713bd99f2009-06-24 05:40:34 +00006070 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6071 /* The root page of the b-tree now contains no cells. The only sibling
6072 ** page is the right-child of the parent. Copy the contents of the
6073 ** child page into the parent, decreasing the overall height of the
6074 ** b-tree structure by one. This is described as the "balance-shallower"
6075 ** sub-algorithm in some documentation.
6076 **
6077 ** If this is an auto-vacuum database, the call to copyNodeContent()
6078 ** sets all pointer-map entries corresponding to database image pages
6079 ** for which the pointer is stored within the content being copied.
6080 **
6081 ** The second assert below verifies that the child page is defragmented
6082 ** (it must be, as it was just reconstructed using assemblePage()). This
6083 ** is important if the parent page happens to be page 1 of the database
6084 ** image. */
6085 assert( nNew==1 );
6086 assert( apNew[0]->nFree ==
6087 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6088 );
drhc314dc72009-07-21 11:52:34 +00006089 copyNodeContent(apNew[0], pParent, &rc);
6090 freePage(apNew[0], &rc);
danielk197713bd99f2009-06-24 05:40:34 +00006091 }else if( ISAUTOVACUUM ){
6092 /* Fix the pointer-map entries for all the cells that were shifted around.
6093 ** There are several different types of pointer-map entries that need to
6094 ** be dealt with by this routine. Some of these have been set already, but
6095 ** many have not. The following is a summary:
6096 **
6097 ** 1) The entries associated with new sibling pages that were not
6098 ** siblings when this function was called. These have already
6099 ** been set. We don't need to worry about old siblings that were
6100 ** moved to the free-list - the freePage() code has taken care
6101 ** of those.
6102 **
6103 ** 2) The pointer-map entries associated with the first overflow
6104 ** page in any overflow chains used by new divider cells. These
6105 ** have also already been taken care of by the insertCell() code.
6106 **
6107 ** 3) If the sibling pages are not leaves, then the child pages of
6108 ** cells stored on the sibling pages may need to be updated.
6109 **
6110 ** 4) If the sibling pages are not internal intkey nodes, then any
6111 ** overflow pages used by these cells may need to be updated
6112 ** (internal intkey nodes never contain pointers to overflow pages).
6113 **
6114 ** 5) If the sibling pages are not leaves, then the pointer-map
6115 ** entries for the right-child pages of each sibling may need
6116 ** to be updated.
6117 **
6118 ** Cases 1 and 2 are dealt with above by other code. The next
6119 ** block deals with cases 3 and 4 and the one after that, case 5. Since
6120 ** setting a pointer map entry is a relatively expensive operation, this
6121 ** code only sets pointer map entries for child or overflow pages that have
6122 ** actually moved between pages. */
danielk19774dbaa892009-06-16 16:50:22 +00006123 MemPage *pNew = apNew[0];
6124 MemPage *pOld = apCopy[0];
6125 int nOverflow = pOld->nOverflow;
6126 int iNextOld = pOld->nCell + nOverflow;
6127 int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1);
6128 j = 0; /* Current 'old' sibling page */
6129 k = 0; /* Current 'new' sibling page */
drhc314dc72009-07-21 11:52:34 +00006130 for(i=0; i<nCell; i++){
danielk19774dbaa892009-06-16 16:50:22 +00006131 int isDivider = 0;
6132 while( i==iNextOld ){
6133 /* Cell i is the cell immediately following the last cell on old
6134 ** sibling page j. If the siblings are not leaf pages of an
6135 ** intkey b-tree, then cell i was a divider cell. */
6136 pOld = apCopy[++j];
6137 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6138 if( pOld->nOverflow ){
6139 nOverflow = pOld->nOverflow;
6140 iOverflow = i + !leafData + pOld->aOvfl[0].idx;
6141 }
6142 isDivider = !leafData;
6143 }
6144
6145 assert(nOverflow>0 || iOverflow<i );
6146 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1);
6147 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1);
6148 if( i==iOverflow ){
6149 isDivider = 1;
6150 if( (--nOverflow)>0 ){
6151 iOverflow++;
6152 }
6153 }
6154
6155 if( i==cntNew[k] ){
6156 /* Cell i is the cell immediately following the last cell on new
6157 ** sibling page k. If the siblings are not leaf pages of an
6158 ** intkey b-tree, then cell i is a divider cell. */
6159 pNew = apNew[++k];
6160 if( !leafData ) continue;
6161 }
danielk19774dbaa892009-06-16 16:50:22 +00006162 assert( j<nOld );
6163 assert( k<nNew );
6164
6165 /* If the cell was originally divider cell (and is not now) or
6166 ** an overflow cell, or if the cell was located on a different sibling
6167 ** page before the balancing, then the pointer map entries associated
6168 ** with any child or overflow pages need to be updated. */
6169 if( isDivider || pOld->pgno!=pNew->pgno ){
6170 if( !leafCorrection ){
drh98add2e2009-07-20 17:11:49 +00006171 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
danielk19774dbaa892009-06-16 16:50:22 +00006172 }
drh98add2e2009-07-20 17:11:49 +00006173 if( szCell[i]>pNew->minLocal ){
6174 ptrmapPutOvflPtr(pNew, apCell[i], &rc);
danielk19774dbaa892009-06-16 16:50:22 +00006175 }
6176 }
6177 }
6178
6179 if( !leafCorrection ){
drh98add2e2009-07-20 17:11:49 +00006180 for(i=0; i<nNew; i++){
6181 u32 key = get4byte(&apNew[i]->aData[8]);
6182 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
danielk19774dbaa892009-06-16 16:50:22 +00006183 }
6184 }
6185
6186#if 0
6187 /* The ptrmapCheckPages() contains assert() statements that verify that
6188 ** all pointer map pages are set correctly. This is helpful while
6189 ** debugging. This is usually disabled because a corrupt database may
6190 ** cause an assert() statement to fail. */
6191 ptrmapCheckPages(apNew, nNew);
6192 ptrmapCheckPages(&pParent, 1);
6193#endif
6194 }
6195
danielk197771d5d2c2008-09-29 11:49:47 +00006196 assert( pParent->isInit );
danielk1977e5765212009-06-17 11:13:28 +00006197 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6198 nOld, nNew, nCell));
danielk1977cd581a72009-06-23 15:43:39 +00006199
drh8b2f49b2001-06-08 00:21:52 +00006200 /*
drh14acc042001-06-10 19:56:58 +00006201 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00006202 */
drh14acc042001-06-10 19:56:58 +00006203balance_cleanup:
drhfacf0302008-06-17 15:12:00 +00006204 sqlite3ScratchFree(apCell);
drh8b2f49b2001-06-08 00:21:52 +00006205 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00006206 releasePage(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00006207 }
drh14acc042001-06-10 19:56:58 +00006208 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00006209 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00006210 }
danielk1977eaa06f62008-09-18 17:34:44 +00006211
drh8b2f49b2001-06-08 00:21:52 +00006212 return rc;
6213}
6214
drh43605152004-05-29 21:46:49 +00006215
6216/*
danielk1977a50d9aa2009-06-08 14:49:45 +00006217** This function is called when the root page of a b-tree structure is
6218** overfull (has one or more overflow pages).
drh43605152004-05-29 21:46:49 +00006219**
danielk1977a50d9aa2009-06-08 14:49:45 +00006220** A new child page is allocated and the contents of the current root
6221** page, including overflow cells, are copied into the child. The root
6222** page is then overwritten to make it an empty page with the right-child
6223** pointer pointing to the new page.
6224**
6225** Before returning, all pointer-map entries corresponding to pages
6226** that the new child-page now contains pointers to are updated. The
6227** entry corresponding to the new right-child pointer of the root
6228** page is also updated.
6229**
6230** If successful, *ppChild is set to contain a reference to the child
6231** page and SQLITE_OK is returned. In this case the caller is required
6232** to call releasePage() on *ppChild exactly once. If an error occurs,
6233** an error code is returned and *ppChild is set to 0.
drh43605152004-05-29 21:46:49 +00006234*/
danielk1977a50d9aa2009-06-08 14:49:45 +00006235static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6236 int rc; /* Return value from subprocedures */
6237 MemPage *pChild = 0; /* Pointer to a new child page */
shane5eff7cf2009-08-10 03:57:58 +00006238 Pgno pgnoChild = 0; /* Page number of the new child page */
danielk1977a50d9aa2009-06-08 14:49:45 +00006239 BtShared *pBt = pRoot->pBt; /* The BTree */
drh43605152004-05-29 21:46:49 +00006240
danielk1977a50d9aa2009-06-08 14:49:45 +00006241 assert( pRoot->nOverflow>0 );
drh1fee73e2007-08-29 04:00:57 +00006242 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977bc2ca9e2008-11-13 14:28:28 +00006243
danielk1977a50d9aa2009-06-08 14:49:45 +00006244 /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6245 ** page that will become the new right-child of pPage. Copy the contents
6246 ** of the node stored on pRoot into the new child page.
6247 */
drh98add2e2009-07-20 17:11:49 +00006248 rc = sqlite3PagerWrite(pRoot->pDbPage);
6249 if( rc==SQLITE_OK ){
6250 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
drhc314dc72009-07-21 11:52:34 +00006251 copyNodeContent(pRoot, pChild, &rc);
6252 if( ISAUTOVACUUM ){
6253 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
drh98add2e2009-07-20 17:11:49 +00006254 }
6255 }
6256 if( rc ){
danielk1977a50d9aa2009-06-08 14:49:45 +00006257 *ppChild = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00006258 releasePage(pChild);
danielk1977a50d9aa2009-06-08 14:49:45 +00006259 return rc;
danielk197771d5d2c2008-09-29 11:49:47 +00006260 }
danielk1977a50d9aa2009-06-08 14:49:45 +00006261 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6262 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6263 assert( pChild->nCell==pRoot->nCell );
danielk197771d5d2c2008-09-29 11:49:47 +00006264
danielk1977a50d9aa2009-06-08 14:49:45 +00006265 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6266
6267 /* Copy the overflow cells from pRoot to pChild */
6268 memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
6269 pChild->nOverflow = pRoot->nOverflow;
danielk1977a50d9aa2009-06-08 14:49:45 +00006270
6271 /* Zero the contents of pRoot. Then install pChild as the right-child. */
6272 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6273 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6274
6275 *ppChild = pChild;
6276 return SQLITE_OK;
drh43605152004-05-29 21:46:49 +00006277}
6278
6279/*
danielk197771d5d2c2008-09-29 11:49:47 +00006280** The page that pCur currently points to has just been modified in
6281** some way. This function figures out if this modification means the
6282** tree needs to be balanced, and if so calls the appropriate balancing
danielk1977a50d9aa2009-06-08 14:49:45 +00006283** routine. Balancing routines are:
6284**
6285** balance_quick()
danielk1977a50d9aa2009-06-08 14:49:45 +00006286** balance_deeper()
6287** balance_nonroot()
drh43605152004-05-29 21:46:49 +00006288*/
danielk1977a50d9aa2009-06-08 14:49:45 +00006289static int balance(BtCursor *pCur){
drh43605152004-05-29 21:46:49 +00006290 int rc = SQLITE_OK;
danielk1977a50d9aa2009-06-08 14:49:45 +00006291 const int nMin = pCur->pBt->usableSize * 2 / 3;
6292 u8 aBalanceQuickSpace[13];
6293 u8 *pFree = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00006294
shane75ac1de2009-06-09 18:58:52 +00006295 TESTONLY( int balance_quick_called = 0 );
6296 TESTONLY( int balance_deeper_called = 0 );
danielk1977a50d9aa2009-06-08 14:49:45 +00006297
6298 do {
6299 int iPage = pCur->iPage;
6300 MemPage *pPage = pCur->apPage[iPage];
6301
6302 if( iPage==0 ){
6303 if( pPage->nOverflow ){
6304 /* The root page of the b-tree is overfull. In this case call the
6305 ** balance_deeper() function to create a new child for the root-page
6306 ** and copy the current contents of the root-page to it. The
6307 ** next iteration of the do-loop will balance the child page.
6308 */
6309 assert( (balance_deeper_called++)==0 );
6310 rc = balance_deeper(pPage, &pCur->apPage[1]);
6311 if( rc==SQLITE_OK ){
6312 pCur->iPage = 1;
6313 pCur->aiIdx[0] = 0;
6314 pCur->aiIdx[1] = 0;
6315 assert( pCur->apPage[1]->nOverflow );
6316 }
danielk1977a50d9aa2009-06-08 14:49:45 +00006317 }else{
danielk1977a50d9aa2009-06-08 14:49:45 +00006318 break;
6319 }
6320 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6321 break;
6322 }else{
6323 MemPage * const pParent = pCur->apPage[iPage-1];
6324 int const iIdx = pCur->aiIdx[iPage-1];
6325
6326 rc = sqlite3PagerWrite(pParent->pDbPage);
6327 if( rc==SQLITE_OK ){
6328#ifndef SQLITE_OMIT_QUICKBALANCE
6329 if( pPage->hasData
6330 && pPage->nOverflow==1
6331 && pPage->aOvfl[0].idx==pPage->nCell
6332 && pParent->pgno!=1
6333 && pParent->nCell==iIdx
6334 ){
6335 /* Call balance_quick() to create a new sibling of pPage on which
6336 ** to store the overflow cell. balance_quick() inserts a new cell
6337 ** into pParent, which may cause pParent overflow. If this
6338 ** happens, the next interation of the do-loop will balance pParent
6339 ** use either balance_nonroot() or balance_deeper(). Until this
6340 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6341 ** buffer.
6342 **
6343 ** The purpose of the following assert() is to check that only a
6344 ** single call to balance_quick() is made for each call to this
6345 ** function. If this were not verified, a subtle bug involving reuse
6346 ** of the aBalanceQuickSpace[] might sneak in.
6347 */
6348 assert( (balance_quick_called++)==0 );
6349 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6350 }else
6351#endif
6352 {
6353 /* In this case, call balance_nonroot() to redistribute cells
6354 ** between pPage and up to 2 of its sibling pages. This involves
6355 ** modifying the contents of pParent, which may cause pParent to
6356 ** become overfull or underfull. The next iteration of the do-loop
6357 ** will balance the parent page to correct this.
6358 **
6359 ** If the parent page becomes overfull, the overflow cell or cells
6360 ** are stored in the pSpace buffer allocated immediately below.
6361 ** A subsequent iteration of the do-loop will deal with this by
6362 ** calling balance_nonroot() (balance_deeper() may be called first,
6363 ** but it doesn't deal with overflow cells - just moves them to a
6364 ** different page). Once this subsequent call to balance_nonroot()
6365 ** has completed, it is safe to release the pSpace buffer used by
6366 ** the previous call, as the overflow cell data will have been
6367 ** copied either into the body of a database page or into the new
6368 ** pSpace buffer passed to the latter call to balance_nonroot().
6369 */
6370 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
danielk1977cd581a72009-06-23 15:43:39 +00006371 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
danielk1977a50d9aa2009-06-08 14:49:45 +00006372 if( pFree ){
6373 /* If pFree is not NULL, it points to the pSpace buffer used
6374 ** by a previous call to balance_nonroot(). Its contents are
6375 ** now stored either on real database pages or within the
6376 ** new pSpace buffer, so it may be safely freed here. */
6377 sqlite3PageFree(pFree);
6378 }
6379
danielk19774dbaa892009-06-16 16:50:22 +00006380 /* The pSpace buffer will be freed after the next call to
6381 ** balance_nonroot(), or just before this function returns, whichever
6382 ** comes first. */
danielk1977a50d9aa2009-06-08 14:49:45 +00006383 pFree = pSpace;
danielk1977a50d9aa2009-06-08 14:49:45 +00006384 }
6385 }
6386
6387 pPage->nOverflow = 0;
6388
6389 /* The next iteration of the do-loop balances the parent page. */
6390 releasePage(pPage);
6391 pCur->iPage--;
drh43605152004-05-29 21:46:49 +00006392 }
danielk1977a50d9aa2009-06-08 14:49:45 +00006393 }while( rc==SQLITE_OK );
6394
6395 if( pFree ){
6396 sqlite3PageFree(pFree);
drh43605152004-05-29 21:46:49 +00006397 }
6398 return rc;
6399}
6400
drhf74b8d92002-09-01 23:20:45 +00006401
6402/*
drh3b7511c2001-05-26 13:15:44 +00006403** Insert a new record into the BTree. The key is given by (pKey,nKey)
6404** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00006405** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00006406** is left pointing at a random location.
6407**
6408** For an INTKEY table, only the nKey value of the key is used. pKey is
6409** ignored. For a ZERODATA table, the pData and nData are both ignored.
danielk1977de630352009-05-04 11:42:29 +00006410**
6411** If the seekResult parameter is non-zero, then a successful call to
danielk19773509a652009-07-06 18:56:13 +00006412** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
danielk1977de630352009-05-04 11:42:29 +00006413** been performed. seekResult is the search result returned (a negative
6414** number if pCur points at an entry that is smaller than (pKey, nKey), or
6415** a positive value if pCur points at an etry that is larger than
6416** (pKey, nKey)).
6417**
drh3e9ca092009-09-08 01:14:48 +00006418** If the seekResult parameter is non-zero, then the caller guarantees that
6419** cursor pCur is pointing at the existing copy of a row that is to be
6420** overwritten. If the seekResult parameter is 0, then cursor pCur may
6421** point to any entry or to no entry at all and so this function has to seek
danielk1977de630352009-05-04 11:42:29 +00006422** the cursor before the new key can be inserted.
drh3b7511c2001-05-26 13:15:44 +00006423*/
drh3aac2dd2004-04-26 14:10:20 +00006424int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00006425 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00006426 const void *pKey, i64 nKey, /* The key of the new record */
drhe4d90812007-03-29 05:51:49 +00006427 const void *pData, int nData, /* The data of the new record */
drhb026e052007-05-02 01:34:31 +00006428 int nZero, /* Number of extra 0 bytes to append to data */
danielk1977de630352009-05-04 11:42:29 +00006429 int appendBias, /* True if this is likely an append */
danielk19773509a652009-07-06 18:56:13 +00006430 int seekResult /* Result of prior MovetoUnpacked() call */
drh3b7511c2001-05-26 13:15:44 +00006431){
drh3b7511c2001-05-26 13:15:44 +00006432 int rc;
drh3e9ca092009-09-08 01:14:48 +00006433 int loc = seekResult; /* -1: before desired location +1: after */
drh14acc042001-06-10 19:56:58 +00006434 int szNew;
danielk197771d5d2c2008-09-29 11:49:47 +00006435 int idx;
drh3b7511c2001-05-26 13:15:44 +00006436 MemPage *pPage;
drhd677b3d2007-08-20 22:48:41 +00006437 Btree *p = pCur->pBtree;
6438 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00006439 unsigned char *oldCell;
drh2e38c322004-09-03 18:38:44 +00006440 unsigned char *newCell = 0;
drh3b7511c2001-05-26 13:15:44 +00006441
drh98add2e2009-07-20 17:11:49 +00006442 if( pCur->eState==CURSOR_FAULT ){
6443 assert( pCur->skipNext!=SQLITE_OK );
6444 return pCur->skipNext;
6445 }
6446
drh1fee73e2007-08-29 04:00:57 +00006447 assert( cursorHoldsMutex(pCur) );
danielk197731d31b82009-07-13 13:18:07 +00006448 assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly );
danielk197796d48e92009-06-29 06:00:37 +00006449 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6450
danielk197731d31b82009-07-13 13:18:07 +00006451 /* Assert that the caller has been consistent. If this cursor was opened
6452 ** expecting an index b-tree, then the caller should be inserting blob
6453 ** keys with no associated data. If the cursor was opened expecting an
6454 ** intkey table, the caller should be inserting integer keys with a
6455 ** blob of associated data. */
6456 assert( (pKey==0)==(pCur->pKeyInfo==0) );
6457
danielk197796d48e92009-06-29 06:00:37 +00006458 /* If this is an insert into a table b-tree, invalidate any incrblob
6459 ** cursors open on the row being replaced (assuming this is a replace
6460 ** operation - if it is not, the following is a no-op). */
6461 if( pCur->pKeyInfo==0 ){
drheeb844a2009-08-08 18:01:07 +00006462 invalidateIncrblobCursors(p, nKey, 0);
drhf74b8d92002-09-01 23:20:45 +00006463 }
danielk197796d48e92009-06-29 06:00:37 +00006464
danielk19779c3acf32009-05-02 07:36:49 +00006465 /* Save the positions of any other cursors open on this table.
6466 **
danielk19773509a652009-07-06 18:56:13 +00006467 ** In some cases, the call to btreeMoveto() below is a no-op. For
danielk19779c3acf32009-05-02 07:36:49 +00006468 ** example, when inserting data into a table with auto-generated integer
6469 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6470 ** integer key to use. It then calls this function to actually insert the
danielk19773509a652009-07-06 18:56:13 +00006471 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
danielk19779c3acf32009-05-02 07:36:49 +00006472 ** that the cursor is already where it needs to be and returns without
6473 ** doing any work. To avoid thwarting these optimizations, it is important
6474 ** not to clear the cursor here.
6475 */
drh4c301aa2009-07-15 17:25:45 +00006476 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6477 if( rc ) return rc;
6478 if( !loc ){
6479 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6480 if( rc ) return rc;
danielk1977da184232006-01-05 11:34:32 +00006481 }
danielk1977b980d2212009-06-22 18:03:51 +00006482 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
danielk1977da184232006-01-05 11:34:32 +00006483
danielk197771d5d2c2008-09-29 11:49:47 +00006484 pPage = pCur->apPage[pCur->iPage];
drh4a1c3802004-05-12 15:15:47 +00006485 assert( pPage->intKey || nKey>=0 );
drh44845222008-07-17 18:39:57 +00006486 assert( pPage->leaf || !pPage->intKey );
danielk19778f880a82009-07-13 09:41:45 +00006487
drh3a4c1412004-05-09 20:40:11 +00006488 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6489 pCur->pgnoRoot, nKey, nData, pPage->pgno,
6490 loc==0 ? "overwrite" : "new entry"));
danielk197771d5d2c2008-09-29 11:49:47 +00006491 assert( pPage->isInit );
danielk197752ae7242008-03-25 14:24:56 +00006492 allocateTempSpace(pBt);
6493 newCell = pBt->pTmpSpace;
drh2e38c322004-09-03 18:38:44 +00006494 if( newCell==0 ) return SQLITE_NOMEM;
drhb026e052007-05-02 01:34:31 +00006495 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
drh2e38c322004-09-03 18:38:44 +00006496 if( rc ) goto end_insert;
drh43605152004-05-29 21:46:49 +00006497 assert( szNew==cellSizePtr(pPage, newCell) );
drh2e38c322004-09-03 18:38:44 +00006498 assert( szNew<=MX_CELL_SIZE(pBt) );
danielk197771d5d2c2008-09-29 11:49:47 +00006499 idx = pCur->aiIdx[pCur->iPage];
danielk1977b980d2212009-06-22 18:03:51 +00006500 if( loc==0 ){
drha9121e42008-02-19 14:59:35 +00006501 u16 szOld;
danielk197771d5d2c2008-09-29 11:49:47 +00006502 assert( idx<pPage->nCell );
danielk19776e465eb2007-08-21 13:11:00 +00006503 rc = sqlite3PagerWrite(pPage->pDbPage);
6504 if( rc ){
6505 goto end_insert;
6506 }
danielk197771d5d2c2008-09-29 11:49:47 +00006507 oldCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00006508 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006509 memcpy(newCell, oldCell, 4);
drh4b70f112004-05-02 21:12:19 +00006510 }
drh43605152004-05-29 21:46:49 +00006511 szOld = cellSizePtr(pPage, oldCell);
drh4b70f112004-05-02 21:12:19 +00006512 rc = clearCell(pPage, oldCell);
drh98add2e2009-07-20 17:11:49 +00006513 dropCell(pPage, idx, szOld, &rc);
drh2e38c322004-09-03 18:38:44 +00006514 if( rc ) goto end_insert;
drh7c717f72001-06-24 20:39:41 +00006515 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00006516 assert( pPage->leaf );
danielk197771d5d2c2008-09-29 11:49:47 +00006517 idx = ++pCur->aiIdx[pCur->iPage];
drh14acc042001-06-10 19:56:58 +00006518 }else{
drh4b70f112004-05-02 21:12:19 +00006519 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00006520 }
drh98add2e2009-07-20 17:11:49 +00006521 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
danielk19773f632d52009-05-02 10:03:09 +00006522 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
drh9bf9e9c2008-12-05 20:01:43 +00006523
danielk1977a50d9aa2009-06-08 14:49:45 +00006524 /* If no error has occured and pPage has an overflow cell, call balance()
6525 ** to redistribute the cells within the tree. Since balance() may move
6526 ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6527 ** variables.
danielk19773f632d52009-05-02 10:03:09 +00006528 **
danielk1977a50d9aa2009-06-08 14:49:45 +00006529 ** Previous versions of SQLite called moveToRoot() to move the cursor
6530 ** back to the root page as balance() used to invalidate the contents
danielk197754109bb2009-06-23 11:22:29 +00006531 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6532 ** set the cursor state to "invalid". This makes common insert operations
6533 ** slightly faster.
danielk19773f632d52009-05-02 10:03:09 +00006534 **
danielk1977a50d9aa2009-06-08 14:49:45 +00006535 ** There is a subtle but important optimization here too. When inserting
6536 ** multiple records into an intkey b-tree using a single cursor (as can
6537 ** happen while processing an "INSERT INTO ... SELECT" statement), it
6538 ** is advantageous to leave the cursor pointing to the last entry in
6539 ** the b-tree if possible. If the cursor is left pointing to the last
6540 ** entry in the table, and the next row inserted has an integer key
6541 ** larger than the largest existing key, it is possible to insert the
6542 ** row without seeking the cursor. This can be a big performance boost.
danielk19773f632d52009-05-02 10:03:09 +00006543 */
danielk1977a50d9aa2009-06-08 14:49:45 +00006544 pCur->info.nSize = 0;
6545 pCur->validNKey = 0;
6546 if( rc==SQLITE_OK && pPage->nOverflow ){
danielk1977a50d9aa2009-06-08 14:49:45 +00006547 rc = balance(pCur);
6548
6549 /* Must make sure nOverflow is reset to zero even if the balance()
danielk197754109bb2009-06-23 11:22:29 +00006550 ** fails. Internal data structure corruption will result otherwise.
6551 ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6552 ** from trying to save the current position of the cursor. */
danielk1977a50d9aa2009-06-08 14:49:45 +00006553 pCur->apPage[pCur->iPage]->nOverflow = 0;
danielk197754109bb2009-06-23 11:22:29 +00006554 pCur->eState = CURSOR_INVALID;
danielk19773f632d52009-05-02 10:03:09 +00006555 }
danielk1977a50d9aa2009-06-08 14:49:45 +00006556 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
drh9bf9e9c2008-12-05 20:01:43 +00006557
drh2e38c322004-09-03 18:38:44 +00006558end_insert:
drh5e2f8b92001-05-28 00:41:15 +00006559 return rc;
6560}
6561
6562/*
drh4b70f112004-05-02 21:12:19 +00006563** Delete the entry that the cursor is pointing to. The cursor
drhf94a1732008-09-30 17:18:17 +00006564** is left pointing at a arbitrary location.
drh3b7511c2001-05-26 13:15:44 +00006565*/
drh3aac2dd2004-04-26 14:10:20 +00006566int sqlite3BtreeDelete(BtCursor *pCur){
drhd677b3d2007-08-20 22:48:41 +00006567 Btree *p = pCur->pBtree;
danielk19774dbaa892009-06-16 16:50:22 +00006568 BtShared *pBt = p->pBt;
6569 int rc; /* Return code */
6570 MemPage *pPage; /* Page to delete cell from */
6571 unsigned char *pCell; /* Pointer to cell to delete */
6572 int iCellIdx; /* Index of cell to delete */
6573 int iCellDepth; /* Depth of node containing pCell */
drh8b2f49b2001-06-08 00:21:52 +00006574
drh1fee73e2007-08-29 04:00:57 +00006575 assert( cursorHoldsMutex(pCur) );
drh64022502009-01-09 14:11:04 +00006576 assert( pBt->inTransaction==TRANS_WRITE );
drhf74b8d92002-09-01 23:20:45 +00006577 assert( !pBt->readOnly );
drh64022502009-01-09 14:11:04 +00006578 assert( pCur->wrFlag );
danielk197796d48e92009-06-29 06:00:37 +00006579 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6580 assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6581
danielk19774dbaa892009-06-16 16:50:22 +00006582 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6583 || NEVER(pCur->eState!=CURSOR_VALID)
6584 ){
6585 return SQLITE_ERROR; /* Something has gone awry. */
drhf74b8d92002-09-01 23:20:45 +00006586 }
danielk1977da184232006-01-05 11:34:32 +00006587
danielk197796d48e92009-06-29 06:00:37 +00006588 /* If this is a delete operation to remove a row from a table b-tree,
6589 ** invalidate any incrblob cursors open on the row being deleted. */
6590 if( pCur->pKeyInfo==0 ){
drheeb844a2009-08-08 18:01:07 +00006591 invalidateIncrblobCursors(p, pCur->info.nKey, 0);
danielk19774dbaa892009-06-16 16:50:22 +00006592 }
6593
6594 iCellDepth = pCur->iPage;
6595 iCellIdx = pCur->aiIdx[iCellDepth];
6596 pPage = pCur->apPage[iCellDepth];
6597 pCell = findCell(pPage, iCellIdx);
6598
6599 /* If the page containing the entry to delete is not a leaf page, move
6600 ** the cursor to the largest entry in the tree that is smaller than
6601 ** the entry being deleted. This cell will replace the cell being deleted
6602 ** from the internal node. The 'previous' entry is used for this instead
6603 ** of the 'next' entry, as the previous entry is always a part of the
6604 ** sub-tree headed by the child page of the cell being deleted. This makes
6605 ** balancing the tree following the delete operation easier. */
6606 if( !pPage->leaf ){
6607 int notUsed;
drh4c301aa2009-07-15 17:25:45 +00006608 rc = sqlite3BtreePrevious(pCur, &notUsed);
6609 if( rc ) return rc;
danielk19774dbaa892009-06-16 16:50:22 +00006610 }
6611
6612 /* Save the positions of any other cursors open on this table before
6613 ** making any modifications. Make the page containing the entry to be
6614 ** deleted writable. Then free any overflow pages associated with the
drha4ec1d42009-07-11 13:13:11 +00006615 ** entry and finally remove the cell itself from within the page.
6616 */
6617 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6618 if( rc ) return rc;
6619 rc = sqlite3PagerWrite(pPage->pDbPage);
6620 if( rc ) return rc;
6621 rc = clearCell(pPage, pCell);
drh98add2e2009-07-20 17:11:49 +00006622 dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
drha4ec1d42009-07-11 13:13:11 +00006623 if( rc ) return rc;
danielk1977e6efa742004-11-10 11:55:10 +00006624
danielk19774dbaa892009-06-16 16:50:22 +00006625 /* If the cell deleted was not located on a leaf page, then the cursor
6626 ** is currently pointing to the largest entry in the sub-tree headed
6627 ** by the child-page of the cell that was just deleted from an internal
6628 ** node. The cell from the leaf node needs to be moved to the internal
6629 ** node to replace the deleted cell. */
drh4b70f112004-05-02 21:12:19 +00006630 if( !pPage->leaf ){
danielk19774dbaa892009-06-16 16:50:22 +00006631 MemPage *pLeaf = pCur->apPage[pCur->iPage];
6632 int nCell;
6633 Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6634 unsigned char *pTmp;
danielk1977e6efa742004-11-10 11:55:10 +00006635
danielk19774dbaa892009-06-16 16:50:22 +00006636 pCell = findCell(pLeaf, pLeaf->nCell-1);
6637 nCell = cellSizePtr(pLeaf, pCell);
6638 assert( MX_CELL_SIZE(pBt)>=nCell );
danielk197771d5d2c2008-09-29 11:49:47 +00006639
danielk19774dbaa892009-06-16 16:50:22 +00006640 allocateTempSpace(pBt);
6641 pTmp = pBt->pTmpSpace;
danielk19772f78fc62008-09-30 09:31:45 +00006642
drha4ec1d42009-07-11 13:13:11 +00006643 rc = sqlite3PagerWrite(pLeaf->pDbPage);
drh98add2e2009-07-20 17:11:49 +00006644 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6645 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
drha4ec1d42009-07-11 13:13:11 +00006646 if( rc ) return rc;
drh5e2f8b92001-05-28 00:41:15 +00006647 }
danielk19774dbaa892009-06-16 16:50:22 +00006648
6649 /* Balance the tree. If the entry deleted was located on a leaf page,
6650 ** then the cursor still points to that page. In this case the first
6651 ** call to balance() repairs the tree, and the if(...) condition is
6652 ** never true.
6653 **
6654 ** Otherwise, if the entry deleted was on an internal node page, then
6655 ** pCur is pointing to the leaf page from which a cell was removed to
6656 ** replace the cell deleted from the internal node. This is slightly
6657 ** tricky as the leaf node may be underfull, and the internal node may
6658 ** be either under or overfull. In this case run the balancing algorithm
6659 ** on the leaf node first. If the balance proceeds far enough up the
6660 ** tree that we can be sure that any problem in the internal node has
6661 ** been corrected, so be it. Otherwise, after balancing the leaf node,
6662 ** walk the cursor up the tree to the internal node and balance it as
6663 ** well. */
6664 rc = balance(pCur);
6665 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
6666 while( pCur->iPage>iCellDepth ){
6667 releasePage(pCur->apPage[pCur->iPage--]);
6668 }
6669 rc = balance(pCur);
6670 }
6671
danielk19776b456a22005-03-21 04:04:02 +00006672 if( rc==SQLITE_OK ){
6673 moveToRoot(pCur);
6674 }
drh5e2f8b92001-05-28 00:41:15 +00006675 return rc;
drh3b7511c2001-05-26 13:15:44 +00006676}
drh8b2f49b2001-06-08 00:21:52 +00006677
6678/*
drhc6b52df2002-01-04 03:09:29 +00006679** Create a new BTree table. Write into *piTable the page
6680** number for the root page of the new table.
6681**
drhab01f612004-05-22 02:55:23 +00006682** The type of table is determined by the flags parameter. Only the
6683** following values of flags are currently in use. Other values for
6684** flags might not work:
6685**
6686** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6687** BTREE_ZERODATA Used for SQL indices
drh8b2f49b2001-06-08 00:21:52 +00006688*/
drhd677b3d2007-08-20 22:48:41 +00006689static int btreeCreateTable(Btree *p, int *piTable, int flags){
danielk1977aef0bf62005-12-30 16:28:01 +00006690 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006691 MemPage *pRoot;
6692 Pgno pgnoRoot;
6693 int rc;
drhd677b3d2007-08-20 22:48:41 +00006694
drh1fee73e2007-08-29 04:00:57 +00006695 assert( sqlite3BtreeHoldsMutex(p) );
drh64022502009-01-09 14:11:04 +00006696 assert( pBt->inTransaction==TRANS_WRITE );
danielk197728129562005-01-11 10:25:06 +00006697 assert( !pBt->readOnly );
danielk1977e6efa742004-11-10 11:55:10 +00006698
danielk1977003ba062004-11-04 02:57:33 +00006699#ifdef SQLITE_OMIT_AUTOVACUUM
drh4f0c5872007-03-26 22:05:01 +00006700 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
drhd677b3d2007-08-20 22:48:41 +00006701 if( rc ){
6702 return rc;
6703 }
danielk1977003ba062004-11-04 02:57:33 +00006704#else
danielk1977687566d2004-11-02 12:56:41 +00006705 if( pBt->autoVacuum ){
danielk1977003ba062004-11-04 02:57:33 +00006706 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6707 MemPage *pPageMove; /* The page to move to. */
6708
danielk197720713f32007-05-03 11:43:33 +00006709 /* Creating a new table may require moving an existing database page
6710 ** to make room for the new table's root page. In case this page turns
6711 ** out to be an overflow page, delete all overflow page-map caches
6712 ** held by open cursors.
6713 */
danielk197792d4d7a2007-05-04 12:05:56 +00006714 invalidateAllOverflowCache(pBt);
danielk197720713f32007-05-03 11:43:33 +00006715
danielk1977003ba062004-11-04 02:57:33 +00006716 /* Read the value of meta[3] from the database to determine where the
6717 ** root page of the new table should go. meta[3] is the largest root-page
6718 ** created so far, so the new root-page is (meta[3]+1).
6719 */
danielk1977602b4662009-07-02 07:47:33 +00006720 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00006721 pgnoRoot++;
6722
danielk1977599fcba2004-11-08 07:13:13 +00006723 /* The new root-page may not be allocated on a pointer-map page, or the
6724 ** PENDING_BYTE page.
6725 */
drh72190432008-01-31 14:54:43 +00006726 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
danielk1977599fcba2004-11-08 07:13:13 +00006727 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
danielk1977003ba062004-11-04 02:57:33 +00006728 pgnoRoot++;
6729 }
6730 assert( pgnoRoot>=3 );
6731
6732 /* Allocate a page. The page that currently resides at pgnoRoot will
6733 ** be moved to the allocated page (unless the allocated page happens
6734 ** to reside at pgnoRoot).
6735 */
drh4f0c5872007-03-26 22:05:01 +00006736 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
danielk1977003ba062004-11-04 02:57:33 +00006737 if( rc!=SQLITE_OK ){
danielk1977687566d2004-11-02 12:56:41 +00006738 return rc;
6739 }
danielk1977003ba062004-11-04 02:57:33 +00006740
6741 if( pgnoMove!=pgnoRoot ){
danielk1977f35843b2007-04-07 15:03:17 +00006742 /* pgnoRoot is the page that will be used for the root-page of
6743 ** the new table (assuming an error did not occur). But we were
6744 ** allocated pgnoMove. If required (i.e. if it was not allocated
6745 ** by extending the file), the current page at position pgnoMove
6746 ** is already journaled.
6747 */
drheeb844a2009-08-08 18:01:07 +00006748 u8 eType = 0;
6749 Pgno iPtrPage = 0;
danielk1977003ba062004-11-04 02:57:33 +00006750
6751 releasePage(pPageMove);
danielk1977f35843b2007-04-07 15:03:17 +00006752
6753 /* Move the page currently at pgnoRoot to pgnoMove. */
danielk197730548662009-07-09 05:07:37 +00006754 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006755 if( rc!=SQLITE_OK ){
6756 return rc;
6757 }
6758 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
drh27731d72009-06-22 12:05:10 +00006759 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6760 rc = SQLITE_CORRUPT_BKPT;
6761 }
6762 if( rc!=SQLITE_OK ){
danielk1977003ba062004-11-04 02:57:33 +00006763 releasePage(pRoot);
6764 return rc;
6765 }
drhccae6022005-02-26 17:31:26 +00006766 assert( eType!=PTRMAP_ROOTPAGE );
6767 assert( eType!=PTRMAP_FREEPAGE );
danielk19774c999992008-07-16 18:17:55 +00006768 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
danielk1977003ba062004-11-04 02:57:33 +00006769 releasePage(pRoot);
danielk1977f35843b2007-04-07 15:03:17 +00006770
6771 /* Obtain the page at pgnoRoot */
danielk1977003ba062004-11-04 02:57:33 +00006772 if( rc!=SQLITE_OK ){
6773 return rc;
6774 }
danielk197730548662009-07-09 05:07:37 +00006775 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006776 if( rc!=SQLITE_OK ){
6777 return rc;
6778 }
danielk19773b8a05f2007-03-19 17:44:26 +00006779 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk1977003ba062004-11-04 02:57:33 +00006780 if( rc!=SQLITE_OK ){
6781 releasePage(pRoot);
6782 return rc;
6783 }
6784 }else{
6785 pRoot = pPageMove;
6786 }
6787
danielk197742741be2005-01-08 12:42:39 +00006788 /* Update the pointer-map and meta-data with the new root-page number. */
drh98add2e2009-07-20 17:11:49 +00006789 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
danielk1977003ba062004-11-04 02:57:33 +00006790 if( rc ){
6791 releasePage(pRoot);
6792 return rc;
6793 }
danielk1977aef0bf62005-12-30 16:28:01 +00006794 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00006795 if( rc ){
6796 releasePage(pRoot);
6797 return rc;
6798 }
danielk197742741be2005-01-08 12:42:39 +00006799
danielk1977003ba062004-11-04 02:57:33 +00006800 }else{
drh4f0c5872007-03-26 22:05:01 +00006801 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
danielk1977003ba062004-11-04 02:57:33 +00006802 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00006803 }
6804#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006805 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
drhde647132004-05-07 17:57:49 +00006806 zeroPage(pRoot, flags | PTF_LEAF);
danielk19773b8a05f2007-03-19 17:44:26 +00006807 sqlite3PagerUnref(pRoot->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00006808 *piTable = (int)pgnoRoot;
6809 return SQLITE_OK;
6810}
drhd677b3d2007-08-20 22:48:41 +00006811int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6812 int rc;
6813 sqlite3BtreeEnter(p);
6814 rc = btreeCreateTable(p, piTable, flags);
6815 sqlite3BtreeLeave(p);
6816 return rc;
6817}
drh8b2f49b2001-06-08 00:21:52 +00006818
6819/*
6820** Erase the given database page and all its children. Return
6821** the page to the freelist.
6822*/
drh4b70f112004-05-02 21:12:19 +00006823static int clearDatabasePage(
danielk1977aef0bf62005-12-30 16:28:01 +00006824 BtShared *pBt, /* The BTree that contains the table */
drh4b70f112004-05-02 21:12:19 +00006825 Pgno pgno, /* Page number to clear */
danielk1977c7af4842008-10-27 13:59:33 +00006826 int freePageFlag, /* Deallocate page if true */
6827 int *pnChange
drh4b70f112004-05-02 21:12:19 +00006828){
danielk1977146ba992009-07-22 14:08:13 +00006829 MemPage *pPage;
drh8b2f49b2001-06-08 00:21:52 +00006830 int rc;
drh4b70f112004-05-02 21:12:19 +00006831 unsigned char *pCell;
6832 int i;
drh8b2f49b2001-06-08 00:21:52 +00006833
drh1fee73e2007-08-29 04:00:57 +00006834 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197789d40042008-11-17 14:20:56 +00006835 if( pgno>pagerPagecount(pBt) ){
drh49285702005-09-17 15:20:26 +00006836 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00006837 }
6838
danielk197771d5d2c2008-09-29 11:49:47 +00006839 rc = getAndInitPage(pBt, pgno, &pPage);
danielk1977146ba992009-07-22 14:08:13 +00006840 if( rc ) return rc;
drh4b70f112004-05-02 21:12:19 +00006841 for(i=0; i<pPage->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00006842 pCell = findCell(pPage, i);
drh4b70f112004-05-02 21:12:19 +00006843 if( !pPage->leaf ){
danielk197762c14b32008-11-19 09:05:26 +00006844 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006845 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006846 }
drh4b70f112004-05-02 21:12:19 +00006847 rc = clearCell(pPage, pCell);
danielk19776b456a22005-03-21 04:04:02 +00006848 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006849 }
drha34b6762004-05-07 13:30:42 +00006850 if( !pPage->leaf ){
danielk197762c14b32008-11-19 09:05:26 +00006851 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006852 if( rc ) goto cleardatabasepage_out;
danielk1977c7af4842008-10-27 13:59:33 +00006853 }else if( pnChange ){
6854 assert( pPage->intKey );
6855 *pnChange += pPage->nCell;
drh2aa679f2001-06-25 02:11:07 +00006856 }
6857 if( freePageFlag ){
drhc314dc72009-07-21 11:52:34 +00006858 freePage(pPage, &rc);
danielk19773b8a05f2007-03-19 17:44:26 +00006859 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
drh3a4c1412004-05-09 20:40:11 +00006860 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00006861 }
danielk19776b456a22005-03-21 04:04:02 +00006862
6863cleardatabasepage_out:
drh4b70f112004-05-02 21:12:19 +00006864 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00006865 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006866}
6867
6868/*
drhab01f612004-05-22 02:55:23 +00006869** Delete all information from a single table in the database. iTable is
6870** the page number of the root of the table. After this routine returns,
6871** the root page is empty, but still exists.
6872**
6873** This routine will fail with SQLITE_LOCKED if there are any open
6874** read cursors on the table. Open write cursors are moved to the
6875** root of the table.
danielk1977c7af4842008-10-27 13:59:33 +00006876**
6877** If pnChange is not NULL, then table iTable must be an intkey table. The
6878** integer value pointed to by pnChange is incremented by the number of
6879** entries in the table.
drh8b2f49b2001-06-08 00:21:52 +00006880*/
danielk1977c7af4842008-10-27 13:59:33 +00006881int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
drh8b2f49b2001-06-08 00:21:52 +00006882 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00006883 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00006884 sqlite3BtreeEnter(p);
drh64022502009-01-09 14:11:04 +00006885 assert( p->inTrans==TRANS_WRITE );
danielk197796d48e92009-06-29 06:00:37 +00006886
6887 /* Invalidate all incrblob cursors open on table iTable (assuming iTable
6888 ** is the root of a table b-tree - if it is not, the following call is
6889 ** a no-op). */
drheeb844a2009-08-08 18:01:07 +00006890 invalidateIncrblobCursors(p, 0, 1);
danielk197796d48e92009-06-29 06:00:37 +00006891
drhc046e3e2009-07-15 11:26:44 +00006892 rc = saveAllCursors(pBt, (Pgno)iTable, 0);
6893 if( SQLITE_OK==rc ){
danielk197762c14b32008-11-19 09:05:26 +00006894 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
drh8b2f49b2001-06-08 00:21:52 +00006895 }
drhd677b3d2007-08-20 22:48:41 +00006896 sqlite3BtreeLeave(p);
6897 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006898}
6899
6900/*
6901** Erase all information in a table and add the root of the table to
6902** the freelist. Except, the root of the principle table (the one on
drhab01f612004-05-22 02:55:23 +00006903** page 1) is never added to the freelist.
6904**
6905** This routine will fail with SQLITE_LOCKED if there are any open
6906** cursors on the table.
drh205f48e2004-11-05 00:43:11 +00006907**
6908** If AUTOVACUUM is enabled and the page at iTable is not the last
6909** root page in the database file, then the last root page
6910** in the database file is moved into the slot formerly occupied by
6911** iTable and that last slot formerly occupied by the last root page
6912** is added to the freelist instead of iTable. In this say, all
6913** root pages are kept at the beginning of the database file, which
6914** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6915** page number that used to be the last root page in the file before
6916** the move. If no page gets moved, *piMoved is set to 0.
6917** The last root page is recorded in meta[3] and the value of
6918** meta[3] is updated by this procedure.
drh8b2f49b2001-06-08 00:21:52 +00006919*/
danielk197789d40042008-11-17 14:20:56 +00006920static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
drh8b2f49b2001-06-08 00:21:52 +00006921 int rc;
danielk1977a0bf2652004-11-04 14:30:04 +00006922 MemPage *pPage = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00006923 BtShared *pBt = p->pBt;
danielk1977a0bf2652004-11-04 14:30:04 +00006924
drh1fee73e2007-08-29 04:00:57 +00006925 assert( sqlite3BtreeHoldsMutex(p) );
drh64022502009-01-09 14:11:04 +00006926 assert( p->inTrans==TRANS_WRITE );
danielk1977a0bf2652004-11-04 14:30:04 +00006927
danielk1977e6efa742004-11-10 11:55:10 +00006928 /* It is illegal to drop a table if any cursors are open on the
6929 ** database. This is because in auto-vacuum mode the backend may
6930 ** need to move another root-page to fill a gap left by the deleted
6931 ** root page. If an open cursor was using this page a problem would
6932 ** occur.
drhc046e3e2009-07-15 11:26:44 +00006933 **
6934 ** This error is caught long before control reaches this point.
danielk1977e6efa742004-11-10 11:55:10 +00006935 */
drhc046e3e2009-07-15 11:26:44 +00006936 if( NEVER(pBt->pCursor) ){
danielk1977404ca072009-03-16 13:19:36 +00006937 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
6938 return SQLITE_LOCKED_SHAREDCACHE;
drh5df72a52002-06-06 23:16:05 +00006939 }
danielk1977a0bf2652004-11-04 14:30:04 +00006940
danielk197730548662009-07-09 05:07:37 +00006941 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
drh2aa679f2001-06-25 02:11:07 +00006942 if( rc ) return rc;
danielk1977c7af4842008-10-27 13:59:33 +00006943 rc = sqlite3BtreeClearTable(p, iTable, 0);
danielk19776b456a22005-03-21 04:04:02 +00006944 if( rc ){
6945 releasePage(pPage);
6946 return rc;
6947 }
danielk1977a0bf2652004-11-04 14:30:04 +00006948
drh205f48e2004-11-05 00:43:11 +00006949 *piMoved = 0;
danielk1977a0bf2652004-11-04 14:30:04 +00006950
drh4b70f112004-05-02 21:12:19 +00006951 if( iTable>1 ){
danielk1977a0bf2652004-11-04 14:30:04 +00006952#ifdef SQLITE_OMIT_AUTOVACUUM
drhc314dc72009-07-21 11:52:34 +00006953 freePage(pPage, &rc);
danielk1977a0bf2652004-11-04 14:30:04 +00006954 releasePage(pPage);
6955#else
6956 if( pBt->autoVacuum ){
6957 Pgno maxRootPgno;
danielk1977602b4662009-07-02 07:47:33 +00006958 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006959
6960 if( iTable==maxRootPgno ){
6961 /* If the table being dropped is the table with the largest root-page
6962 ** number in the database, put the root page on the free list.
6963 */
drhc314dc72009-07-21 11:52:34 +00006964 freePage(pPage, &rc);
danielk1977a0bf2652004-11-04 14:30:04 +00006965 releasePage(pPage);
6966 if( rc!=SQLITE_OK ){
6967 return rc;
6968 }
6969 }else{
6970 /* The table being dropped does not have the largest root-page
6971 ** number in the database. So move the page that does into the
6972 ** gap left by the deleted root-page.
6973 */
6974 MemPage *pMove;
6975 releasePage(pPage);
danielk197730548662009-07-09 05:07:37 +00006976 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006977 if( rc!=SQLITE_OK ){
6978 return rc;
6979 }
danielk19774c999992008-07-16 18:17:55 +00006980 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006981 releasePage(pMove);
6982 if( rc!=SQLITE_OK ){
6983 return rc;
6984 }
drhfe3313f2009-07-21 19:02:20 +00006985 pMove = 0;
danielk197730548662009-07-09 05:07:37 +00006986 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
drhc314dc72009-07-21 11:52:34 +00006987 freePage(pMove, &rc);
danielk1977a0bf2652004-11-04 14:30:04 +00006988 releasePage(pMove);
6989 if( rc!=SQLITE_OK ){
6990 return rc;
6991 }
6992 *piMoved = maxRootPgno;
6993 }
6994
danielk1977599fcba2004-11-08 07:13:13 +00006995 /* Set the new 'max-root-page' value in the database header. This
6996 ** is the old value less one, less one more if that happens to
6997 ** be a root-page number, less one again if that is the
6998 ** PENDING_BYTE_PAGE.
6999 */
danielk197787a6e732004-11-05 12:58:25 +00007000 maxRootPgno--;
drhe1849652009-07-15 18:15:22 +00007001 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7002 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
danielk197787a6e732004-11-05 12:58:25 +00007003 maxRootPgno--;
7004 }
danielk1977599fcba2004-11-08 07:13:13 +00007005 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7006
danielk1977aef0bf62005-12-30 16:28:01 +00007007 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00007008 }else{
drhc314dc72009-07-21 11:52:34 +00007009 freePage(pPage, &rc);
danielk1977a0bf2652004-11-04 14:30:04 +00007010 releasePage(pPage);
7011 }
7012#endif
drh2aa679f2001-06-25 02:11:07 +00007013 }else{
drhc046e3e2009-07-15 11:26:44 +00007014 /* If sqlite3BtreeDropTable was called on page 1.
7015 ** This really never should happen except in a corrupt
7016 ** database.
7017 */
drha34b6762004-05-07 13:30:42 +00007018 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
danielk1977a0bf2652004-11-04 14:30:04 +00007019 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00007020 }
drh8b2f49b2001-06-08 00:21:52 +00007021 return rc;
7022}
drhd677b3d2007-08-20 22:48:41 +00007023int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7024 int rc;
7025 sqlite3BtreeEnter(p);
7026 rc = btreeDropTable(p, iTable, piMoved);
7027 sqlite3BtreeLeave(p);
7028 return rc;
7029}
drh8b2f49b2001-06-08 00:21:52 +00007030
drh001bbcb2003-03-19 03:14:00 +00007031
drh8b2f49b2001-06-08 00:21:52 +00007032/*
danielk1977602b4662009-07-02 07:47:33 +00007033** This function may only be called if the b-tree connection already
7034** has a read or write transaction open on the database.
7035**
drh23e11ca2004-05-04 17:27:28 +00007036** Read the meta-information out of a database file. Meta[0]
7037** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00007038** through meta[15] are available for use by higher layers. Meta[0]
7039** is read-only, the others are read/write.
7040**
7041** The schema layer numbers meta values differently. At the schema
7042** layer (and the SetCookie and ReadCookie opcodes) the number of
7043** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00007044*/
danielk1977602b4662009-07-02 07:47:33 +00007045void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
danielk1977aef0bf62005-12-30 16:28:01 +00007046 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00007047
drhd677b3d2007-08-20 22:48:41 +00007048 sqlite3BtreeEnter(p);
danielk1977602b4662009-07-02 07:47:33 +00007049 assert( p->inTrans>TRANS_NONE );
danielk1977e0d9e6f2009-07-03 16:25:06 +00007050 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
danielk1977602b4662009-07-02 07:47:33 +00007051 assert( pBt->pPage1 );
drh23e11ca2004-05-04 17:27:28 +00007052 assert( idx>=0 && idx<=15 );
danielk1977ea897302008-09-19 15:10:58 +00007053
danielk1977602b4662009-07-02 07:47:33 +00007054 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
drhae157872004-08-14 19:20:09 +00007055
danielk1977602b4662009-07-02 07:47:33 +00007056 /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7057 ** database, mark the database as read-only. */
danielk1977003ba062004-11-04 02:57:33 +00007058#ifdef SQLITE_OMIT_AUTOVACUUM
danielk19770d19f7a2009-06-03 11:25:07 +00007059 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1;
danielk1977003ba062004-11-04 02:57:33 +00007060#endif
drhae157872004-08-14 19:20:09 +00007061
drhd677b3d2007-08-20 22:48:41 +00007062 sqlite3BtreeLeave(p);
drh8b2f49b2001-06-08 00:21:52 +00007063}
7064
7065/*
drh23e11ca2004-05-04 17:27:28 +00007066** Write meta-information back into the database. Meta[0] is
7067** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00007068*/
danielk1977aef0bf62005-12-30 16:28:01 +00007069int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7070 BtShared *pBt = p->pBt;
drh4b70f112004-05-02 21:12:19 +00007071 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00007072 int rc;
drh23e11ca2004-05-04 17:27:28 +00007073 assert( idx>=1 && idx<=15 );
drhd677b3d2007-08-20 22:48:41 +00007074 sqlite3BtreeEnter(p);
drh64022502009-01-09 14:11:04 +00007075 assert( p->inTrans==TRANS_WRITE );
7076 assert( pBt->pPage1!=0 );
7077 pP1 = pBt->pPage1->aData;
7078 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7079 if( rc==SQLITE_OK ){
7080 put4byte(&pP1[36 + idx*4], iMeta);
danielk19774152e672007-09-12 17:01:45 +00007081#ifndef SQLITE_OMIT_AUTOVACUUM
danielk19770d19f7a2009-06-03 11:25:07 +00007082 if( idx==BTREE_INCR_VACUUM ){
drh64022502009-01-09 14:11:04 +00007083 assert( pBt->autoVacuum || iMeta==0 );
7084 assert( iMeta==0 || iMeta==1 );
7085 pBt->incrVacuum = (u8)iMeta;
drhd677b3d2007-08-20 22:48:41 +00007086 }
drh64022502009-01-09 14:11:04 +00007087#endif
drh5df72a52002-06-06 23:16:05 +00007088 }
drhd677b3d2007-08-20 22:48:41 +00007089 sqlite3BtreeLeave(p);
7090 return rc;
drh8b2f49b2001-06-08 00:21:52 +00007091}
drh8c42ca92001-06-22 19:15:00 +00007092
danielk1977a5533162009-02-24 10:01:51 +00007093#ifndef SQLITE_OMIT_BTREECOUNT
7094/*
7095** The first argument, pCur, is a cursor opened on some b-tree. Count the
7096** number of entries in the b-tree and write the result to *pnEntry.
7097**
7098** SQLITE_OK is returned if the operation is successfully executed.
7099** Otherwise, if an error is encountered (i.e. an IO error or database
7100** corruption) an SQLite error code is returned.
7101*/
7102int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7103 i64 nEntry = 0; /* Value to return in *pnEntry */
7104 int rc; /* Return code */
7105 rc = moveToRoot(pCur);
7106
7107 /* Unless an error occurs, the following loop runs one iteration for each
7108 ** page in the B-Tree structure (not including overflow pages).
7109 */
7110 while( rc==SQLITE_OK ){
7111 int iIdx; /* Index of child node in parent */
7112 MemPage *pPage; /* Current page of the b-tree */
7113
7114 /* If this is a leaf page or the tree is not an int-key tree, then
7115 ** this page contains countable entries. Increment the entry counter
7116 ** accordingly.
7117 */
7118 pPage = pCur->apPage[pCur->iPage];
7119 if( pPage->leaf || !pPage->intKey ){
7120 nEntry += pPage->nCell;
7121 }
7122
7123 /* pPage is a leaf node. This loop navigates the cursor so that it
7124 ** points to the first interior cell that it points to the parent of
7125 ** the next page in the tree that has not yet been visited. The
7126 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7127 ** of the page, or to the number of cells in the page if the next page
7128 ** to visit is the right-child of its parent.
7129 **
7130 ** If all pages in the tree have been visited, return SQLITE_OK to the
7131 ** caller.
7132 */
7133 if( pPage->leaf ){
7134 do {
7135 if( pCur->iPage==0 ){
7136 /* All pages of the b-tree have been visited. Return successfully. */
7137 *pnEntry = nEntry;
7138 return SQLITE_OK;
7139 }
danielk197730548662009-07-09 05:07:37 +00007140 moveToParent(pCur);
danielk1977a5533162009-02-24 10:01:51 +00007141 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7142
7143 pCur->aiIdx[pCur->iPage]++;
7144 pPage = pCur->apPage[pCur->iPage];
7145 }
7146
7147 /* Descend to the child node of the cell that the cursor currently
7148 ** points at. This is the right-child if (iIdx==pPage->nCell).
7149 */
7150 iIdx = pCur->aiIdx[pCur->iPage];
7151 if( iIdx==pPage->nCell ){
7152 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7153 }else{
7154 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7155 }
7156 }
7157
shanebe217792009-03-05 04:20:31 +00007158 /* An error has occurred. Return an error code. */
danielk1977a5533162009-02-24 10:01:51 +00007159 return rc;
7160}
7161#endif
drhdd793422001-06-28 01:54:48 +00007162
drhdd793422001-06-28 01:54:48 +00007163/*
drh5eddca62001-06-30 21:53:53 +00007164** Return the pager associated with a BTree. This routine is used for
7165** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00007166*/
danielk1977aef0bf62005-12-30 16:28:01 +00007167Pager *sqlite3BtreePager(Btree *p){
7168 return p->pBt->pPager;
drhdd793422001-06-28 01:54:48 +00007169}
drh5eddca62001-06-30 21:53:53 +00007170
drhb7f91642004-10-31 02:22:47 +00007171#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00007172/*
7173** Append a message to the error message string.
7174*/
drh2e38c322004-09-03 18:38:44 +00007175static void checkAppendMsg(
7176 IntegrityCk *pCheck,
7177 char *zMsg1,
7178 const char *zFormat,
7179 ...
7180){
7181 va_list ap;
drh1dcdbc02007-01-27 02:24:54 +00007182 if( !pCheck->mxErr ) return;
7183 pCheck->mxErr--;
7184 pCheck->nErr++;
drh2e38c322004-09-03 18:38:44 +00007185 va_start(ap, zFormat);
drhf089aa42008-07-08 19:34:06 +00007186 if( pCheck->errMsg.nChar ){
7187 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
drh5eddca62001-06-30 21:53:53 +00007188 }
drhf089aa42008-07-08 19:34:06 +00007189 if( zMsg1 ){
7190 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7191 }
7192 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7193 va_end(ap);
drhc890fec2008-08-01 20:10:08 +00007194 if( pCheck->errMsg.mallocFailed ){
7195 pCheck->mallocFailed = 1;
7196 }
drh5eddca62001-06-30 21:53:53 +00007197}
drhb7f91642004-10-31 02:22:47 +00007198#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00007199
drhb7f91642004-10-31 02:22:47 +00007200#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00007201/*
7202** Add 1 to the reference count for page iPage. If this is the second
7203** reference to the page, add an error message to pCheck->zErrMsg.
7204** Return 1 if there are 2 ore more references to the page and 0 if
7205** if this is the first reference to the page.
7206**
7207** Also check that the page number is in bounds.
7208*/
danielk197789d40042008-11-17 14:20:56 +00007209static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00007210 if( iPage==0 ) return 1;
danielk197789d40042008-11-17 14:20:56 +00007211 if( iPage>pCheck->nPage ){
drh2e38c322004-09-03 18:38:44 +00007212 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
drh5eddca62001-06-30 21:53:53 +00007213 return 1;
7214 }
7215 if( pCheck->anRef[iPage]==1 ){
drh2e38c322004-09-03 18:38:44 +00007216 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00007217 return 1;
7218 }
7219 return (pCheck->anRef[iPage]++)>1;
7220}
7221
danielk1977afcdd022004-10-31 16:25:42 +00007222#ifndef SQLITE_OMIT_AUTOVACUUM
7223/*
7224** Check that the entry in the pointer-map for page iChild maps to
7225** page iParent, pointer type ptrType. If not, append an error message
7226** to pCheck.
7227*/
7228static void checkPtrmap(
7229 IntegrityCk *pCheck, /* Integrity check context */
7230 Pgno iChild, /* Child page number */
7231 u8 eType, /* Expected pointer map type */
7232 Pgno iParent, /* Expected pointer map parent page number */
7233 char *zContext /* Context description (used for error msg) */
7234){
7235 int rc;
7236 u8 ePtrmapType;
7237 Pgno iPtrmapParent;
7238
7239 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7240 if( rc!=SQLITE_OK ){
drhb56cd552009-05-01 13:16:54 +00007241 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
danielk1977afcdd022004-10-31 16:25:42 +00007242 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7243 return;
7244 }
7245
7246 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7247 checkAppendMsg(pCheck, zContext,
7248 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7249 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7250 }
7251}
7252#endif
7253
drh5eddca62001-06-30 21:53:53 +00007254/*
7255** Check the integrity of the freelist or of an overflow page list.
7256** Verify that the number of pages on the list is N.
7257*/
drh30e58752002-03-02 20:41:57 +00007258static void checkList(
7259 IntegrityCk *pCheck, /* Integrity checking context */
7260 int isFreeList, /* True for a freelist. False for overflow page list */
7261 int iPage, /* Page number for first page in the list */
7262 int N, /* Expected number of pages in the list */
7263 char *zContext /* Context for error messages */
7264){
7265 int i;
drh3a4c1412004-05-09 20:40:11 +00007266 int expected = N;
7267 int iFirst = iPage;
drh1dcdbc02007-01-27 02:24:54 +00007268 while( N-- > 0 && pCheck->mxErr ){
danielk19773b8a05f2007-03-19 17:44:26 +00007269 DbPage *pOvflPage;
7270 unsigned char *pOvflData;
drh5eddca62001-06-30 21:53:53 +00007271 if( iPage<1 ){
drh2e38c322004-09-03 18:38:44 +00007272 checkAppendMsg(pCheck, zContext,
7273 "%d of %d pages missing from overflow list starting at %d",
drh3a4c1412004-05-09 20:40:11 +00007274 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00007275 break;
7276 }
7277 if( checkRef(pCheck, iPage, zContext) ) break;
danielk19773b8a05f2007-03-19 17:44:26 +00007278 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
drh2e38c322004-09-03 18:38:44 +00007279 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00007280 break;
7281 }
danielk19773b8a05f2007-03-19 17:44:26 +00007282 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
drh30e58752002-03-02 20:41:57 +00007283 if( isFreeList ){
danielk19773b8a05f2007-03-19 17:44:26 +00007284 int n = get4byte(&pOvflData[4]);
danielk1977687566d2004-11-02 12:56:41 +00007285#ifndef SQLITE_OMIT_AUTOVACUUM
7286 if( pCheck->pBt->autoVacuum ){
7287 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7288 }
7289#endif
drh45b1fac2008-07-04 17:52:42 +00007290 if( n>pCheck->pBt->usableSize/4-2 ){
drh2e38c322004-09-03 18:38:44 +00007291 checkAppendMsg(pCheck, zContext,
7292 "freelist leaf count too big on page %d", iPage);
drhee696e22004-08-30 16:52:17 +00007293 N--;
7294 }else{
7295 for(i=0; i<n; i++){
danielk19773b8a05f2007-03-19 17:44:26 +00007296 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
danielk1977687566d2004-11-02 12:56:41 +00007297#ifndef SQLITE_OMIT_AUTOVACUUM
7298 if( pCheck->pBt->autoVacuum ){
7299 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7300 }
7301#endif
7302 checkRef(pCheck, iFreePage, zContext);
drhee696e22004-08-30 16:52:17 +00007303 }
7304 N -= n;
drh30e58752002-03-02 20:41:57 +00007305 }
drh30e58752002-03-02 20:41:57 +00007306 }
danielk1977afcdd022004-10-31 16:25:42 +00007307#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00007308 else{
7309 /* If this database supports auto-vacuum and iPage is not the last
7310 ** page in this overflow list, check that the pointer-map entry for
7311 ** the following page matches iPage.
7312 */
7313 if( pCheck->pBt->autoVacuum && N>0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00007314 i = get4byte(pOvflData);
danielk1977687566d2004-11-02 12:56:41 +00007315 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7316 }
danielk1977afcdd022004-10-31 16:25:42 +00007317 }
7318#endif
danielk19773b8a05f2007-03-19 17:44:26 +00007319 iPage = get4byte(pOvflData);
7320 sqlite3PagerUnref(pOvflPage);
drh5eddca62001-06-30 21:53:53 +00007321 }
7322}
drhb7f91642004-10-31 02:22:47 +00007323#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00007324
drhb7f91642004-10-31 02:22:47 +00007325#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00007326/*
7327** Do various sanity checks on a single page of a tree. Return
7328** the tree depth. Root pages return 0. Parents of root pages
7329** return 1, and so forth.
7330**
7331** These checks are done:
7332**
7333** 1. Make sure that cells and freeblocks do not overlap
7334** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00007335** NO 2. Make sure cell keys are in order.
7336** NO 3. Make sure no key is less than or equal to zLowerBound.
7337** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00007338** 5. Check the integrity of overflow pages.
7339** 6. Recursively call checkTreePage on all children.
7340** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00007341** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00007342** the root of the tree.
7343*/
7344static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00007345 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00007346 int iPage, /* Page number of the page to check */
drh74161702006-02-24 02:53:49 +00007347 char *zParentContext /* Parent context */
drh5eddca62001-06-30 21:53:53 +00007348){
7349 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00007350 int i, rc, depth, d2, pgno, cnt;
drh43605152004-05-29 21:46:49 +00007351 int hdr, cellStart;
7352 int nCell;
drhda200cc2004-05-09 11:51:38 +00007353 u8 *data;
danielk1977aef0bf62005-12-30 16:28:01 +00007354 BtShared *pBt;
drh4f26bb62005-09-08 14:17:20 +00007355 int usableSize;
drh5eddca62001-06-30 21:53:53 +00007356 char zContext[100];
shane0af3f892008-11-12 04:55:34 +00007357 char *hit = 0;
drh5eddca62001-06-30 21:53:53 +00007358
drh5bb3eb92007-05-04 13:15:55 +00007359 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
danielk1977ef73ee92004-11-06 12:26:07 +00007360
drh5eddca62001-06-30 21:53:53 +00007361 /* Check that the page exists
7362 */
drhd9cb6ac2005-10-20 07:28:17 +00007363 pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00007364 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00007365 if( iPage==0 ) return 0;
7366 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
danielk197730548662009-07-09 05:07:37 +00007367 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
drh2e38c322004-09-03 18:38:44 +00007368 checkAppendMsg(pCheck, zContext,
7369 "unable to get the page. error code=%d", rc);
drh5eddca62001-06-30 21:53:53 +00007370 return 0;
7371 }
danielk197793caf5a2009-07-11 06:55:33 +00007372
7373 /* Clear MemPage.isInit to make sure the corruption detection code in
7374 ** btreeInitPage() is executed. */
7375 pPage->isInit = 0;
danielk197730548662009-07-09 05:07:37 +00007376 if( (rc = btreeInitPage(pPage))!=0 ){
drh64022502009-01-09 14:11:04 +00007377 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
drh16a9b832007-05-05 18:39:25 +00007378 checkAppendMsg(pCheck, zContext,
danielk197730548662009-07-09 05:07:37 +00007379 "btreeInitPage() returns error code %d", rc);
drh91025292004-05-03 19:49:32 +00007380 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00007381 return 0;
7382 }
7383
7384 /* Check out all the cells.
7385 */
7386 depth = 0;
drh1dcdbc02007-01-27 02:24:54 +00007387 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
drh6f11bef2004-05-13 01:12:56 +00007388 u8 *pCell;
danielk197789d40042008-11-17 14:20:56 +00007389 u32 sz;
drh6f11bef2004-05-13 01:12:56 +00007390 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00007391
7392 /* Check payload overflow pages
7393 */
drh5bb3eb92007-05-04 13:15:55 +00007394 sqlite3_snprintf(sizeof(zContext), zContext,
7395 "On tree page %d cell %d: ", iPage, i);
danielk19771cc5ed82007-05-16 17:28:43 +00007396 pCell = findCell(pPage,i);
danielk197730548662009-07-09 05:07:37 +00007397 btreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00007398 sz = info.nData;
drhf49661a2008-12-10 16:45:50 +00007399 if( !pPage->intKey ) sz += (int)info.nKey;
drh72365832007-03-06 15:53:44 +00007400 assert( sz==info.nPayload );
danielk19775be31f52009-03-30 13:53:43 +00007401 if( (sz>info.nLocal)
7402 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7403 ){
drhb6f41482004-05-14 01:58:11 +00007404 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
danielk1977afcdd022004-10-31 16:25:42 +00007405 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7406#ifndef SQLITE_OMIT_AUTOVACUUM
7407 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00007408 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
danielk1977afcdd022004-10-31 16:25:42 +00007409 }
7410#endif
7411 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00007412 }
7413
7414 /* Check sanity of left child page.
7415 */
drhda200cc2004-05-09 11:51:38 +00007416 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00007417 pgno = get4byte(pCell);
danielk1977afcdd022004-10-31 16:25:42 +00007418#ifndef SQLITE_OMIT_AUTOVACUUM
7419 if( pBt->autoVacuum ){
7420 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7421 }
7422#endif
danielk197762c14b32008-11-19 09:05:26 +00007423 d2 = checkTreePage(pCheck, pgno, zContext);
drhda200cc2004-05-09 11:51:38 +00007424 if( i>0 && d2!=depth ){
7425 checkAppendMsg(pCheck, zContext, "Child page depth differs");
7426 }
7427 depth = d2;
drh5eddca62001-06-30 21:53:53 +00007428 }
drh5eddca62001-06-30 21:53:53 +00007429 }
drhda200cc2004-05-09 11:51:38 +00007430 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00007431 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh5bb3eb92007-05-04 13:15:55 +00007432 sqlite3_snprintf(sizeof(zContext), zContext,
7433 "On page %d at right child: ", iPage);
danielk1977afcdd022004-10-31 16:25:42 +00007434#ifndef SQLITE_OMIT_AUTOVACUUM
7435 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00007436 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
danielk1977afcdd022004-10-31 16:25:42 +00007437 }
7438#endif
danielk197762c14b32008-11-19 09:05:26 +00007439 checkTreePage(pCheck, pgno, zContext);
drhda200cc2004-05-09 11:51:38 +00007440 }
drh5eddca62001-06-30 21:53:53 +00007441
7442 /* Check for complete coverage of the page
7443 */
drhda200cc2004-05-09 11:51:38 +00007444 data = pPage->aData;
7445 hdr = pPage->hdrOffset;
drhf7141992008-06-19 00:16:08 +00007446 hit = sqlite3PageMalloc( pBt->pageSize );
drhc890fec2008-08-01 20:10:08 +00007447 if( hit==0 ){
7448 pCheck->mallocFailed = 1;
7449 }else{
shane5780ebd2008-11-11 17:36:30 +00007450 u16 contentOffset = get2byte(&data[hdr+5]);
drhd7c7ecd2009-07-14 17:48:06 +00007451 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
shane5780ebd2008-11-11 17:36:30 +00007452 memset(hit+contentOffset, 0, usableSize-contentOffset);
7453 memset(hit, 1, contentOffset);
drh2e38c322004-09-03 18:38:44 +00007454 nCell = get2byte(&data[hdr+3]);
7455 cellStart = hdr + 12 - 4*pPage->leaf;
7456 for(i=0; i<nCell; i++){
7457 int pc = get2byte(&data[cellStart+i*2]);
danielk1977daca5432008-08-25 11:57:16 +00007458 u16 size = 1024;
drh2e38c322004-09-03 18:38:44 +00007459 int j;
drh8c2bbb62009-07-10 02:52:20 +00007460 if( pc<=usableSize-4 ){
danielk1977daca5432008-08-25 11:57:16 +00007461 size = cellSizePtr(pPage, &data[pc]);
7462 }
drhd7c7ecd2009-07-14 17:48:06 +00007463 if( (pc+size-1)>=usableSize ){
danielk19777701e812005-01-10 12:59:51 +00007464 checkAppendMsg(pCheck, 0,
7465 "Corruption detected in cell %d on page %d",i,iPage,0);
7466 }else{
7467 for(j=pc+size-1; j>=pc; j--) hit[j]++;
7468 }
drh2e38c322004-09-03 18:38:44 +00007469 }
drh8c2bbb62009-07-10 02:52:20 +00007470 i = get2byte(&data[hdr+1]);
7471 while( i>0 ){
7472 int size, j;
7473 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */
7474 size = get2byte(&data[i+2]);
7475 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */
7476 for(j=i+size-1; j>=i; j--) hit[j]++;
7477 j = get2byte(&data[i]);
7478 assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */
7479 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */
7480 i = j;
drh2e38c322004-09-03 18:38:44 +00007481 }
7482 for(i=cnt=0; i<usableSize; i++){
7483 if( hit[i]==0 ){
7484 cnt++;
7485 }else if( hit[i]>1 ){
7486 checkAppendMsg(pCheck, 0,
7487 "Multiple uses for byte %d of page %d", i, iPage);
7488 break;
7489 }
7490 }
7491 if( cnt!=data[hdr+7] ){
7492 checkAppendMsg(pCheck, 0,
drh8c2bbb62009-07-10 02:52:20 +00007493 "Fragmentation of %d bytes reported as %d on page %d",
drh2e38c322004-09-03 18:38:44 +00007494 cnt, data[hdr+7], iPage);
drh5eddca62001-06-30 21:53:53 +00007495 }
7496 }
drh8c2bbb62009-07-10 02:52:20 +00007497 sqlite3PageFree(hit);
drh4b70f112004-05-02 21:12:19 +00007498 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00007499 return depth+1;
drh5eddca62001-06-30 21:53:53 +00007500}
drhb7f91642004-10-31 02:22:47 +00007501#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00007502
drhb7f91642004-10-31 02:22:47 +00007503#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00007504/*
7505** This routine does a complete check of the given BTree file. aRoot[] is
7506** an array of pages numbers were each page number is the root page of
7507** a table. nRoot is the number of entries in aRoot.
7508**
danielk19773509a652009-07-06 18:56:13 +00007509** A read-only or read-write transaction must be opened before calling
7510** this function.
7511**
drhc890fec2008-08-01 20:10:08 +00007512** Write the number of error seen in *pnErr. Except for some memory
drhe43ba702008-12-05 22:40:08 +00007513** allocation errors, an error message held in memory obtained from
drhc890fec2008-08-01 20:10:08 +00007514** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
drhe43ba702008-12-05 22:40:08 +00007515** returned. If a memory allocation error occurs, NULL is returned.
drh5eddca62001-06-30 21:53:53 +00007516*/
drh1dcdbc02007-01-27 02:24:54 +00007517char *sqlite3BtreeIntegrityCheck(
7518 Btree *p, /* The btree to be checked */
7519 int *aRoot, /* An array of root pages numbers for individual trees */
7520 int nRoot, /* Number of entries in aRoot[] */
7521 int mxErr, /* Stop reporting errors after this many */
7522 int *pnErr /* Write number of errors seen to this variable */
7523){
danielk197789d40042008-11-17 14:20:56 +00007524 Pgno i;
drh5eddca62001-06-30 21:53:53 +00007525 int nRef;
drhaaab5722002-02-19 13:39:21 +00007526 IntegrityCk sCheck;
danielk1977aef0bf62005-12-30 16:28:01 +00007527 BtShared *pBt = p->pBt;
drhf089aa42008-07-08 19:34:06 +00007528 char zErr[100];
drh5eddca62001-06-30 21:53:53 +00007529
drhd677b3d2007-08-20 22:48:41 +00007530 sqlite3BtreeEnter(p);
danielk19773509a652009-07-06 18:56:13 +00007531 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
danielk19773b8a05f2007-03-19 17:44:26 +00007532 nRef = sqlite3PagerRefcount(pBt->pPager);
drh5eddca62001-06-30 21:53:53 +00007533 sCheck.pBt = pBt;
7534 sCheck.pPager = pBt->pPager;
danielk197789d40042008-11-17 14:20:56 +00007535 sCheck.nPage = pagerPagecount(sCheck.pBt);
drh1dcdbc02007-01-27 02:24:54 +00007536 sCheck.mxErr = mxErr;
7537 sCheck.nErr = 0;
drhc890fec2008-08-01 20:10:08 +00007538 sCheck.mallocFailed = 0;
drh1dcdbc02007-01-27 02:24:54 +00007539 *pnErr = 0;
drh0de8c112002-07-06 16:32:14 +00007540 if( sCheck.nPage==0 ){
drhd677b3d2007-08-20 22:48:41 +00007541 sqlite3BtreeLeave(p);
drh0de8c112002-07-06 16:32:14 +00007542 return 0;
7543 }
drhe5ae5732008-06-15 02:51:47 +00007544 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
danielk1977ac245ec2005-01-14 13:50:11 +00007545 if( !sCheck.anRef ){
drh1dcdbc02007-01-27 02:24:54 +00007546 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00007547 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00007548 return 0;
danielk1977ac245ec2005-01-14 13:50:11 +00007549 }
drhda200cc2004-05-09 11:51:38 +00007550 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh42cac6d2004-11-20 20:31:11 +00007551 i = PENDING_BYTE_PAGE(pBt);
drh1f595712004-06-15 01:40:29 +00007552 if( i<=sCheck.nPage ){
7553 sCheck.anRef[i] = 1;
7554 }
drhf089aa42008-07-08 19:34:06 +00007555 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
drh5eddca62001-06-30 21:53:53 +00007556
7557 /* Check the integrity of the freelist
7558 */
drha34b6762004-05-07 13:30:42 +00007559 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7560 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00007561
7562 /* Check all the tables.
7563 */
danielk197789d40042008-11-17 14:20:56 +00007564 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
drh4ff6dfa2002-03-03 23:06:00 +00007565 if( aRoot[i]==0 ) continue;
danielk1977687566d2004-11-02 12:56:41 +00007566#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00007567 if( pBt->autoVacuum && aRoot[i]>1 ){
7568 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7569 }
7570#endif
danielk197762c14b32008-11-19 09:05:26 +00007571 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ");
drh5eddca62001-06-30 21:53:53 +00007572 }
7573
7574 /* Make sure every page in the file is referenced
7575 */
drh1dcdbc02007-01-27 02:24:54 +00007576 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
danielk1977afcdd022004-10-31 16:25:42 +00007577#ifdef SQLITE_OMIT_AUTOVACUUM
drh5eddca62001-06-30 21:53:53 +00007578 if( sCheck.anRef[i]==0 ){
drh2e38c322004-09-03 18:38:44 +00007579 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
drh5eddca62001-06-30 21:53:53 +00007580 }
danielk1977afcdd022004-10-31 16:25:42 +00007581#else
7582 /* If the database supports auto-vacuum, make sure no tables contain
7583 ** references to pointer-map pages.
7584 */
7585 if( sCheck.anRef[i]==0 &&
danielk1977266664d2006-02-10 08:24:21 +00007586 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00007587 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7588 }
7589 if( sCheck.anRef[i]!=0 &&
danielk1977266664d2006-02-10 08:24:21 +00007590 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00007591 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7592 }
7593#endif
drh5eddca62001-06-30 21:53:53 +00007594 }
7595
drh64022502009-01-09 14:11:04 +00007596 /* Make sure this analysis did not leave any unref() pages.
7597 ** This is an internal consistency check; an integrity check
7598 ** of the integrity check.
drh5eddca62001-06-30 21:53:53 +00007599 */
drh64022502009-01-09 14:11:04 +00007600 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
drh2e38c322004-09-03 18:38:44 +00007601 checkAppendMsg(&sCheck, 0,
drh5eddca62001-06-30 21:53:53 +00007602 "Outstanding page count goes from %d to %d during this analysis",
danielk19773b8a05f2007-03-19 17:44:26 +00007603 nRef, sqlite3PagerRefcount(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00007604 );
drh5eddca62001-06-30 21:53:53 +00007605 }
7606
7607 /* Clean up and report errors.
7608 */
drhd677b3d2007-08-20 22:48:41 +00007609 sqlite3BtreeLeave(p);
drh17435752007-08-16 04:30:38 +00007610 sqlite3_free(sCheck.anRef);
drhc890fec2008-08-01 20:10:08 +00007611 if( sCheck.mallocFailed ){
7612 sqlite3StrAccumReset(&sCheck.errMsg);
7613 *pnErr = sCheck.nErr+1;
7614 return 0;
7615 }
drh1dcdbc02007-01-27 02:24:54 +00007616 *pnErr = sCheck.nErr;
drhf089aa42008-07-08 19:34:06 +00007617 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7618 return sqlite3StrAccumFinish(&sCheck.errMsg);
drh5eddca62001-06-30 21:53:53 +00007619}
drhb7f91642004-10-31 02:22:47 +00007620#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
paulb95a8862003-04-01 21:16:41 +00007621
drh73509ee2003-04-06 20:44:45 +00007622/*
7623** Return the full pathname of the underlying database file.
drhd0679ed2007-08-28 22:24:34 +00007624**
7625** The pager filename is invariant as long as the pager is
7626** open so it is safe to access without the BtShared mutex.
drh73509ee2003-04-06 20:44:45 +00007627*/
danielk1977aef0bf62005-12-30 16:28:01 +00007628const char *sqlite3BtreeGetFilename(Btree *p){
7629 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007630 return sqlite3PagerFilename(p->pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00007631}
7632
7633/*
danielk19775865e3d2004-06-14 06:03:57 +00007634** Return the pathname of the journal file for this database. The return
7635** value of this routine is the same regardless of whether the journal file
7636** has been created or not.
drhd0679ed2007-08-28 22:24:34 +00007637**
7638** The pager journal filename is invariant as long as the pager is
7639** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007640*/
danielk1977aef0bf62005-12-30 16:28:01 +00007641const char *sqlite3BtreeGetJournalname(Btree *p){
7642 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007643 return sqlite3PagerJournalname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007644}
7645
danielk19771d850a72004-05-31 08:26:49 +00007646/*
7647** Return non-zero if a transaction is active.
7648*/
danielk1977aef0bf62005-12-30 16:28:01 +00007649int sqlite3BtreeIsInTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007650 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00007651 return (p && (p->inTrans==TRANS_WRITE));
danielk19771d850a72004-05-31 08:26:49 +00007652}
7653
7654/*
danielk19772372c2b2006-06-27 16:34:56 +00007655** Return non-zero if a read (or write) transaction is active.
7656*/
7657int sqlite3BtreeIsInReadTrans(Btree *p){
drh64022502009-01-09 14:11:04 +00007658 assert( p );
drhe5fe6902007-12-07 18:55:28 +00007659 assert( sqlite3_mutex_held(p->db->mutex) );
drh64022502009-01-09 14:11:04 +00007660 return p->inTrans!=TRANS_NONE;
danielk19772372c2b2006-06-27 16:34:56 +00007661}
7662
danielk197704103022009-02-03 16:51:24 +00007663int sqlite3BtreeIsInBackup(Btree *p){
7664 assert( p );
7665 assert( sqlite3_mutex_held(p->db->mutex) );
7666 return p->nBackup!=0;
7667}
7668
danielk19772372c2b2006-06-27 16:34:56 +00007669/*
danielk1977da184232006-01-05 11:34:32 +00007670** This function returns a pointer to a blob of memory associated with
drh85b623f2007-12-13 21:54:09 +00007671** a single shared-btree. The memory is used by client code for its own
danielk1977da184232006-01-05 11:34:32 +00007672** purposes (for example, to store a high-level schema associated with
7673** the shared-btree). The btree layer manages reference counting issues.
7674**
7675** The first time this is called on a shared-btree, nBytes bytes of memory
7676** are allocated, zeroed, and returned to the caller. For each subsequent
7677** call the nBytes parameter is ignored and a pointer to the same blob
7678** of memory returned.
7679**
danielk1977171bfed2008-06-23 09:50:50 +00007680** If the nBytes parameter is 0 and the blob of memory has not yet been
7681** allocated, a null pointer is returned. If the blob has already been
7682** allocated, it is returned as normal.
7683**
danielk1977da184232006-01-05 11:34:32 +00007684** Just before the shared-btree is closed, the function passed as the
7685** xFree argument when the memory allocation was made is invoked on the
drh17435752007-08-16 04:30:38 +00007686** blob of allocated memory. This function should not call sqlite3_free()
danielk1977da184232006-01-05 11:34:32 +00007687** on the memory, the btree layer does that.
7688*/
7689void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7690 BtShared *pBt = p->pBt;
drh27641702007-08-22 02:56:42 +00007691 sqlite3BtreeEnter(p);
danielk1977171bfed2008-06-23 09:50:50 +00007692 if( !pBt->pSchema && nBytes ){
drh17435752007-08-16 04:30:38 +00007693 pBt->pSchema = sqlite3MallocZero(nBytes);
danielk1977da184232006-01-05 11:34:32 +00007694 pBt->xFreeSchema = xFree;
7695 }
drh27641702007-08-22 02:56:42 +00007696 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00007697 return pBt->pSchema;
7698}
7699
danielk1977c87d34d2006-01-06 13:00:28 +00007700/*
danielk1977404ca072009-03-16 13:19:36 +00007701** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
7702** btree as the argument handle holds an exclusive lock on the
7703** sqlite_master table. Otherwise SQLITE_OK.
danielk1977c87d34d2006-01-06 13:00:28 +00007704*/
7705int sqlite3BtreeSchemaLocked(Btree *p){
drh27641702007-08-22 02:56:42 +00007706 int rc;
drhe5fe6902007-12-07 18:55:28 +00007707 assert( sqlite3_mutex_held(p->db->mutex) );
drh27641702007-08-22 02:56:42 +00007708 sqlite3BtreeEnter(p);
danielk1977404ca072009-03-16 13:19:36 +00007709 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
7710 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
drh27641702007-08-22 02:56:42 +00007711 sqlite3BtreeLeave(p);
7712 return rc;
danielk1977c87d34d2006-01-06 13:00:28 +00007713}
7714
drha154dcd2006-03-22 22:10:07 +00007715
7716#ifndef SQLITE_OMIT_SHARED_CACHE
7717/*
7718** Obtain a lock on the table whose root page is iTab. The
7719** lock is a write lock if isWritelock is true or a read lock
7720** if it is false.
7721*/
danielk1977c00da102006-01-07 13:21:04 +00007722int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
danielk19772e94d4d2006-01-09 05:36:27 +00007723 int rc = SQLITE_OK;
danielk1977602b4662009-07-02 07:47:33 +00007724 assert( p->inTrans!=TRANS_NONE );
drh6a9ad3d2008-04-02 16:29:30 +00007725 if( p->sharable ){
7726 u8 lockType = READ_LOCK + isWriteLock;
7727 assert( READ_LOCK+1==WRITE_LOCK );
7728 assert( isWriteLock==0 || isWriteLock==1 );
danielk1977602b4662009-07-02 07:47:33 +00007729
drh6a9ad3d2008-04-02 16:29:30 +00007730 sqlite3BtreeEnter(p);
drhc25eabe2009-02-24 18:57:31 +00007731 rc = querySharedCacheTableLock(p, iTab, lockType);
drh6a9ad3d2008-04-02 16:29:30 +00007732 if( rc==SQLITE_OK ){
drhc25eabe2009-02-24 18:57:31 +00007733 rc = setSharedCacheTableLock(p, iTab, lockType);
drh6a9ad3d2008-04-02 16:29:30 +00007734 }
7735 sqlite3BtreeLeave(p);
danielk1977c00da102006-01-07 13:21:04 +00007736 }
7737 return rc;
7738}
drha154dcd2006-03-22 22:10:07 +00007739#endif
danielk1977b82e7ed2006-01-11 14:09:31 +00007740
danielk1977b4e9af92007-05-01 17:49:49 +00007741#ifndef SQLITE_OMIT_INCRBLOB
7742/*
7743** Argument pCsr must be a cursor opened for writing on an
7744** INTKEY table currently pointing at a valid table entry.
7745** This function modifies the data stored as part of that entry.
danielk1977ecaecf92009-07-08 08:05:35 +00007746**
7747** Only the data content may only be modified, it is not possible to
7748** change the length of the data stored. If this function is called with
7749** parameters that attempt to write past the end of the existing data,
7750** no modifications are made and SQLITE_CORRUPT is returned.
danielk1977b4e9af92007-05-01 17:49:49 +00007751*/
danielk1977dcbb5d32007-05-04 18:36:44 +00007752int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
danielk1977c9000e62009-07-08 13:55:28 +00007753 int rc;
drh1fee73e2007-08-29 04:00:57 +00007754 assert( cursorHoldsMutex(pCsr) );
drhe5fe6902007-12-07 18:55:28 +00007755 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
danielk197796d48e92009-06-29 06:00:37 +00007756 assert( pCsr->isIncrblobHandle );
danielk19773588ceb2008-06-10 17:30:26 +00007757
danielk1977c9000e62009-07-08 13:55:28 +00007758 rc = restoreCursorPosition(pCsr);
7759 if( rc!=SQLITE_OK ){
7760 return rc;
7761 }
danielk19773588ceb2008-06-10 17:30:26 +00007762 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7763 if( pCsr->eState!=CURSOR_VALID ){
7764 return SQLITE_ABORT;
danielk1977dcbb5d32007-05-04 18:36:44 +00007765 }
7766
danielk1977c9000e62009-07-08 13:55:28 +00007767 /* Check some assumptions:
danielk1977dcbb5d32007-05-04 18:36:44 +00007768 ** (a) the cursor is open for writing,
danielk1977c9000e62009-07-08 13:55:28 +00007769 ** (b) there is a read/write transaction open,
7770 ** (c) the connection holds a write-lock on the table (if required),
7771 ** (d) there are no conflicting read-locks, and
7772 ** (e) the cursor points at a valid row of an intKey table.
danielk1977d04417962007-05-02 13:16:30 +00007773 */
danielk19774f029602009-07-08 18:45:37 +00007774 if( !pCsr->wrFlag ){
7775 return SQLITE_READONLY;
7776 }
danielk197796d48e92009-06-29 06:00:37 +00007777 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE );
7778 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
7779 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
danielk1977c9000e62009-07-08 13:55:28 +00007780 assert( pCsr->apPage[pCsr->iPage]->intKey );
danielk1977b4e9af92007-05-01 17:49:49 +00007781
drhfb192682009-07-11 18:26:28 +00007782 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
danielk1977b4e9af92007-05-01 17:49:49 +00007783}
danielk19772dec9702007-05-02 16:48:37 +00007784
7785/*
7786** Set a flag on this cursor to cache the locations of pages from the
danielk1977da107192007-05-04 08:32:13 +00007787** overflow list for the current row. This is used by cursors opened
7788** for incremental blob IO only.
7789**
7790** This function sets a flag only. The actual page location cache
7791** (stored in BtCursor.aOverflow[]) is allocated and used by function
7792** accessPayload() (the worker function for sqlite3BtreeData() and
7793** sqlite3BtreePutData()).
danielk19772dec9702007-05-02 16:48:37 +00007794*/
7795void sqlite3BtreeCacheOverflow(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00007796 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00007797 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007798 assert(!pCur->isIncrblobHandle);
danielk19772dec9702007-05-02 16:48:37 +00007799 assert(!pCur->aOverflow);
danielk1977dcbb5d32007-05-04 18:36:44 +00007800 pCur->isIncrblobHandle = 1;
danielk19772dec9702007-05-02 16:48:37 +00007801}
danielk1977b4e9af92007-05-01 17:49:49 +00007802#endif