blob: 6ae18386b1c683c243abcdc6864c4e8254c28648 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
drhdfe88ec2008-11-03 20:55:06 +000012** $Id: btree.c,v 1.527 2008/11/03 20:55:07 drh Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
** This file implements an external (disk-based) database using BTrees.
drha3152892007-05-05 11:48:52 +000015** See the header comment on "btreeInt.h" for additional information.
16** Including a description of file format and an overview of operation.
drha059ad02001-04-17 20:09:11 +000017*/
drha3152892007-05-05 11:48:52 +000018#include "btreeInt.h"
paulb95a8862003-04-01 21:16:41 +000019
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself is supplied by the
** SQLITE_FILE_HEADER macro, which is not defined in this part of
** the file.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
drh08ed44e2001-04-29 23:32:55 +000025
/*
** Tracing support.
**
** As written (with "#if 0") the TRACE() macro expands to nothing and
** the sqlite3BtreeTrace variable is not compiled.  If the "#if 0" is
** changed to "#if 1", TRACE(X) prints X using printf() and flushes
** stdout whenever the sqlite3BtreeTrace global is non-zero.
**
** X must be a complete, parenthesized printf() argument list, e.g.
**
**     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
drh615ae552005-01-16 23:21:00 +000036
/*
** Sometimes we need a small amount of code such as a variable initialization
** to setup for a later assert() statement.  We do not want this code to
** appear when assert() is disabled.  The following macro is therefore
** used to contain that setup code.  The "VVA" acronym stands for
** "Verification, Validation, and Accreditation".  In other words, the
** code within VVA_ONLY() will only run during verification processes.
**
** When NDEBUG is not defined, VVA_ONLY(X) expands to X.  When NDEBUG is
** defined, VVA_ONLY(X) expands to nothing.
*/
#ifndef NDEBUG
# define VVA_ONLY(X)  X
#else
# define VVA_ONLY(X)
#endif
50
drh86f8c192007-08-22 00:39:19 +000051
52
#ifndef SQLITE_OMIT_SHARED_CACHE
/*
** A list of BtShared objects that are eligible for participation
** in shared cache.  This variable has file scope during normal builds,
** but the test harness needs to access it so we make it global for
** test builds (that is, when SQLITE_TEST is defined).
**
** The list starts out empty (a null pointer).
*/
#ifdef SQLITE_TEST
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
#endif /* SQLITE_OMIT_SHARED_CACHE */
66
67#ifndef SQLITE_OMIT_SHARED_CACHE
68/*
69** Enable or disable the shared pager and schema features.
70**
71** This routine has no effect on existing database connections.
72** The shared cache setting effects only future calls to
73** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
74*/
75int sqlite3_enable_shared_cache(int enable){
danielk1977502b4e02008-09-02 14:07:24 +000076 sqlite3GlobalConfig.sharedCacheEnabled = enable;
drhe53831d2007-08-17 01:14:38 +000077 return SQLITE_OK;
78}
79#endif
80
drhd677b3d2007-08-20 22:48:41 +000081
/*
** Forward declaration, so that checkReadLocks() can be referenced
** before the point at which it is defined.
*/
static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
drh66cbd152004-09-01 16:12:25 +000086
danielk1977aef0bf62005-12-30 16:28:01 +000087
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions queryTableLock(), lockTable() and unlockAllTables()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary.
  ** So define the lock related functions as no-ops.
  **
  ** The two lock routines evaluate to SQLITE_OK so that callers can
  ** still test their "return value"; unlockAllTables() expands to
  ** nothing.  None of the macro arguments are evaluated.
  */
  #define queryTableLock(a,b,c) SQLITE_OK
  #define lockTable(a,b,c) SQLITE_OK
  #define unlockAllTables(a)
#endif
danielk1977aef0bf62005-12-30 16:28:01 +0000101
drhe53831d2007-08-17 01:14:38 +0000102#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977da184232006-01-05 11:34:32 +0000103/*
danielk1977aef0bf62005-12-30 16:28:01 +0000104** Query to see if btree handle p may obtain a lock of type eLock
105** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
106** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
danielk1977c87d34d2006-01-06 13:00:28 +0000107** SQLITE_LOCKED if not.
danielk1977aef0bf62005-12-30 16:28:01 +0000108*/
109static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
110 BtShared *pBt = p->pBt;
111 BtLock *pIter;
112
drh1fee73e2007-08-29 04:00:57 +0000113 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000114 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
115 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000116
danielk1977da184232006-01-05 11:34:32 +0000117 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000118 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000119 return SQLITE_OK;
120 }
121
danielk1977641b0f42007-12-21 04:47:25 +0000122 /* If some other connection is holding an exclusive lock, the
123 ** requested lock may not be obtained.
124 */
125 if( pBt->pExclusive && pBt->pExclusive!=p ){
126 return SQLITE_LOCKED;
127 }
128
danielk1977da184232006-01-05 11:34:32 +0000129 /* This (along with lockTable()) is where the ReadUncommitted flag is
130 ** dealt with. If the caller is querying for a read-lock and the flag is
131 ** set, it is unconditionally granted - even if there are write-locks
132 ** on the table. If a write-lock is requested, the ReadUncommitted flag
133 ** is not considered.
134 **
135 ** In function lockTable(), if a read-lock is demanded and the
136 ** ReadUncommitted flag is set, no entry is added to the locks list
137 ** (BtShared.pLock).
138 **
139 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
140 ** not create or respect table locks. The locking procedure for a
141 ** write-cursor does not change.
142 */
143 if(
drhe5fe6902007-12-07 18:55:28 +0000144 0==(p->db->flags&SQLITE_ReadUncommitted) ||
danielk1977da184232006-01-05 11:34:32 +0000145 eLock==WRITE_LOCK ||
drh47ded162006-01-06 01:42:58 +0000146 iTab==MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000147 ){
148 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
149 if( pIter->pBtree!=p && pIter->iTable==iTab &&
150 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
danielk1977c87d34d2006-01-06 13:00:28 +0000151 return SQLITE_LOCKED;
danielk1977da184232006-01-05 11:34:32 +0000152 }
danielk1977aef0bf62005-12-30 16:28:01 +0000153 }
154 }
155 return SQLITE_OK;
156}
drhe53831d2007-08-17 01:14:38 +0000157#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000158
drhe53831d2007-08-17 01:14:38 +0000159#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000160/*
161** Add a lock on the table with root-page iTable to the shared-btree used
162** by Btree handle p. Parameter eLock must be either READ_LOCK or
163** WRITE_LOCK.
164**
165** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
166** SQLITE_NOMEM may also be returned.
167*/
168static int lockTable(Btree *p, Pgno iTable, u8 eLock){
169 BtShared *pBt = p->pBt;
170 BtLock *pLock = 0;
171 BtLock *pIter;
172
drh1fee73e2007-08-29 04:00:57 +0000173 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000174 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
175 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000176
danielk1977da184232006-01-05 11:34:32 +0000177 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000178 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000179 return SQLITE_OK;
180 }
181
danielk1977aef0bf62005-12-30 16:28:01 +0000182 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
183
danielk1977da184232006-01-05 11:34:32 +0000184 /* If the read-uncommitted flag is set and a read-lock is requested,
185 ** return early without adding an entry to the BtShared.pLock list. See
186 ** comment in function queryTableLock() for more info on handling
187 ** the ReadUncommitted flag.
188 */
189 if(
drhe5fe6902007-12-07 18:55:28 +0000190 (p->db->flags&SQLITE_ReadUncommitted) &&
danielk1977da184232006-01-05 11:34:32 +0000191 (eLock==READ_LOCK) &&
drh47ded162006-01-06 01:42:58 +0000192 iTable!=MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000193 ){
194 return SQLITE_OK;
195 }
196
danielk1977aef0bf62005-12-30 16:28:01 +0000197 /* First search the list for an existing lock on this table. */
198 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
199 if( pIter->iTable==iTable && pIter->pBtree==p ){
200 pLock = pIter;
201 break;
202 }
203 }
204
205 /* If the above search did not find a BtLock struct associating Btree p
206 ** with table iTable, allocate one and link it into the list.
207 */
208 if( !pLock ){
drh17435752007-08-16 04:30:38 +0000209 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
danielk1977aef0bf62005-12-30 16:28:01 +0000210 if( !pLock ){
211 return SQLITE_NOMEM;
212 }
213 pLock->iTable = iTable;
214 pLock->pBtree = p;
215 pLock->pNext = pBt->pLock;
216 pBt->pLock = pLock;
217 }
218
219 /* Set the BtLock.eLock variable to the maximum of the current lock
220 ** and the requested lock. This means if a write-lock was already held
221 ** and a read-lock requested, we don't incorrectly downgrade the lock.
222 */
223 assert( WRITE_LOCK>READ_LOCK );
danielk19775118b912005-12-30 16:31:53 +0000224 if( eLock>pLock->eLock ){
225 pLock->eLock = eLock;
226 }
danielk1977aef0bf62005-12-30 16:28:01 +0000227
228 return SQLITE_OK;
229}
drhe53831d2007-08-17 01:14:38 +0000230#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000231
drhe53831d2007-08-17 01:14:38 +0000232#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000233/*
234** Release all the table locks (locks obtained via calls to the lockTable()
235** procedure) held by Btree handle p.
236*/
237static void unlockAllTables(Btree *p){
danielk1977641b0f42007-12-21 04:47:25 +0000238 BtShared *pBt = p->pBt;
239 BtLock **ppIter = &pBt->pLock;
danielk1977da184232006-01-05 11:34:32 +0000240
drh1fee73e2007-08-29 04:00:57 +0000241 assert( sqlite3BtreeHoldsMutex(p) );
drhe53831d2007-08-17 01:14:38 +0000242 assert( p->sharable || 0==*ppIter );
danielk1977da184232006-01-05 11:34:32 +0000243
danielk1977aef0bf62005-12-30 16:28:01 +0000244 while( *ppIter ){
245 BtLock *pLock = *ppIter;
danielk1977641b0f42007-12-21 04:47:25 +0000246 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
danielk1977aef0bf62005-12-30 16:28:01 +0000247 if( pLock->pBtree==p ){
248 *ppIter = pLock->pNext;
drh17435752007-08-16 04:30:38 +0000249 sqlite3_free(pLock);
danielk1977aef0bf62005-12-30 16:28:01 +0000250 }else{
251 ppIter = &pLock->pNext;
252 }
253 }
danielk1977641b0f42007-12-21 04:47:25 +0000254
255 if( pBt->pExclusive==p ){
256 pBt->pExclusive = 0;
257 }
danielk1977aef0bf62005-12-30 16:28:01 +0000258}
259#endif /* SQLITE_OMIT_SHARED_CACHE */
260
static void releasePage(MemPage *pPage);  /* Forward reference: used below before it is defined */
262
drh1fee73e2007-08-29 04:00:57 +0000263/*
264** Verify that the cursor holds a mutex on the BtShared
265*/
266#ifndef NDEBUG
267static int cursorHoldsMutex(BtCursor *p){
drhff0587c2007-08-29 17:43:19 +0000268 return sqlite3_mutex_held(p->pBt->mutex);
drh1fee73e2007-08-29 04:00:57 +0000269}
270#endif
271
272
danielk197792d4d7a2007-05-04 12:05:56 +0000273#ifndef SQLITE_OMIT_INCRBLOB
274/*
275** Invalidate the overflow page-list cache for cursor pCur, if any.
276*/
277static void invalidateOverflowCache(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000278 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000279 sqlite3_free(pCur->aOverflow);
danielk197792d4d7a2007-05-04 12:05:56 +0000280 pCur->aOverflow = 0;
281}
282
283/*
284** Invalidate the overflow page-list cache for all cursors opened
285** on the shared btree structure pBt.
286*/
287static void invalidateAllOverflowCache(BtShared *pBt){
288 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000289 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +0000290 for(p=pBt->pCursor; p; p=p->pNext){
291 invalidateOverflowCache(p);
292 }
293}
#else
  /* When SQLITE_OMIT_INCRBLOB is defined the two routines above are not
  ** compiled; these macros turn calls to them into empty statements.
  */
  #define invalidateOverflowCache(x)
  #define invalidateAllOverflowCache(x)
#endif
298
drh980b1a72006-08-16 16:42:48 +0000299/*
300** Save the current cursor position in the variables BtCursor.nKey
301** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
302*/
303static int saveCursorPosition(BtCursor *pCur){
304 int rc;
305
306 assert( CURSOR_VALID==pCur->eState );
307 assert( 0==pCur->pKey );
drh1fee73e2007-08-29 04:00:57 +0000308 assert( cursorHoldsMutex(pCur) );
drh980b1a72006-08-16 16:42:48 +0000309
310 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
311
312 /* If this is an intKey table, then the above call to BtreeKeySize()
313 ** stores the integer key in pCur->nKey. In this case this value is
314 ** all that is required. Otherwise, if pCur is not open on an intKey
315 ** table, then malloc space for and store the pCur->nKey bytes of key
316 ** data.
317 */
danielk197771d5d2c2008-09-29 11:49:47 +0000318 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
drhe5ae5732008-06-15 02:51:47 +0000319 void *pKey = sqlite3Malloc(pCur->nKey);
drh980b1a72006-08-16 16:42:48 +0000320 if( pKey ){
321 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
322 if( rc==SQLITE_OK ){
323 pCur->pKey = pKey;
324 }else{
drh17435752007-08-16 04:30:38 +0000325 sqlite3_free(pKey);
drh980b1a72006-08-16 16:42:48 +0000326 }
327 }else{
328 rc = SQLITE_NOMEM;
329 }
330 }
danielk197771d5d2c2008-09-29 11:49:47 +0000331 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
drh980b1a72006-08-16 16:42:48 +0000332
333 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +0000334 int i;
335 for(i=0; i<=pCur->iPage; i++){
336 releasePage(pCur->apPage[i]);
337 pCur->apPage[i] = 0;
338 }
339 pCur->iPage = -1;
drh980b1a72006-08-16 16:42:48 +0000340 pCur->eState = CURSOR_REQUIRESEEK;
341 }
342
danielk197792d4d7a2007-05-04 12:05:56 +0000343 invalidateOverflowCache(pCur);
drh980b1a72006-08-16 16:42:48 +0000344 return rc;
345}
346
347/*
348** Save the positions of all cursors except pExcept open on the table
349** with root-page iRoot. Usually, this is called just before cursor
350** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
351*/
352static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
353 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000354 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +0000355 assert( pExcept==0 || pExcept->pBt==pBt );
drh980b1a72006-08-16 16:42:48 +0000356 for(p=pBt->pCursor; p; p=p->pNext){
357 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
358 p->eState==CURSOR_VALID ){
359 int rc = saveCursorPosition(p);
360 if( SQLITE_OK!=rc ){
361 return rc;
362 }
363 }
364 }
365 return SQLITE_OK;
366}
367
368/*
drhbf700f32007-03-31 02:36:44 +0000369** Clear the current cursor position.
370*/
danielk1977be51a652008-10-08 17:58:48 +0000371void sqlite3BtreeClearCursor(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000372 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000373 sqlite3_free(pCur->pKey);
drhbf700f32007-03-31 02:36:44 +0000374 pCur->pKey = 0;
375 pCur->eState = CURSOR_INVALID;
376}
377
378/*
drh980b1a72006-08-16 16:42:48 +0000379** Restore the cursor to the position it was in (or as close to as possible)
380** when saveCursorPosition() was called. Note that this call deletes the
381** saved position info stored by saveCursorPosition(), so there can be
drha3460582008-07-11 21:02:53 +0000382** at most one effective restoreCursorPosition() call after each
drh980b1a72006-08-16 16:42:48 +0000383** saveCursorPosition().
drh980b1a72006-08-16 16:42:48 +0000384*/
drha3460582008-07-11 21:02:53 +0000385int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
drhbf700f32007-03-31 02:36:44 +0000386 int rc;
drh1fee73e2007-08-29 04:00:57 +0000387 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +0000388 assert( pCur->eState>=CURSOR_REQUIRESEEK );
389 if( pCur->eState==CURSOR_FAULT ){
390 return pCur->skip;
391 }
drh980b1a72006-08-16 16:42:48 +0000392 pCur->eState = CURSOR_INVALID;
drhe63d9992008-08-13 19:11:48 +0000393 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
drh980b1a72006-08-16 16:42:48 +0000394 if( rc==SQLITE_OK ){
drh17435752007-08-16 04:30:38 +0000395 sqlite3_free(pCur->pKey);
drh980b1a72006-08-16 16:42:48 +0000396 pCur->pKey = 0;
drhbf700f32007-03-31 02:36:44 +0000397 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
drh980b1a72006-08-16 16:42:48 +0000398 }
399 return rc;
400}
401
/*
** Restore the position of cursor p only if that is necessary.  When
** the cursor state is CURSOR_REQUIRESEEK or greater this evaluates to
** the result of sqlite3BtreeRestoreCursorPosition(p); otherwise it
** evaluates to SQLITE_OK without calling anything.
**
** The argument is parenthesized so that any pointer expression may be
** passed.  It is expanded twice, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         sqlite3BtreeRestoreCursorPosition(p) : \
         SQLITE_OK)
drh980b1a72006-08-16 16:42:48 +0000406
drha3460582008-07-11 21:02:53 +0000407/*
408** Determine whether or not a cursor has moved from the position it
drhdfe88ec2008-11-03 20:55:06 +0000409** was last placed at. Cursors can move when the row they are pointing
drha3460582008-07-11 21:02:53 +0000410** at is deleted out from under them.
411**
412** This routine returns an error code if something goes wrong. The
413** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
414*/
415int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
416 int rc;
417
418 rc = restoreCursorPosition(pCur);
419 if( rc ){
420 *pHasMoved = 1;
421 return rc;
422 }
423 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
424 *pHasMoved = 1;
425 }else{
426 *pHasMoved = 0;
427 }
428 return SQLITE_OK;
429}
430
danielk1977599fcba2004-11-08 07:13:13 +0000431#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977afcdd022004-10-31 16:25:42 +0000432/*
drha3152892007-05-05 11:48:52 +0000433** Given a page number of a regular database page, return the page
434** number for the pointer-map page that contains the entry for the
435** input page number.
danielk1977afcdd022004-10-31 16:25:42 +0000436*/
danielk1977266664d2006-02-10 08:24:21 +0000437static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
drhd677b3d2007-08-20 22:48:41 +0000438 int nPagesPerMapPage, iPtrMap, ret;
drh1fee73e2007-08-29 04:00:57 +0000439 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000440 nPagesPerMapPage = (pBt->usableSize/5)+1;
441 iPtrMap = (pgno-2)/nPagesPerMapPage;
442 ret = (iPtrMap*nPagesPerMapPage) + 2;
danielk1977266664d2006-02-10 08:24:21 +0000443 if( ret==PENDING_BYTE_PAGE(pBt) ){
444 ret++;
445 }
446 return ret;
447}
danielk1977a19df672004-11-03 11:37:07 +0000448
danielk1977afcdd022004-10-31 16:25:42 +0000449/*
danielk1977afcdd022004-10-31 16:25:42 +0000450** Write an entry into the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000451**
452** This routine updates the pointer map entry for page number 'key'
453** so that it maps to type 'eType' and parent page number 'pgno'.
454** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000455*/
danielk1977aef0bf62005-12-30 16:28:01 +0000456static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
danielk19773b8a05f2007-03-19 17:44:26 +0000457 DbPage *pDbPage; /* The pointer map page */
458 u8 *pPtrmap; /* The pointer map data */
459 Pgno iPtrmap; /* The pointer map page number */
460 int offset; /* Offset in pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000461 int rc;
462
drh1fee73e2007-08-29 04:00:57 +0000463 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977266664d2006-02-10 08:24:21 +0000464 /* The master-journal page number must never be used as a pointer map page */
465 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
466
danielk1977ac11ee62005-01-15 12:45:51 +0000467 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +0000468 if( key==0 ){
drh49285702005-09-17 15:20:26 +0000469 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +0000470 }
danielk1977266664d2006-02-10 08:24:21 +0000471 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000472 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977687566d2004-11-02 12:56:41 +0000473 if( rc!=SQLITE_OK ){
danielk1977afcdd022004-10-31 16:25:42 +0000474 return rc;
475 }
danielk19778c666b12008-07-18 09:34:57 +0000476 offset = PTRMAP_PTROFFSET(iPtrmap, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000477 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000478
drh615ae552005-01-16 23:21:00 +0000479 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
480 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
danielk19773b8a05f2007-03-19 17:44:26 +0000481 rc = sqlite3PagerWrite(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000482 if( rc==SQLITE_OK ){
483 pPtrmap[offset] = eType;
484 put4byte(&pPtrmap[offset+1], parent);
danielk1977afcdd022004-10-31 16:25:42 +0000485 }
danielk1977afcdd022004-10-31 16:25:42 +0000486 }
487
danielk19773b8a05f2007-03-19 17:44:26 +0000488 sqlite3PagerUnref(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000489 return rc;
danielk1977afcdd022004-10-31 16:25:42 +0000490}
491
492/*
493** Read an entry from the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000494**
495** This routine retrieves the pointer map entry for page 'key', writing
496** the type and parent page number to *pEType and *pPgno respectively.
497** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000498*/
danielk1977aef0bf62005-12-30 16:28:01 +0000499static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
danielk19773b8a05f2007-03-19 17:44:26 +0000500 DbPage *pDbPage; /* The pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000501 int iPtrmap; /* Pointer map page index */
502 u8 *pPtrmap; /* Pointer map page data */
503 int offset; /* Offset of entry in pointer map */
504 int rc;
505
drh1fee73e2007-08-29 04:00:57 +0000506 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000507
danielk1977266664d2006-02-10 08:24:21 +0000508 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000509 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000510 if( rc!=0 ){
511 return rc;
512 }
danielk19773b8a05f2007-03-19 17:44:26 +0000513 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000514
danielk19778c666b12008-07-18 09:34:57 +0000515 offset = PTRMAP_PTROFFSET(iPtrmap, key);
drh43617e92006-03-06 20:55:46 +0000516 assert( pEType!=0 );
517 *pEType = pPtrmap[offset];
danielk1977687566d2004-11-02 12:56:41 +0000518 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
danielk1977afcdd022004-10-31 16:25:42 +0000519
danielk19773b8a05f2007-03-19 17:44:26 +0000520 sqlite3PagerUnref(pDbPage);
drh49285702005-09-17 15:20:26 +0000521 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
danielk1977afcdd022004-10-31 16:25:42 +0000522 return SQLITE_OK;
523}
524
#else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* With SQLITE_OMIT_AUTOVACUUM defined the pointer-map routines are not
  ** compiled.  Calls to them are replaced by the constant SQLITE_OK; the
  ** macro arguments are not evaluated.
  */
  #define ptrmapPut(w,x,y,z) SQLITE_OK
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvfl(y,z) SQLITE_OK
#endif
danielk1977afcdd022004-10-31 16:25:42 +0000530
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** The two-byte value stored at aData[cellOffset + 2*I] is read with
** get2byte(), masked with the page's maskPage, and used as an offset
** from the start of aData.
**
** This routine works only for pages that do not contain overflow cells.
**
** P appears several times in the expansion, so the arguments should be
** free of side effects.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
drh43605152004-05-29 21:46:49 +0000540
541/*
drh93a960a2008-07-10 00:32:42 +0000542** This a more complex version of findCell() that works for
drh43605152004-05-29 21:46:49 +0000543** pages that do contain overflow cells. See insert
544*/
545static u8 *findOverflowCell(MemPage *pPage, int iCell){
546 int i;
drh1fee73e2007-08-29 04:00:57 +0000547 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +0000548 for(i=pPage->nOverflow-1; i>=0; i--){
drh6d08b4d2004-07-20 12:45:22 +0000549 int k;
550 struct _OvflCell *pOvfl;
551 pOvfl = &pPage->aOvfl[i];
552 k = pOvfl->idx;
553 if( k<=iCell ){
554 if( k==iCell ){
555 return pOvfl->pCell;
drh43605152004-05-29 21:46:49 +0000556 }
557 iCell--;
558 }
559 }
danielk19771cc5ed82007-05-16 17:28:43 +0000560 return findCell(pPage, iCell);
drh43605152004-05-29 21:46:49 +0000561}
562
563/*
564** Parse a cell content block and fill in the CellInfo structure. There
drh16a9b832007-05-05 18:39:25 +0000565** are two versions of this function. sqlite3BtreeParseCell() takes a
566** cell index as the second argument and sqlite3BtreeParseCellPtr()
567** takes a pointer to the body of the cell as its second argument.
danielk19771cc5ed82007-05-16 17:28:43 +0000568**
569** Within this file, the parseCell() macro can be called instead of
570** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
drh43605152004-05-29 21:46:49 +0000571*/
drh16a9b832007-05-05 18:39:25 +0000572void sqlite3BtreeParseCellPtr(
drh3aac2dd2004-04-26 14:10:20 +0000573 MemPage *pPage, /* Page containing the cell */
drh43605152004-05-29 21:46:49 +0000574 u8 *pCell, /* Pointer to the cell text. */
drh6f11bef2004-05-13 01:12:56 +0000575 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000576){
drh271efa52004-05-30 19:19:05 +0000577 int n; /* Number bytes in cell content header */
578 u32 nPayload; /* Number of bytes of cell payload */
drh43605152004-05-29 21:46:49 +0000579
drh1fee73e2007-08-29 04:00:57 +0000580 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000581
drh43605152004-05-29 21:46:49 +0000582 pInfo->pCell = pCell;
drhab01f612004-05-22 02:55:23 +0000583 assert( pPage->leaf==0 || pPage->leaf==1 );
drh271efa52004-05-30 19:19:05 +0000584 n = pPage->childPtrSize;
585 assert( n==4-4*pPage->leaf );
drh504b6982006-01-22 21:52:56 +0000586 if( pPage->intKey ){
drh79df1f42008-07-18 00:57:33 +0000587 if( pPage->hasData ){
588 n += getVarint32(&pCell[n], nPayload);
589 }else{
590 nPayload = 0;
591 }
592 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
593 pInfo->nData = nPayload;
drh504b6982006-01-22 21:52:56 +0000594 }else{
drh79df1f42008-07-18 00:57:33 +0000595 pInfo->nData = 0;
596 n += getVarint32(&pCell[n], nPayload);
597 pInfo->nKey = nPayload;
drh6f11bef2004-05-13 01:12:56 +0000598 }
drh72365832007-03-06 15:53:44 +0000599 pInfo->nPayload = nPayload;
drh504b6982006-01-22 21:52:56 +0000600 pInfo->nHeader = n;
drh79df1f42008-07-18 00:57:33 +0000601 if( likely(nPayload<=pPage->maxLocal) ){
drh271efa52004-05-30 19:19:05 +0000602 /* This is the (easy) common case where the entire payload fits
603 ** on the local page. No overflow is required.
604 */
605 int nSize; /* Total size of cell content in bytes */
drh79df1f42008-07-18 00:57:33 +0000606 nSize = nPayload + n;
drh6f11bef2004-05-13 01:12:56 +0000607 pInfo->nLocal = nPayload;
608 pInfo->iOverflow = 0;
drh79df1f42008-07-18 00:57:33 +0000609 if( (nSize & ~3)==0 ){
drh271efa52004-05-30 19:19:05 +0000610 nSize = 4; /* Minimum cell size is 4 */
drh43605152004-05-29 21:46:49 +0000611 }
drh271efa52004-05-30 19:19:05 +0000612 pInfo->nSize = nSize;
drh6f11bef2004-05-13 01:12:56 +0000613 }else{
drh271efa52004-05-30 19:19:05 +0000614 /* If the payload will not fit completely on the local page, we have
615 ** to decide how much to store locally and how much to spill onto
616 ** overflow pages. The strategy is to minimize the amount of unused
617 ** space on overflow pages while keeping the amount of local storage
618 ** in between minLocal and maxLocal.
619 **
620 ** Warning: changing the way overflow payload is distributed in any
621 ** way will result in an incompatible file format.
622 */
623 int minLocal; /* Minimum amount of payload held locally */
624 int maxLocal; /* Maximum amount of payload held locally */
625 int surplus; /* Overflow payload available for local storage */
626
627 minLocal = pPage->minLocal;
628 maxLocal = pPage->maxLocal;
629 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +0000630 if( surplus <= maxLocal ){
631 pInfo->nLocal = surplus;
632 }else{
633 pInfo->nLocal = minLocal;
634 }
635 pInfo->iOverflow = pInfo->nLocal + n;
636 pInfo->nSize = pInfo->iOverflow + 4;
637 }
drh3aac2dd2004-04-26 14:10:20 +0000638}
/* Parse cell number iCell of page pPage into *pInfo.  The cell is located
** with findCell(), so this is only for pages without overflow cells. */
#define parseCell(pPage, iCell, pInfo) \
  sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
drh16a9b832007-05-05 18:39:25 +0000641void sqlite3BtreeParseCell(
drh43605152004-05-29 21:46:49 +0000642 MemPage *pPage, /* Page containing the cell */
643 int iCell, /* The cell index. First cell is 0 */
644 CellInfo *pInfo /* Fill in this structure */
645){
danielk19771cc5ed82007-05-16 17:28:43 +0000646 parseCell(pPage, iCell, pInfo);
drh43605152004-05-29 21:46:49 +0000647}
drh3aac2dd2004-04-26 14:10:20 +0000648
649/*
drh43605152004-05-29 21:46:49 +0000650** Compute the total number of bytes that a Cell needs in the cell
651** data area of the btree-page. The return number includes the cell
652** data header and the local payload, but not any overflow page or
653** the space used by the cell pointer.
drh3b7511c2001-05-26 13:15:44 +0000654*/
danielk1977bc6ada42004-06-30 08:20:16 +0000655#ifndef NDEBUG
drha9121e42008-02-19 14:59:35 +0000656static u16 cellSize(MemPage *pPage, int iCell){
drh6f11bef2004-05-13 01:12:56 +0000657 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000658 sqlite3BtreeParseCell(pPage, iCell, &info);
drh43605152004-05-29 21:46:49 +0000659 return info.nSize;
660}
danielk1977bc6ada42004-06-30 08:20:16 +0000661#endif
drha9121e42008-02-19 14:59:35 +0000662static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
drh43605152004-05-29 21:46:49 +0000663 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000664 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +0000665 return info.nSize;
drh3b7511c2001-05-26 13:15:44 +0000666}
667
danielk197779a40da2005-01-16 08:00:01 +0000668#ifndef SQLITE_OMIT_AUTOVACUUM
drh3b7511c2001-05-26 13:15:44 +0000669/*
danielk197726836652005-01-17 01:33:13 +0000670** If the cell pCell, part of page pPage contains a pointer
danielk197779a40da2005-01-16 08:00:01 +0000671** to an overflow page, insert an entry into the pointer-map
672** for the overflow page.
danielk1977ac11ee62005-01-15 12:45:51 +0000673*/
danielk197726836652005-01-17 01:33:13 +0000674static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
drhfa67c3c2008-07-11 02:21:40 +0000675 CellInfo info;
676 assert( pCell!=0 );
677 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
678 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
679 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
680 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
681 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977ac11ee62005-01-15 12:45:51 +0000682 }
danielk197779a40da2005-01-16 08:00:01 +0000683 return SQLITE_OK;
danielk1977ac11ee62005-01-15 12:45:51 +0000684}
danielk197726836652005-01-17 01:33:13 +0000685/*
686** If the cell with index iCell on page pPage contains a pointer
687** to an overflow page, insert an entry into the pointer-map
688** for the overflow page.
689*/
690static int ptrmapPutOvfl(MemPage *pPage, int iCell){
691 u8 *pCell;
drh1fee73e2007-08-29 04:00:57 +0000692 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197726836652005-01-17 01:33:13 +0000693 pCell = findOverflowCell(pPage, iCell);
694 return ptrmapPutOvflPtr(pPage, pCell);
695}
danielk197779a40da2005-01-16 08:00:01 +0000696#endif
697
danielk1977ac11ee62005-01-15 12:45:51 +0000698
drhda200cc2004-05-09 11:51:38 +0000699/*
drh72f82862001-05-24 21:06:34 +0000700** Defragment the page given. All Cells are moved to the
drh3a4a2d42005-11-24 14:24:28 +0000701** end of the page and all free space is collected into one
702** big FreeBlk that occurs in between the header and cell
drh31beae92005-11-24 14:34:36 +0000703** pointer array and the cell content area.
drh365d68f2001-05-11 11:02:46 +0000704*/
danielk1977474b7cc2008-07-09 11:49:46 +0000705static void defragmentPage(MemPage *pPage){
drh43605152004-05-29 21:46:49 +0000706 int i; /* Loop counter */
707 int pc; /* Address of a i-th cell */
708 int addr; /* Offset of first byte after cell pointer array */
709 int hdr; /* Offset to the page header */
710 int size; /* Size of a cell */
711 int usableSize; /* Number of usable bytes on a page */
712 int cellOffset; /* Offset to the cell pointer array */
drh281b21d2008-08-22 12:57:08 +0000713 int cbrk; /* Offset to the cell content area */
drh43605152004-05-29 21:46:49 +0000714 int nCell; /* Number of cells on the page */
drh2e38c322004-09-03 18:38:44 +0000715 unsigned char *data; /* The page data */
716 unsigned char *temp; /* Temp area for cell content */
drh2af926b2001-05-15 00:39:25 +0000717
danielk19773b8a05f2007-03-19 17:44:26 +0000718 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000719 assert( pPage->pBt!=0 );
drh90f5ecb2004-07-22 01:19:35 +0000720 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
drh43605152004-05-29 21:46:49 +0000721 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +0000722 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh26b79942007-11-28 16:19:56 +0000723 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
drh43605152004-05-29 21:46:49 +0000724 data = pPage->aData;
drh9e572e62004-04-23 23:43:10 +0000725 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000726 cellOffset = pPage->cellOffset;
727 nCell = pPage->nCell;
728 assert( nCell==get2byte(&data[hdr+3]) );
729 usableSize = pPage->pBt->usableSize;
drh281b21d2008-08-22 12:57:08 +0000730 cbrk = get2byte(&data[hdr+5]);
731 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
732 cbrk = usableSize;
drh43605152004-05-29 21:46:49 +0000733 for(i=0; i<nCell; i++){
734 u8 *pAddr; /* The i-th cell pointer */
735 pAddr = &data[cellOffset + i*2];
736 pc = get2byte(pAddr);
737 assert( pc<pPage->pBt->usableSize );
738 size = cellSizePtr(pPage, &temp[pc]);
drh281b21d2008-08-22 12:57:08 +0000739 cbrk -= size;
740 memcpy(&data[cbrk], &temp[pc], size);
741 put2byte(pAddr, cbrk);
drh2af926b2001-05-15 00:39:25 +0000742 }
drh281b21d2008-08-22 12:57:08 +0000743 assert( cbrk>=cellOffset+2*nCell );
744 put2byte(&data[hdr+5], cbrk);
drh43605152004-05-29 21:46:49 +0000745 data[hdr+1] = 0;
746 data[hdr+2] = 0;
747 data[hdr+7] = 0;
748 addr = cellOffset+2*nCell;
drh281b21d2008-08-22 12:57:08 +0000749 memset(&data[addr], 0, cbrk-addr);
drh365d68f2001-05-11 11:02:46 +0000750}
751
drha059ad02001-04-17 20:09:11 +0000752/*
drh43605152004-05-29 21:46:49 +0000753** Allocate nByte bytes of space on a page.
drhbd03cae2001-06-02 02:40:57 +0000754**
drh9e572e62004-04-23 23:43:10 +0000755** Return the index into pPage->aData[] of the first byte of
drhfa67c3c2008-07-11 02:21:40 +0000756** the new allocation. The caller guarantees that there is enough
757** space. This routine will never fail.
drh2af926b2001-05-15 00:39:25 +0000758**
drh72f82862001-05-24 21:06:34 +0000759** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000760** nBytes of contiguous free space, then this routine automatically
761** calls defragementPage() to consolidate all free space before
762** allocating the new chunk.
drh7e3b0a02001-04-28 16:52:40 +0000763*/
drh9e572e62004-04-23 23:43:10 +0000764static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000765 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000766 int size;
drh24cd67e2004-05-10 16:18:47 +0000767 int nFrag;
drh43605152004-05-29 21:46:49 +0000768 int top;
769 int nCell;
770 int cellOffset;
drh9e572e62004-04-23 23:43:10 +0000771 unsigned char *data;
drh43605152004-05-29 21:46:49 +0000772
drh9e572e62004-04-23 23:43:10 +0000773 data = pPage->aData;
danielk19773b8a05f2007-03-19 17:44:26 +0000774 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000775 assert( pPage->pBt );
drh1fee73e2007-08-29 04:00:57 +0000776 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa67c3c2008-07-11 02:21:40 +0000777 assert( nByte>=0 ); /* Minimum cell size is 4 */
778 assert( pPage->nFree>=nByte );
779 assert( pPage->nOverflow==0 );
drh43605152004-05-29 21:46:49 +0000780 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000781 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000782
783 nFrag = data[hdr+7];
784 if( nFrag<60 ){
785 /* Search the freelist looking for a slot big enough to satisfy the
786 ** space request. */
787 addr = hdr+1;
788 while( (pc = get2byte(&data[addr]))>0 ){
789 size = get2byte(&data[pc+2]);
790 if( size>=nByte ){
791 if( size<nByte+4 ){
792 memcpy(&data[addr], &data[pc], 2);
793 data[hdr+7] = nFrag + size - nByte;
794 return pc;
795 }else{
796 put2byte(&data[pc+2], size-nByte);
797 return pc + size - nByte;
798 }
799 }
800 addr = pc;
drh9e572e62004-04-23 23:43:10 +0000801 }
802 }
drh43605152004-05-29 21:46:49 +0000803
804 /* Allocate memory from the gap in between the cell pointer array
805 ** and the cell content area.
806 */
807 top = get2byte(&data[hdr+5]);
808 nCell = get2byte(&data[hdr+3]);
809 cellOffset = pPage->cellOffset;
810 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
danielk1977474b7cc2008-07-09 11:49:46 +0000811 defragmentPage(pPage);
drh43605152004-05-29 21:46:49 +0000812 top = get2byte(&data[hdr+5]);
drh2af926b2001-05-15 00:39:25 +0000813 }
drh43605152004-05-29 21:46:49 +0000814 top -= nByte;
815 assert( cellOffset + 2*nCell <= top );
816 put2byte(&data[hdr+5], top);
817 return top;
drh7e3b0a02001-04-28 16:52:40 +0000818}
819
820/*
drh9e572e62004-04-23 23:43:10 +0000821** Return a section of the pPage->aData to the freelist.
822** The first byte of the new free block is pPage->aDisk[start]
823** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000824**
825** Most of the effort here is involved in coalesing adjacent
826** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000827*/
drh9e572e62004-04-23 23:43:10 +0000828static void freeSpace(MemPage *pPage, int start, int size){
drh43605152004-05-29 21:46:49 +0000829 int addr, pbegin, hdr;
drh9e572e62004-04-23 23:43:10 +0000830 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000831
drh9e572e62004-04-23 23:43:10 +0000832 assert( pPage->pBt!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +0000833 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000834 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
danielk1977bc6ada42004-06-30 08:20:16 +0000835 assert( (start + size)<=pPage->pBt->usableSize );
drh1fee73e2007-08-29 04:00:57 +0000836 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh34004ce2008-07-11 16:15:17 +0000837 assert( size>=0 ); /* Minimum cell size is 4 */
drh9e572e62004-04-23 23:43:10 +0000838
drhfcce93f2006-02-22 03:08:32 +0000839#ifdef SQLITE_SECURE_DELETE
840 /* Overwrite deleted information with zeros when the SECURE_DELETE
841 ** option is enabled at compile-time */
842 memset(&data[start], 0, size);
843#endif
844
drh9e572e62004-04-23 23:43:10 +0000845 /* Add the space back into the linked list of freeblocks */
drh43605152004-05-29 21:46:49 +0000846 hdr = pPage->hdrOffset;
847 addr = hdr + 1;
drh3aac2dd2004-04-26 14:10:20 +0000848 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drhb6f41482004-05-14 01:58:11 +0000849 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000850 assert( pbegin>addr );
851 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000852 }
drhb6f41482004-05-14 01:58:11 +0000853 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000854 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000855 put2byte(&data[addr], start);
856 put2byte(&data[start], pbegin);
857 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000858 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000859
860 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000861 addr = pPage->hdrOffset + 1;
862 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000863 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000864 assert( pbegin>addr );
drh43605152004-05-29 21:46:49 +0000865 assert( pbegin<=pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +0000866 pnext = get2byte(&data[pbegin]);
867 psize = get2byte(&data[pbegin+2]);
868 if( pbegin + psize + 3 >= pnext && pnext>0 ){
869 int frag = pnext - (pbegin+psize);
drh43605152004-05-29 21:46:49 +0000870 assert( frag<=data[pPage->hdrOffset+7] );
871 data[pPage->hdrOffset+7] -= frag;
drh9e572e62004-04-23 23:43:10 +0000872 put2byte(&data[pbegin], get2byte(&data[pnext]));
873 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
874 }else{
drh3aac2dd2004-04-26 14:10:20 +0000875 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000876 }
877 }
drh7e3b0a02001-04-28 16:52:40 +0000878
drh43605152004-05-29 21:46:49 +0000879 /* If the cell content area begins with a freeblock, remove it. */
880 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
881 int top;
882 pbegin = get2byte(&data[hdr+1]);
883 memcpy(&data[hdr+1], &data[pbegin], 2);
884 top = get2byte(&data[hdr+5]);
885 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
drh4b70f112004-05-02 21:12:19 +0000886 }
drh4b70f112004-05-02 21:12:19 +0000887}
888
889/*
drh271efa52004-05-30 19:19:05 +0000890** Decode the flags byte (the first byte of the header) for a page
891** and initialize fields of the MemPage structure accordingly.
drh44845222008-07-17 18:39:57 +0000892**
893** Only the following combinations are supported. Anything different
894** indicates a corrupt database files:
895**
896** PTF_ZERODATA
897** PTF_ZERODATA | PTF_LEAF
898** PTF_LEAFDATA | PTF_INTKEY
899** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
drh271efa52004-05-30 19:19:05 +0000900*/
drh44845222008-07-17 18:39:57 +0000901static int decodeFlags(MemPage *pPage, int flagByte){
danielk1977aef0bf62005-12-30 16:28:01 +0000902 BtShared *pBt; /* A copy of pPage->pBt */
drh271efa52004-05-30 19:19:05 +0000903
904 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
drh1fee73e2007-08-29 04:00:57 +0000905 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh44845222008-07-17 18:39:57 +0000906 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
907 flagByte &= ~PTF_LEAF;
908 pPage->childPtrSize = 4-4*pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000909 pBt = pPage->pBt;
drh44845222008-07-17 18:39:57 +0000910 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
911 pPage->intKey = 1;
912 pPage->hasData = pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000913 pPage->maxLocal = pBt->maxLeaf;
914 pPage->minLocal = pBt->minLeaf;
drh44845222008-07-17 18:39:57 +0000915 }else if( flagByte==PTF_ZERODATA ){
916 pPage->intKey = 0;
917 pPage->hasData = 0;
drh271efa52004-05-30 19:19:05 +0000918 pPage->maxLocal = pBt->maxLocal;
919 pPage->minLocal = pBt->minLocal;
drh44845222008-07-17 18:39:57 +0000920 }else{
921 return SQLITE_CORRUPT_BKPT;
drh271efa52004-05-30 19:19:05 +0000922 }
drh44845222008-07-17 18:39:57 +0000923 return SQLITE_OK;
drh271efa52004-05-30 19:19:05 +0000924}
925
926/*
drh7e3b0a02001-04-28 16:52:40 +0000927** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000928**
929** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000930** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000931** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
932** guarantee that the page is well-formed. It only shows that
933** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000934*/
danielk197771d5d2c2008-09-29 11:49:47 +0000935int sqlite3BtreeInitPage(MemPage *pPage){
drh2af926b2001-05-15 00:39:25 +0000936
danielk197771d5d2c2008-09-29 11:49:47 +0000937 assert( pPage->pBt!=0 );
938 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +0000939 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
drhbf4bca52007-09-06 22:19:14 +0000940 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
941 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +0000942
943 if( !pPage->isInit ){
944 int pc; /* Address of a freeblock within pPage->aData[] */
945 int hdr; /* Offset to beginning of page header */
946 u8 *data; /* Equal to pPage->aData */
947 BtShared *pBt; /* The main btree structure */
948 int usableSize; /* Amount of usable space on each page */
949 int cellOffset; /* Offset from start of page to first cell pointer */
950 int nFree; /* Number of unused bytes on the page */
951 int top; /* First byte of the cell content area */
952
953 pBt = pPage->pBt;
954
danielk1977eaa06f62008-09-18 17:34:44 +0000955 hdr = pPage->hdrOffset;
956 data = pPage->aData;
957 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
958 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
959 pPage->maskPage = pBt->pageSize - 1;
960 pPage->nOverflow = 0;
danielk1977eaa06f62008-09-18 17:34:44 +0000961 usableSize = pBt->usableSize;
962 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
963 top = get2byte(&data[hdr+5]);
964 pPage->nCell = get2byte(&data[hdr+3]);
965 if( pPage->nCell>MX_CELL(pBt) ){
966 /* To many cells for a single page. The page must be corrupt */
967 return SQLITE_CORRUPT_BKPT;
968 }
danielk1977eaa06f62008-09-18 17:34:44 +0000969
970 /* Compute the total free space on the page */
971 pc = get2byte(&data[hdr+1]);
972 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
973 while( pc>0 ){
974 int next, size;
975 if( pc>usableSize-4 ){
976 /* Free block is off the page */
977 return SQLITE_CORRUPT_BKPT;
978 }
979 next = get2byte(&data[pc]);
980 size = get2byte(&data[pc+2]);
981 if( next>0 && next<=pc+size+3 ){
982 /* Free blocks must be in accending order */
983 return SQLITE_CORRUPT_BKPT;
984 }
985 nFree += size;
986 pc = next;
987 }
988 pPage->nFree = nFree;
989 if( nFree>=usableSize ){
990 /* Free space cannot exceed total page size */
drh49285702005-09-17 15:20:26 +0000991 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +0000992 }
drh9e572e62004-04-23 23:43:10 +0000993
drh1688c862008-07-18 02:44:17 +0000994#if 0
995 /* Check that all the offsets in the cell offset array are within range.
996 **
997 ** Omitting this consistency check and using the pPage->maskPage mask
998 ** to prevent overrunning the page buffer in findCell() results in a
999 ** 2.5% performance gain.
1000 */
1001 {
1002 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1003 u8 *pEnd; /* Pointer to end of cell offset array */
1004 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1005 mask = ~(((u8)(pBt->pageSize>>8))-1);
1006 pEnd = &data[cellOffset + pPage->nCell*2];
1007 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1008 if( pOff!=pEnd ){
1009 return SQLITE_CORRUPT_BKPT;
1010 }
danielk1977e16535f2008-06-11 18:15:29 +00001011 }
drh1688c862008-07-18 02:44:17 +00001012#endif
danielk1977e16535f2008-06-11 18:15:29 +00001013
danielk197771d5d2c2008-09-29 11:49:47 +00001014 pPage->isInit = 1;
1015 }
drh9e572e62004-04-23 23:43:10 +00001016 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001017}
1018
1019/*
drh8b2f49b2001-06-08 00:21:52 +00001020** Set up a raw page so that it looks like a database page holding
1021** no entries.
drhbd03cae2001-06-02 02:40:57 +00001022*/
drh9e572e62004-04-23 23:43:10 +00001023static void zeroPage(MemPage *pPage, int flags){
1024 unsigned char *data = pPage->aData;
danielk1977aef0bf62005-12-30 16:28:01 +00001025 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00001026 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +00001027 int first;
1028
danielk19773b8a05f2007-03-19 17:44:26 +00001029 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
drhbf4bca52007-09-06 22:19:14 +00001030 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1031 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
danielk19773b8a05f2007-03-19 17:44:26 +00001032 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00001033 assert( sqlite3_mutex_held(pBt->mutex) );
drh1af4a6e2008-07-18 03:32:51 +00001034 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
drh9e572e62004-04-23 23:43:10 +00001035 data[hdr] = flags;
drh43605152004-05-29 21:46:49 +00001036 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1037 memset(&data[hdr+1], 0, 4);
1038 data[hdr+7] = 0;
1039 put2byte(&data[hdr+5], pBt->usableSize);
drhb6f41482004-05-14 01:58:11 +00001040 pPage->nFree = pBt->usableSize - first;
drh271efa52004-05-30 19:19:05 +00001041 decodeFlags(pPage, flags);
drh9e572e62004-04-23 23:43:10 +00001042 pPage->hdrOffset = hdr;
drh43605152004-05-29 21:46:49 +00001043 pPage->cellOffset = first;
1044 pPage->nOverflow = 0;
drh1688c862008-07-18 02:44:17 +00001045 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1046 pPage->maskPage = pBt->pageSize - 1;
drh43605152004-05-29 21:46:49 +00001047 pPage->nCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001048 pPage->isInit = 1;
drhbd03cae2001-06-02 02:40:57 +00001049}
1050
drh897a8202008-09-18 01:08:15 +00001051
1052/*
1053** Convert a DbPage obtained from the pager into a MemPage used by
1054** the btree layer.
1055*/
1056static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1057 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1058 pPage->aData = sqlite3PagerGetData(pDbPage);
1059 pPage->pDbPage = pDbPage;
1060 pPage->pBt = pBt;
1061 pPage->pgno = pgno;
1062 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1063 return pPage;
1064}
1065
drhbd03cae2001-06-02 02:40:57 +00001066/*
drh3aac2dd2004-04-26 14:10:20 +00001067** Get a page from the pager. Initialize the MemPage.pBt and
1068** MemPage.aData elements if needed.
drh538f5702007-04-13 02:14:30 +00001069**
1070** If the noContent flag is set, it means that we do not care about
1071** the content of the page at this time. So do not go to the disk
1072** to fetch the content. Just fill in the content with zeros for now.
1073** If in the future we call sqlite3PagerWrite() on this page, that
1074** means we have started to be concerned about content and the disk
1075** read should occur at that point.
drh3aac2dd2004-04-26 14:10:20 +00001076*/
drh16a9b832007-05-05 18:39:25 +00001077int sqlite3BtreeGetPage(
1078 BtShared *pBt, /* The btree */
1079 Pgno pgno, /* Number of the page to fetch */
1080 MemPage **ppPage, /* Return the page in this parameter */
1081 int noContent /* Do not load page content if true */
1082){
drh3aac2dd2004-04-26 14:10:20 +00001083 int rc;
danielk19773b8a05f2007-03-19 17:44:26 +00001084 DbPage *pDbPage;
1085
drh1fee73e2007-08-29 04:00:57 +00001086 assert( sqlite3_mutex_held(pBt->mutex) );
drh538f5702007-04-13 02:14:30 +00001087 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
drh3aac2dd2004-04-26 14:10:20 +00001088 if( rc ) return rc;
drh897a8202008-09-18 01:08:15 +00001089 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
drh3aac2dd2004-04-26 14:10:20 +00001090 return SQLITE_OK;
1091}
1092
1093/*
danielk197767fd7a92008-09-10 17:53:35 +00001094** Return the size of the database file in pages. Or return -1 if
1095** there is any kind of error.
1096*/
1097static int pagerPagecount(Pager *pPager){
1098 int rc;
1099 int nPage;
1100 rc = sqlite3PagerPagecount(pPager, &nPage);
1101 return (rc==SQLITE_OK?nPage:-1);
1102}
1103
1104/*
drhde647132004-05-07 17:57:49 +00001105** Get a page from the pager and initialize it. This routine
1106** is just a convenience wrapper around separate calls to
drh16a9b832007-05-05 18:39:25 +00001107** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
drhde647132004-05-07 17:57:49 +00001108*/
1109static int getAndInitPage(
danielk1977aef0bf62005-12-30 16:28:01 +00001110 BtShared *pBt, /* The database file */
drhde647132004-05-07 17:57:49 +00001111 Pgno pgno, /* Number of the page to get */
danielk197771d5d2c2008-09-29 11:49:47 +00001112 MemPage **ppPage /* Write the page pointer here */
drhde647132004-05-07 17:57:49 +00001113){
1114 int rc;
drh897a8202008-09-18 01:08:15 +00001115 DbPage *pDbPage;
1116 MemPage *pPage;
1117
drh1fee73e2007-08-29 04:00:57 +00001118 assert( sqlite3_mutex_held(pBt->mutex) );
drh897a8202008-09-18 01:08:15 +00001119 if( pgno==0 ){
drh49285702005-09-17 15:20:26 +00001120 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001121 }
danielk19779f580ad2008-09-10 14:45:57 +00001122
drh897a8202008-09-18 01:08:15 +00001123 /* It is often the case that the page we want is already in cache.
1124 ** If so, get it directly. This saves us from having to call
1125 ** pagerPagecount() to make sure pgno is within limits, which results
1126 ** in a measureable performance improvements.
1127 */
1128 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1129 if( pDbPage ){
1130 /* Page is already in cache */
1131 *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1132 rc = SQLITE_OK;
1133 }else{
1134 /* Page not in cache. Acquire it. */
1135 if( pgno>pagerPagecount(pBt->pPager) ){
1136 return SQLITE_CORRUPT_BKPT;
1137 }
1138 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1139 if( rc ) return rc;
1140 pPage = *ppPage;
1141 }
danielk197771d5d2c2008-09-29 11:49:47 +00001142 if( !pPage->isInit ){
1143 rc = sqlite3BtreeInitPage(pPage);
drh897a8202008-09-18 01:08:15 +00001144 }
1145 if( rc!=SQLITE_OK ){
1146 releasePage(pPage);
1147 *ppPage = 0;
1148 }
drhde647132004-05-07 17:57:49 +00001149 return rc;
1150}
1151
1152/*
drh3aac2dd2004-04-26 14:10:20 +00001153** Release a MemPage. This should be called once for each prior
drh16a9b832007-05-05 18:39:25 +00001154** call to sqlite3BtreeGetPage.
drh3aac2dd2004-04-26 14:10:20 +00001155*/
drh4b70f112004-05-02 21:12:19 +00001156static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +00001157 if( pPage ){
1158 assert( pPage->aData );
1159 assert( pPage->pBt );
drhbf4bca52007-09-06 22:19:14 +00001160 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1161 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
drh1fee73e2007-08-29 04:00:57 +00001162 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +00001163 sqlite3PagerUnref(pPage->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00001164 }
1165}
1166
1167/*
drha6abd042004-06-09 17:37:22 +00001168** During a rollback, when the pager reloads information into the cache
1169** so that the cache is restored to its original state at the start of
1170** the transaction, for each page restored this routine is called.
1171**
1172** This routine needs to reset the extra data section at the end of the
1173** page to agree with the restored data.
1174*/
danielk1977eaa06f62008-09-18 17:34:44 +00001175static void pageReinit(DbPage *pData){
drh07d183d2005-05-01 22:52:42 +00001176 MemPage *pPage;
danielk19773b8a05f2007-03-19 17:44:26 +00001177 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
danielk197771d5d2c2008-09-29 11:49:47 +00001178 if( pPage->isInit ){
drh1fee73e2007-08-29 04:00:57 +00001179 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drha6abd042004-06-09 17:37:22 +00001180 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001181 if( sqlite3PagerPageRefcount(pData)>0 ){
1182 sqlite3BtreeInitPage(pPage);
1183 }
drha6abd042004-06-09 17:37:22 +00001184 }
1185}
1186
1187/*
drhe5fe6902007-12-07 18:55:28 +00001188** Invoke the busy handler for a btree.
1189*/
1190static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1191 BtShared *pBt = (BtShared*)pArg;
1192 assert( pBt->db );
1193 assert( sqlite3_mutex_held(pBt->db->mutex) );
1194 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1195}
1196
1197/*
drhad3e0102004-09-03 23:32:18 +00001198** Open a database file.
1199**
drh382c0242001-10-06 16:33:02 +00001200** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +00001201** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +00001202** database file will be deleted when sqlite3BtreeClose() is called.
drhe53831d2007-08-17 01:14:38 +00001203** If zFilename is ":memory:" then an in-memory database is created
1204** that is automatically destroyed when it is closed.
drha059ad02001-04-17 20:09:11 +00001205*/
drh23e11ca2004-05-04 17:27:28 +00001206int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +00001207 const char *zFilename, /* Name of the file containing the BTree database */
drhe5fe6902007-12-07 18:55:28 +00001208 sqlite3 *db, /* Associated database handle */
drh3aac2dd2004-04-26 14:10:20 +00001209 Btree **ppBtree, /* Pointer to new Btree object written here */
drh33f4e022007-09-03 15:19:34 +00001210 int flags, /* Options */
1211 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
drh6019e162001-07-02 17:51:45 +00001212){
drhd677b3d2007-08-20 22:48:41 +00001213 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
drhe53831d2007-08-17 01:14:38 +00001214 BtShared *pBt = 0; /* Shared part of btree structure */
danielk1977aef0bf62005-12-30 16:28:01 +00001215 Btree *p; /* Handle to return */
danielk1977dddbcdc2007-04-26 14:42:34 +00001216 int rc = SQLITE_OK;
drh90f5ecb2004-07-22 01:19:35 +00001217 int nReserve;
1218 unsigned char zDbHeader[100];
danielk1977aef0bf62005-12-30 16:28:01 +00001219
1220 /* Set the variable isMemdb to true for an in-memory database, or
1221 ** false for a file-based database. This symbol is only required if
1222 ** either of the shared-data or autovacuum features are compiled
1223 ** into the library.
1224 */
1225#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1226 #ifdef SQLITE_OMIT_MEMORYDB
drh980b1a72006-08-16 16:42:48 +00001227 const int isMemdb = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00001228 #else
drh980b1a72006-08-16 16:42:48 +00001229 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
danielk1977aef0bf62005-12-30 16:28:01 +00001230 #endif
1231#endif
1232
drhe5fe6902007-12-07 18:55:28 +00001233 assert( db!=0 );
1234 assert( sqlite3_mutex_held(db->mutex) );
drh153c62c2007-08-24 03:51:33 +00001235
drhe5fe6902007-12-07 18:55:28 +00001236 pVfs = db->pVfs;
drh17435752007-08-16 04:30:38 +00001237 p = sqlite3MallocZero(sizeof(Btree));
danielk1977aef0bf62005-12-30 16:28:01 +00001238 if( !p ){
1239 return SQLITE_NOMEM;
1240 }
1241 p->inTrans = TRANS_NONE;
drhe5fe6902007-12-07 18:55:28 +00001242 p->db = db;
danielk1977aef0bf62005-12-30 16:28:01 +00001243
drh198bf392006-01-06 21:52:49 +00001244#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001245 /*
1246 ** If this Btree is a candidate for shared cache, try to find an
1247 ** existing BtShared object that we can share with
1248 */
drh34004ce2008-07-11 16:15:17 +00001249 if( isMemdb==0
drhe5fe6902007-12-07 18:55:28 +00001250 && (db->flags & SQLITE_Vtab)==0
drhe53831d2007-08-17 01:14:38 +00001251 && zFilename && zFilename[0]
drhe53831d2007-08-17 01:14:38 +00001252 ){
danielk1977502b4e02008-09-02 14:07:24 +00001253 if( sqlite3GlobalConfig.sharedCacheEnabled ){
danielk1977adfb9b02007-09-17 07:02:56 +00001254 int nFullPathname = pVfs->mxPathname+1;
drhe5ae5732008-06-15 02:51:47 +00001255 char *zFullPathname = sqlite3Malloc(nFullPathname);
drhff0587c2007-08-29 17:43:19 +00001256 sqlite3_mutex *mutexShared;
1257 p->sharable = 1;
drh34004ce2008-07-11 16:15:17 +00001258 db->flags |= SQLITE_SharedCache;
drhff0587c2007-08-29 17:43:19 +00001259 if( !zFullPathname ){
1260 sqlite3_free(p);
1261 return SQLITE_NOMEM;
1262 }
danielk1977adfb9b02007-09-17 07:02:56 +00001263 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
danielk197759f8c082008-06-18 17:09:10 +00001264 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhff0587c2007-08-29 17:43:19 +00001265 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001266 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
drhff0587c2007-08-29 17:43:19 +00001267 assert( pBt->nRef>0 );
1268 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1269 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1270 p->pBt = pBt;
1271 pBt->nRef++;
1272 break;
1273 }
1274 }
1275 sqlite3_mutex_leave(mutexShared);
1276 sqlite3_free(zFullPathname);
danielk1977aef0bf62005-12-30 16:28:01 +00001277 }
drhff0587c2007-08-29 17:43:19 +00001278#ifdef SQLITE_DEBUG
1279 else{
1280 /* In debug mode, we mark all persistent databases as sharable
1281 ** even when they are not. This exercises the locking code and
1282 ** gives more opportunity for asserts(sqlite3_mutex_held())
1283 ** statements to find locking problems.
1284 */
1285 p->sharable = 1;
1286 }
1287#endif
danielk1977aef0bf62005-12-30 16:28:01 +00001288 }
1289#endif
drha059ad02001-04-17 20:09:11 +00001290 if( pBt==0 ){
drhe53831d2007-08-17 01:14:38 +00001291 /*
1292 ** The following asserts make sure that structures used by the btree are
1293 ** the right size. This is to guard against size changes that result
1294 ** when compiling on a different architecture.
danielk197703aded42004-11-22 05:26:27 +00001295 */
drhe53831d2007-08-17 01:14:38 +00001296 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1297 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1298 assert( sizeof(u32)==4 );
1299 assert( sizeof(u16)==2 );
1300 assert( sizeof(Pgno)==4 );
1301
1302 pBt = sqlite3MallocZero( sizeof(*pBt) );
1303 if( pBt==0 ){
1304 rc = SQLITE_NOMEM;
1305 goto btree_open_out;
1306 }
drhe5fe6902007-12-07 18:55:28 +00001307 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1308 pBt->busyHdr.pArg = pBt;
danielk197771d5d2c2008-09-29 11:49:47 +00001309 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
drh33f4e022007-09-03 15:19:34 +00001310 EXTRA_SIZE, flags, vfsFlags);
drhe53831d2007-08-17 01:14:38 +00001311 if( rc==SQLITE_OK ){
1312 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1313 }
1314 if( rc!=SQLITE_OK ){
1315 goto btree_open_out;
1316 }
drhe5fe6902007-12-07 18:55:28 +00001317 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
drhe53831d2007-08-17 01:14:38 +00001318 p->pBt = pBt;
1319
drhe53831d2007-08-17 01:14:38 +00001320 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1321 pBt->pCursor = 0;
1322 pBt->pPage1 = 0;
1323 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1324 pBt->pageSize = get2byte(&zDbHeader[16]);
1325 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1326 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
danielk1977a1644fd2007-08-29 12:31:25 +00001327 pBt->pageSize = 0;
1328 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001329#ifndef SQLITE_OMIT_AUTOVACUUM
1330 /* If the magic name ":memory:" will create an in-memory database, then
1331 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1332 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1333 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1334 ** regular file-name. In this case the auto-vacuum applies as per normal.
1335 */
1336 if( zFilename && !isMemdb ){
1337 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1338 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1339 }
1340#endif
1341 nReserve = 0;
1342 }else{
1343 nReserve = zDbHeader[20];
drhe53831d2007-08-17 01:14:38 +00001344 pBt->pageSizeFixed = 1;
1345#ifndef SQLITE_OMIT_AUTOVACUUM
1346 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1347 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1348#endif
1349 }
1350 pBt->usableSize = pBt->pageSize - nReserve;
1351 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
danielk1977a1644fd2007-08-29 12:31:25 +00001352 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001353
1354#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1355 /* Add the new BtShared object to the linked list sharable BtShareds.
1356 */
1357 if( p->sharable ){
1358 sqlite3_mutex *mutexShared;
1359 pBt->nRef = 1;
danielk197759f8c082008-06-18 17:09:10 +00001360 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
danielk1977075c23a2008-09-01 18:34:20 +00001361 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
danielk197759f8c082008-06-18 17:09:10 +00001362 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
drh3285db22007-09-03 22:00:39 +00001363 if( pBt->mutex==0 ){
1364 rc = SQLITE_NOMEM;
drhe5fe6902007-12-07 18:55:28 +00001365 db->mallocFailed = 0;
drh3285db22007-09-03 22:00:39 +00001366 goto btree_open_out;
1367 }
drhff0587c2007-08-29 17:43:19 +00001368 }
drhe53831d2007-08-17 01:14:38 +00001369 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001370 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1371 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
drhe53831d2007-08-17 01:14:38 +00001372 sqlite3_mutex_leave(mutexShared);
danielk1977951af802004-11-05 15:45:09 +00001373 }
drheee46cf2004-11-06 00:02:48 +00001374#endif
drh90f5ecb2004-07-22 01:19:35 +00001375 }
danielk1977aef0bf62005-12-30 16:28:01 +00001376
drhcfed7bc2006-03-13 14:28:05 +00001377#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001378 /* If the new Btree uses a sharable pBtShared, then link the new
1379 ** Btree into the list of all sharable Btrees for the same connection.
drhabddb0c2007-08-20 13:14:28 +00001380 ** The list is kept in ascending order by pBt address.
danielk197754f01982006-01-18 15:25:17 +00001381 */
drhe53831d2007-08-17 01:14:38 +00001382 if( p->sharable ){
1383 int i;
1384 Btree *pSib;
drhe5fe6902007-12-07 18:55:28 +00001385 for(i=0; i<db->nDb; i++){
1386 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
drhe53831d2007-08-17 01:14:38 +00001387 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1388 if( p->pBt<pSib->pBt ){
1389 p->pNext = pSib;
1390 p->pPrev = 0;
1391 pSib->pPrev = p;
1392 }else{
drhabddb0c2007-08-20 13:14:28 +00001393 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
drhe53831d2007-08-17 01:14:38 +00001394 pSib = pSib->pNext;
1395 }
1396 p->pNext = pSib->pNext;
1397 p->pPrev = pSib;
1398 if( p->pNext ){
1399 p->pNext->pPrev = p;
1400 }
1401 pSib->pNext = p;
1402 }
1403 break;
1404 }
1405 }
danielk1977aef0bf62005-12-30 16:28:01 +00001406 }
danielk1977aef0bf62005-12-30 16:28:01 +00001407#endif
1408 *ppBtree = p;
danielk1977dddbcdc2007-04-26 14:42:34 +00001409
1410btree_open_out:
1411 if( rc!=SQLITE_OK ){
1412 if( pBt && pBt->pPager ){
1413 sqlite3PagerClose(pBt->pPager);
1414 }
drh17435752007-08-16 04:30:38 +00001415 sqlite3_free(pBt);
1416 sqlite3_free(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001417 *ppBtree = 0;
1418 }
1419 return rc;
drha059ad02001-04-17 20:09:11 +00001420}
1421
1422/*
drhe53831d2007-08-17 01:14:38 +00001423** Decrement the BtShared.nRef counter. When it reaches zero,
1424** remove the BtShared structure from the sharing list. Return
1425** true if the BtShared.nRef counter reaches zero and return
1426** false if it is still positive.
1427*/
1428static int removeFromSharingList(BtShared *pBt){
1429#ifndef SQLITE_OMIT_SHARED_CACHE
1430 sqlite3_mutex *pMaster;
1431 BtShared *pList;
1432 int removed = 0;
1433
drhd677b3d2007-08-20 22:48:41 +00001434 assert( sqlite3_mutex_notheld(pBt->mutex) );
danielk197759f8c082008-06-18 17:09:10 +00001435 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhe53831d2007-08-17 01:14:38 +00001436 sqlite3_mutex_enter(pMaster);
1437 pBt->nRef--;
1438 if( pBt->nRef<=0 ){
drh78f82d12008-09-02 00:52:52 +00001439 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1440 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
drhe53831d2007-08-17 01:14:38 +00001441 }else{
drh78f82d12008-09-02 00:52:52 +00001442 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
drh34004ce2008-07-11 16:15:17 +00001443 while( ALWAYS(pList) && pList->pNext!=pBt ){
drhe53831d2007-08-17 01:14:38 +00001444 pList=pList->pNext;
1445 }
drh34004ce2008-07-11 16:15:17 +00001446 if( ALWAYS(pList) ){
drhe53831d2007-08-17 01:14:38 +00001447 pList->pNext = pBt->pNext;
1448 }
1449 }
drh3285db22007-09-03 22:00:39 +00001450 if( SQLITE_THREADSAFE ){
1451 sqlite3_mutex_free(pBt->mutex);
1452 }
drhe53831d2007-08-17 01:14:38 +00001453 removed = 1;
1454 }
1455 sqlite3_mutex_leave(pMaster);
1456 return removed;
1457#else
1458 return 1;
1459#endif
1460}
1461
1462/*
drhf7141992008-06-19 00:16:08 +00001463** Make sure pBt->pTmpSpace points to an allocation of
1464** MX_CELL_SIZE(pBt) bytes.
1465*/
1466static void allocateTempSpace(BtShared *pBt){
1467 if( !pBt->pTmpSpace ){
1468 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1469 }
1470}
1471
1472/*
1473** Free the pBt->pTmpSpace allocation
1474*/
1475static void freeTempSpace(BtShared *pBt){
1476 sqlite3PageFree( pBt->pTmpSpace);
1477 pBt->pTmpSpace = 0;
1478}
1479
1480/*
drha059ad02001-04-17 20:09:11 +00001481** Close an open database and invalidate all cursors.
1482*/
danielk1977aef0bf62005-12-30 16:28:01 +00001483int sqlite3BtreeClose(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00001484 BtShared *pBt = p->pBt;
1485 BtCursor *pCur;
1486
danielk1977aef0bf62005-12-30 16:28:01 +00001487 /* Close all cursors opened via this handle. */
drhe5fe6902007-12-07 18:55:28 +00001488 assert( sqlite3_mutex_held(p->db->mutex) );
drhe53831d2007-08-17 01:14:38 +00001489 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001490 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001491 pCur = pBt->pCursor;
1492 while( pCur ){
1493 BtCursor *pTmp = pCur;
1494 pCur = pCur->pNext;
1495 if( pTmp->pBtree==p ){
1496 sqlite3BtreeCloseCursor(pTmp);
1497 }
drha059ad02001-04-17 20:09:11 +00001498 }
danielk1977aef0bf62005-12-30 16:28:01 +00001499
danielk19778d34dfd2006-01-24 16:37:57 +00001500 /* Rollback any active transaction and free the handle structure.
1501 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1502 ** this handle.
1503 */
danielk1977b597f742006-01-15 11:39:18 +00001504 sqlite3BtreeRollback(p);
drhe53831d2007-08-17 01:14:38 +00001505 sqlite3BtreeLeave(p);
danielk1977aef0bf62005-12-30 16:28:01 +00001506
danielk1977aef0bf62005-12-30 16:28:01 +00001507 /* If there are still other outstanding references to the shared-btree
1508 ** structure, return now. The remainder of this procedure cleans
1509 ** up the shared-btree.
1510 */
drhe53831d2007-08-17 01:14:38 +00001511 assert( p->wantToLock==0 && p->locked==0 );
1512 if( !p->sharable || removeFromSharingList(pBt) ){
1513 /* The pBt is no longer on the sharing list, so we can access
1514 ** it without having to hold the mutex.
1515 **
1516 ** Clean out and delete the BtShared object.
1517 */
1518 assert( !pBt->pCursor );
drhe53831d2007-08-17 01:14:38 +00001519 sqlite3PagerClose(pBt->pPager);
1520 if( pBt->xFreeSchema && pBt->pSchema ){
1521 pBt->xFreeSchema(pBt->pSchema);
1522 }
1523 sqlite3_free(pBt->pSchema);
drhf7141992008-06-19 00:16:08 +00001524 freeTempSpace(pBt);
drh65bbf292008-06-19 01:03:17 +00001525 sqlite3_free(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00001526 }
1527
drhe53831d2007-08-17 01:14:38 +00001528#ifndef SQLITE_OMIT_SHARED_CACHE
drhcab5ed72007-08-22 11:41:18 +00001529 assert( p->wantToLock==0 );
1530 assert( p->locked==0 );
1531 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1532 if( p->pNext ) p->pNext->pPrev = p->pPrev;
danielk1977aef0bf62005-12-30 16:28:01 +00001533#endif
1534
drhe53831d2007-08-17 01:14:38 +00001535 sqlite3_free(p);
drha059ad02001-04-17 20:09:11 +00001536 return SQLITE_OK;
1537}
1538
1539/*
drhda47d772002-12-02 04:25:19 +00001540** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +00001541**
1542** The maximum number of cache pages is set to the absolute
1543** value of mxPage. If mxPage is negative, the pager will
1544** operate asynchronously - it will not stop to do fsync()s
1545** to insure data is written to the disk surface before
1546** continuing. Transactions still work if synchronous is off,
1547** and the database cannot be corrupted if this program
1548** crashes. But if the operating system crashes or there is
1549** an abrupt power failure when synchronous is off, the database
1550** could be left in an inconsistent and unrecoverable state.
1551** Synchronous is on by default so database corruption is not
1552** normally a worry.
drhf57b14a2001-09-14 18:54:08 +00001553*/
danielk1977aef0bf62005-12-30 16:28:01 +00001554int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1555 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001556 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001557 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001558 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
drhd677b3d2007-08-20 22:48:41 +00001559 sqlite3BtreeLeave(p);
drhf57b14a2001-09-14 18:54:08 +00001560 return SQLITE_OK;
1561}
1562
1563/*
drh973b6e32003-02-12 14:09:42 +00001564** Change the way data is synced to disk in order to increase or decrease
1565** how well the database resists damage due to OS crashes and power
1566** failures. Level 1 is the same as asynchronous (no syncs() occur and
1567** there is a high probability of damage) Level 2 is the default. There
1568** is a very low but non-zero probability of damage. Level 3 reduces the
1569** probability of damage to near zero but with a write performance reduction.
1570*/
danielk197793758c82005-01-21 08:13:14 +00001571#ifndef SQLITE_OMIT_PAGER_PRAGMAS
drhac530b12006-02-11 01:25:50 +00001572int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
danielk1977aef0bf62005-12-30 16:28:01 +00001573 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001574 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001575 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001576 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
drhd677b3d2007-08-20 22:48:41 +00001577 sqlite3BtreeLeave(p);
drh973b6e32003-02-12 14:09:42 +00001578 return SQLITE_OK;
1579}
danielk197793758c82005-01-21 08:13:14 +00001580#endif
drh973b6e32003-02-12 14:09:42 +00001581
drh2c8997b2005-08-27 16:36:48 +00001582/*
1583** Return TRUE if the given btree is set to safety level 1. In other
1584** words, return TRUE if no sync() occurs on the disk files.
1585*/
danielk1977aef0bf62005-12-30 16:28:01 +00001586int sqlite3BtreeSyncDisabled(Btree *p){
1587 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001588 int rc;
drhe5fe6902007-12-07 18:55:28 +00001589 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001590 sqlite3BtreeEnter(p);
drhd0679ed2007-08-28 22:24:34 +00001591 assert( pBt && pBt->pPager );
drhd677b3d2007-08-20 22:48:41 +00001592 rc = sqlite3PagerNosync(pBt->pPager);
1593 sqlite3BtreeLeave(p);
1594 return rc;
drh2c8997b2005-08-27 16:36:48 +00001595}
1596
danielk1977576ec6b2005-01-21 11:55:25 +00001597#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
drh973b6e32003-02-12 14:09:42 +00001598/*
drh90f5ecb2004-07-22 01:19:35 +00001599** Change the default pages size and the number of reserved bytes per page.
drh06f50212004-11-02 14:24:33 +00001600**
1601** The page size must be a power of 2 between 512 and 65536. If the page
1602** size supplied does not meet this constraint then the page size is not
1603** changed.
1604**
1605** Page sizes are constrained to be a power of two so that the region
1606** of the database file used for locking (beginning at PENDING_BYTE,
1607** the first byte past the 1GB boundary, 0x40000000) needs to occur
1608** at the beginning of a page.
danielk197728129562005-01-11 10:25:06 +00001609**
1610** If parameter nReserve is less than zero, then the number of reserved
1611** bytes per page is left unchanged.
drh90f5ecb2004-07-22 01:19:35 +00001612*/
danielk1977aef0bf62005-12-30 16:28:01 +00001613int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
danielk1977a1644fd2007-08-29 12:31:25 +00001614 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00001615 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001616 sqlite3BtreeEnter(p);
drh90f5ecb2004-07-22 01:19:35 +00001617 if( pBt->pageSizeFixed ){
drhd677b3d2007-08-20 22:48:41 +00001618 sqlite3BtreeLeave(p);
drh90f5ecb2004-07-22 01:19:35 +00001619 return SQLITE_READONLY;
1620 }
1621 if( nReserve<0 ){
1622 nReserve = pBt->pageSize - pBt->usableSize;
1623 }
drh06f50212004-11-02 14:24:33 +00001624 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1625 ((pageSize-1)&pageSize)==0 ){
drh07d183d2005-05-01 22:52:42 +00001626 assert( (pageSize & 7)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00001627 assert( !pBt->pPage1 && !pBt->pCursor );
danielk1977a1644fd2007-08-29 12:31:25 +00001628 pBt->pageSize = pageSize;
drhf7141992008-06-19 00:16:08 +00001629 freeTempSpace(pBt);
danielk1977a1644fd2007-08-29 12:31:25 +00001630 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drh90f5ecb2004-07-22 01:19:35 +00001631 }
1632 pBt->usableSize = pBt->pageSize - nReserve;
drhd677b3d2007-08-20 22:48:41 +00001633 sqlite3BtreeLeave(p);
danielk1977a1644fd2007-08-29 12:31:25 +00001634 return rc;
drh90f5ecb2004-07-22 01:19:35 +00001635}
1636
1637/*
1638** Return the currently defined page size
1639*/
danielk1977aef0bf62005-12-30 16:28:01 +00001640int sqlite3BtreeGetPageSize(Btree *p){
1641 return p->pBt->pageSize;
drh90f5ecb2004-07-22 01:19:35 +00001642}
danielk1977aef0bf62005-12-30 16:28:01 +00001643int sqlite3BtreeGetReserve(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00001644 int n;
1645 sqlite3BtreeEnter(p);
1646 n = p->pBt->pageSize - p->pBt->usableSize;
1647 sqlite3BtreeLeave(p);
1648 return n;
drh2011d5f2004-07-22 02:40:37 +00001649}
drhf8e632b2007-05-08 14:51:36 +00001650
1651/*
1652** Set the maximum page count for a database if mxPage is positive.
1653** No changes are made if mxPage is 0 or negative.
1654** Regardless of the value of mxPage, return the maximum page count.
1655*/
1656int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
drhd677b3d2007-08-20 22:48:41 +00001657 int n;
1658 sqlite3BtreeEnter(p);
1659 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1660 sqlite3BtreeLeave(p);
1661 return n;
drhf8e632b2007-05-08 14:51:36 +00001662}
danielk1977576ec6b2005-01-21 11:55:25 +00001663#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
drh90f5ecb2004-07-22 01:19:35 +00001664
1665/*
danielk1977951af802004-11-05 15:45:09 +00001666** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1667** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1668** is disabled. The default value for the auto-vacuum property is
1669** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1670*/
danielk1977aef0bf62005-12-30 16:28:01 +00001671int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
danielk1977951af802004-11-05 15:45:09 +00001672#ifdef SQLITE_OMIT_AUTOVACUUM
drheee46cf2004-11-06 00:02:48 +00001673 return SQLITE_READONLY;
danielk1977951af802004-11-05 15:45:09 +00001674#else
danielk1977dddbcdc2007-04-26 14:42:34 +00001675 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001676 int rc = SQLITE_OK;
danielk1977dddbcdc2007-04-26 14:42:34 +00001677 int av = (autoVacuum?1:0);
drhd677b3d2007-08-20 22:48:41 +00001678
1679 sqlite3BtreeEnter(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001680 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00001681 rc = SQLITE_READONLY;
1682 }else{
1683 pBt->autoVacuum = av;
danielk1977951af802004-11-05 15:45:09 +00001684 }
drhd677b3d2007-08-20 22:48:41 +00001685 sqlite3BtreeLeave(p);
1686 return rc;
danielk1977951af802004-11-05 15:45:09 +00001687#endif
1688}
1689
1690/*
1691** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1692** enabled 1 is returned. Otherwise 0.
1693*/
danielk1977aef0bf62005-12-30 16:28:01 +00001694int sqlite3BtreeGetAutoVacuum(Btree *p){
danielk1977951af802004-11-05 15:45:09 +00001695#ifdef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001696 return BTREE_AUTOVACUUM_NONE;
danielk1977951af802004-11-05 15:45:09 +00001697#else
drhd677b3d2007-08-20 22:48:41 +00001698 int rc;
1699 sqlite3BtreeEnter(p);
1700 rc = (
danielk1977dddbcdc2007-04-26 14:42:34 +00001701 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1702 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1703 BTREE_AUTOVACUUM_INCR
1704 );
drhd677b3d2007-08-20 22:48:41 +00001705 sqlite3BtreeLeave(p);
1706 return rc;
danielk1977951af802004-11-05 15:45:09 +00001707#endif
1708}
1709
1710
1711/*
drha34b6762004-05-07 13:30:42 +00001712** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +00001713** also acquire a readlock on that file.
1714**
1715** SQLITE_OK is returned on success. If the file is not a
1716** well-formed database file, then SQLITE_CORRUPT is returned.
1717** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
drh4f0ee682007-03-30 20:43:40 +00001718** is returned if we run out of memory.
drh306dc212001-05-21 13:45:10 +00001719*/
danielk1977aef0bf62005-12-30 16:28:01 +00001720static int lockBtree(BtShared *pBt){
danielk1977f653d782008-03-20 11:04:21 +00001721 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001722 MemPage *pPage1;
danielk197793f7af92008-05-09 16:57:50 +00001723 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001724
drh1fee73e2007-08-29 04:00:57 +00001725 assert( sqlite3_mutex_held(pBt->mutex) );
drha34b6762004-05-07 13:30:42 +00001726 if( pBt->pPage1 ) return SQLITE_OK;
drh16a9b832007-05-05 18:39:25 +00001727 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
drh306dc212001-05-21 13:45:10 +00001728 if( rc!=SQLITE_OK ) return rc;
drh306dc212001-05-21 13:45:10 +00001729
1730 /* Do some checking to help insure the file we opened really is
1731 ** a valid database file.
1732 */
danielk1977ad0132d2008-06-07 08:58:22 +00001733 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1734 if( rc!=SQLITE_OK ){
danielk197793f7af92008-05-09 16:57:50 +00001735 goto page1_init_failed;
1736 }else if( nPage>0 ){
danielk1977f653d782008-03-20 11:04:21 +00001737 int pageSize;
1738 int usableSize;
drhb6f41482004-05-14 01:58:11 +00001739 u8 *page1 = pPage1->aData;
danielk1977ad0132d2008-06-07 08:58:22 +00001740 rc = SQLITE_NOTADB;
drhb6f41482004-05-14 01:58:11 +00001741 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00001742 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00001743 }
drh309169a2007-04-24 17:27:51 +00001744 if( page1[18]>1 ){
1745 pBt->readOnly = 1;
1746 }
1747 if( page1[19]>1 ){
drhb6f41482004-05-14 01:58:11 +00001748 goto page1_init_failed;
1749 }
drhe5ae5732008-06-15 02:51:47 +00001750
1751 /* The maximum embedded fraction must be exactly 25%. And the minimum
1752 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1753 ** The original design allowed these amounts to vary, but as of
1754 ** version 3.6.0, we require them to be fixed.
1755 */
1756 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1757 goto page1_init_failed;
1758 }
drh07d183d2005-05-01 22:52:42 +00001759 pageSize = get2byte(&page1[16]);
drh7dc385e2007-09-06 23:39:36 +00001760 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1761 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1762 ){
drh07d183d2005-05-01 22:52:42 +00001763 goto page1_init_failed;
1764 }
1765 assert( (pageSize & 7)==0 );
danielk1977f653d782008-03-20 11:04:21 +00001766 usableSize = pageSize - page1[20];
1767 if( pageSize!=pBt->pageSize ){
1768 /* After reading the first page of the database assuming a page size
1769 ** of BtShared.pageSize, we have discovered that the page-size is
1770 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1771 ** zero and return SQLITE_OK. The caller will call this function
1772 ** again with the correct page-size.
1773 */
1774 releasePage(pPage1);
1775 pBt->usableSize = usableSize;
1776 pBt->pageSize = pageSize;
drhf7141992008-06-19 00:16:08 +00001777 freeTempSpace(pBt);
danielk1977f653d782008-03-20 11:04:21 +00001778 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1779 return SQLITE_OK;
1780 }
1781 if( usableSize<500 ){
drhb6f41482004-05-14 01:58:11 +00001782 goto page1_init_failed;
1783 }
danielk1977f653d782008-03-20 11:04:21 +00001784 pBt->pageSize = pageSize;
1785 pBt->usableSize = usableSize;
drh057cd3a2005-02-15 16:23:02 +00001786#ifndef SQLITE_OMIT_AUTOVACUUM
1787 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
danielk197727b1f952007-06-25 08:16:58 +00001788 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
drh057cd3a2005-02-15 16:23:02 +00001789#endif
drh306dc212001-05-21 13:45:10 +00001790 }
drhb6f41482004-05-14 01:58:11 +00001791
1792 /* maxLocal is the maximum amount of payload to store locally for
1793 ** a cell. Make sure it is small enough so that at least minFanout
1794 ** cells can will fit on one page. We assume a 10-byte page header.
1795 ** Besides the payload, the cell must store:
drh43605152004-05-29 21:46:49 +00001796 ** 2-byte pointer to the cell
drhb6f41482004-05-14 01:58:11 +00001797 ** 4-byte child pointer
1798 ** 9-byte nKey value
1799 ** 4-byte nData value
1800 ** 4-byte overflow page pointer
drh43605152004-05-29 21:46:49 +00001801 ** So a cell consists of a 2-byte poiner, a header which is as much as
1802 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1803 ** page pointer.
drhb6f41482004-05-14 01:58:11 +00001804 */
drhe5ae5732008-06-15 02:51:47 +00001805 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1806 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
drh43605152004-05-29 21:46:49 +00001807 pBt->maxLeaf = pBt->usableSize - 35;
drhe5ae5732008-06-15 02:51:47 +00001808 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
drh2e38c322004-09-03 18:38:44 +00001809 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00001810 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00001811 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00001812
drh72f82862001-05-24 21:06:34 +00001813page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00001814 releasePage(pPage1);
1815 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00001816 return rc;
drh306dc212001-05-21 13:45:10 +00001817}
1818
1819/*
drhb8ef32c2005-03-14 02:01:49 +00001820** This routine works like lockBtree() except that it also invokes the
1821** busy callback if there is lock contention.
1822*/
danielk1977aef0bf62005-12-30 16:28:01 +00001823static int lockBtreeWithRetry(Btree *pRef){
drhb8ef32c2005-03-14 02:01:49 +00001824 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00001825
drh1fee73e2007-08-29 04:00:57 +00001826 assert( sqlite3BtreeHoldsMutex(pRef) );
danielk1977aef0bf62005-12-30 16:28:01 +00001827 if( pRef->inTrans==TRANS_NONE ){
1828 u8 inTransaction = pRef->pBt->inTransaction;
1829 btreeIntegrity(pRef);
1830 rc = sqlite3BtreeBeginTrans(pRef, 0);
1831 pRef->pBt->inTransaction = inTransaction;
1832 pRef->inTrans = TRANS_NONE;
1833 if( rc==SQLITE_OK ){
1834 pRef->pBt->nTransaction--;
1835 }
1836 btreeIntegrity(pRef);
drhb8ef32c2005-03-14 02:01:49 +00001837 }
1838 return rc;
1839}
1840
1841
1842/*
drhb8ca3072001-12-05 00:21:20 +00001843** If there are no outstanding cursors and we are not in the middle
1844** of a transaction but there is a read lock on the database, then
1845** this routine unrefs the first page of the database file which
1846** has the effect of releasing the read lock.
1847**
1848** If there are any outstanding cursors, this routine is a no-op.
1849**
1850** If there is a transaction in progress, this routine is a no-op.
1851*/
danielk1977aef0bf62005-12-30 16:28:01 +00001852static void unlockBtreeIfUnused(BtShared *pBt){
drh1fee73e2007-08-29 04:00:57 +00001853 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00001854 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00001855 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
drhde4fcfd2008-01-19 23:50:26 +00001856 assert( pBt->pPage1->aData );
1857#if 0
drh24c9a2e2007-01-05 02:00:47 +00001858 if( pBt->pPage1->aData==0 ){
1859 MemPage *pPage = pBt->pPage1;
drhbf4bca52007-09-06 22:19:14 +00001860 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
drh24c9a2e2007-01-05 02:00:47 +00001861 pPage->pBt = pBt;
1862 pPage->pgno = 1;
1863 }
drhde4fcfd2008-01-19 23:50:26 +00001864#endif
drh24c9a2e2007-01-05 02:00:47 +00001865 releasePage(pBt->pPage1);
drh51c6d962004-06-06 00:42:25 +00001866 }
drh3aac2dd2004-04-26 14:10:20 +00001867 pBt->pPage1 = 0;
drh3aac2dd2004-04-26 14:10:20 +00001868 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001869 }
1870}
1871
1872/*
drh9e572e62004-04-23 23:43:10 +00001873** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001874** file.
drh8b2f49b2001-06-08 00:21:52 +00001875*/
danielk1977aef0bf62005-12-30 16:28:01 +00001876static int newDatabase(BtShared *pBt){
drh9e572e62004-04-23 23:43:10 +00001877 MemPage *pP1;
1878 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001879 int rc;
danielk1977ad0132d2008-06-07 08:58:22 +00001880 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001881
drh1fee73e2007-08-29 04:00:57 +00001882 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977ad0132d2008-06-07 08:58:22 +00001883 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1884 if( rc!=SQLITE_OK || nPage>0 ){
1885 return rc;
1886 }
drh3aac2dd2004-04-26 14:10:20 +00001887 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001888 assert( pP1!=0 );
1889 data = pP1->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00001890 rc = sqlite3PagerWrite(pP1->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00001891 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001892 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1893 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00001894 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00001895 data[18] = 1;
1896 data[19] = 1;
drhb6f41482004-05-14 01:58:11 +00001897 data[20] = pBt->pageSize - pBt->usableSize;
drhe5ae5732008-06-15 02:51:47 +00001898 data[21] = 64;
1899 data[22] = 32;
1900 data[23] = 32;
drhb6f41482004-05-14 01:58:11 +00001901 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00001902 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drhf2a611c2004-09-05 00:33:43 +00001903 pBt->pageSizeFixed = 1;
danielk1977003ba062004-11-04 02:57:33 +00001904#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001905 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
danielk1977418899a2007-06-24 10:14:00 +00001906 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
danielk1977dddbcdc2007-04-26 14:42:34 +00001907 put4byte(&data[36 + 4*4], pBt->autoVacuum);
danielk1977418899a2007-06-24 10:14:00 +00001908 put4byte(&data[36 + 7*4], pBt->incrVacuum);
danielk1977003ba062004-11-04 02:57:33 +00001909#endif
drh8b2f49b2001-06-08 00:21:52 +00001910 return SQLITE_OK;
1911}
1912
1913/*
danielk1977ee5741e2004-05-31 10:01:34 +00001914** Attempt to start a new transaction. A write-transaction
drh684917c2004-10-05 02:41:42 +00001915** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
1917** transaction is started, meaning that no other process is allowed
1918** to access the database. A preexisting transaction may not be
drhb8ef32c2005-03-14 02:01:49 +00001919** upgraded to exclusive by calling this routine a second time - the
drh684917c2004-10-05 02:41:42 +00001920** exclusivity flag only works for a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001921**
danielk1977ee5741e2004-05-31 10:01:34 +00001922** A write-transaction must be started before attempting any
1923** changes to the database. None of the following routines
1924** will work unless a transaction is started first:
drh8b2f49b2001-06-08 00:21:52 +00001925**
drh23e11ca2004-05-04 17:27:28 +00001926** sqlite3BtreeCreateTable()
1927** sqlite3BtreeCreateIndex()
1928** sqlite3BtreeClearTable()
1929** sqlite3BtreeDropTable()
1930** sqlite3BtreeInsert()
1931** sqlite3BtreeDelete()
1932** sqlite3BtreeUpdateMeta()
danielk197713adf8a2004-06-03 16:08:41 +00001933**
drhb8ef32c2005-03-14 02:01:49 +00001934** If an initial attempt to acquire the lock fails because of lock contention
1935** and the database was previously unlocked, then invoke the busy handler
1936** if there is one. But if there was previously a read-lock, do not
1937** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1938** returned when there is already a read-lock in order to avoid a deadlock.
1939**
1940** Suppose there are two processes A and B. A has a read lock and B has
1941** a reserved lock. B tries to promote to exclusive but is blocked because
1942** of A's read lock. A tries to promote to reserved but is blocked by B.
1943** One or the other of the two processes must give way or there can be
1944** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1945** when A already has a read lock, we encourage A to give up and let B
1946** proceed.
drha059ad02001-04-17 20:09:11 +00001947*/
danielk1977aef0bf62005-12-30 16:28:01 +00001948int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1949 BtShared *pBt = p->pBt;
danielk1977ee5741e2004-05-31 10:01:34 +00001950 int rc = SQLITE_OK;
1951
drhd677b3d2007-08-20 22:48:41 +00001952 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001953 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001954 btreeIntegrity(p);
1955
danielk1977ee5741e2004-05-31 10:01:34 +00001956 /* If the btree is already in a write-transaction, or it
1957 ** is already in a read-transaction and a read-transaction
1958 ** is requested, this is a no-op.
1959 */
danielk1977aef0bf62005-12-30 16:28:01 +00001960 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
drhd677b3d2007-08-20 22:48:41 +00001961 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001962 }
drhb8ef32c2005-03-14 02:01:49 +00001963
1964 /* Write transactions are not possible on a read-only database */
danielk1977ee5741e2004-05-31 10:01:34 +00001965 if( pBt->readOnly && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00001966 rc = SQLITE_READONLY;
1967 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001968 }
1969
danielk1977aef0bf62005-12-30 16:28:01 +00001970 /* If another database handle has already opened a write transaction
1971 ** on this shared-btree structure and a second write transaction is
1972 ** requested, return SQLITE_BUSY.
1973 */
1974 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00001975 rc = SQLITE_BUSY;
1976 goto trans_begun;
danielk1977aef0bf62005-12-30 16:28:01 +00001977 }
1978
danielk1977641b0f42007-12-21 04:47:25 +00001979#ifndef SQLITE_OMIT_SHARED_CACHE
1980 if( wrflag>1 ){
1981 BtLock *pIter;
1982 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1983 if( pIter->pBtree!=p ){
1984 rc = SQLITE_BUSY;
1985 goto trans_begun;
1986 }
1987 }
1988 }
1989#endif
1990
drhb8ef32c2005-03-14 02:01:49 +00001991 do {
drh8a9c17f2008-05-02 14:23:54 +00001992 if( pBt->pPage1==0 ){
1993 do{
1994 rc = lockBtree(pBt);
1995 }while( pBt->pPage1==0 && rc==SQLITE_OK );
drh8c42ca92001-06-22 19:15:00 +00001996 }
drh309169a2007-04-24 17:27:51 +00001997
drhb8ef32c2005-03-14 02:01:49 +00001998 if( rc==SQLITE_OK && wrflag ){
drh309169a2007-04-24 17:27:51 +00001999 if( pBt->readOnly ){
2000 rc = SQLITE_READONLY;
2001 }else{
2002 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2003 if( rc==SQLITE_OK ){
2004 rc = newDatabase(pBt);
2005 }
drhb8ef32c2005-03-14 02:01:49 +00002006 }
2007 }
2008
2009 if( rc==SQLITE_OK ){
drhb8ef32c2005-03-14 02:01:49 +00002010 if( wrflag ) pBt->inStmt = 0;
2011 }else{
2012 unlockBtreeIfUnused(pBt);
2013 }
danielk1977aef0bf62005-12-30 16:28:01 +00002014 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
drhe5fe6902007-12-07 18:55:28 +00002015 sqlite3BtreeInvokeBusyHandler(pBt, 0) );
danielk1977aef0bf62005-12-30 16:28:01 +00002016
2017 if( rc==SQLITE_OK ){
2018 if( p->inTrans==TRANS_NONE ){
2019 pBt->nTransaction++;
2020 }
2021 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2022 if( p->inTrans>pBt->inTransaction ){
2023 pBt->inTransaction = p->inTrans;
2024 }
danielk1977641b0f42007-12-21 04:47:25 +00002025#ifndef SQLITE_OMIT_SHARED_CACHE
2026 if( wrflag>1 ){
2027 assert( !pBt->pExclusive );
2028 pBt->pExclusive = p;
2029 }
2030#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002031 }
2032
drhd677b3d2007-08-20 22:48:41 +00002033
2034trans_begun:
danielk1977aef0bf62005-12-30 16:28:01 +00002035 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002036 sqlite3BtreeLeave(p);
drhb8ca3072001-12-05 00:21:20 +00002037 return rc;
drha059ad02001-04-17 20:09:11 +00002038}
2039
drh4a0611d2008-07-18 17:16:26 +00002040
danielk1977687566d2004-11-02 12:56:41 +00002041#ifndef SQLITE_OMIT_AUTOVACUUM
2042
2043/*
2044** Set the pointer-map entries for all children of page pPage. Also, if
2045** pPage contains cells that point to overflow pages, set the pointer
2046** map entries for the overflow pages as well.
2047*/
2048static int setChildPtrmaps(MemPage *pPage){
2049 int i; /* Counter variable */
2050 int nCell; /* Number of cells in page pPage */
danielk19772df71c72007-05-24 07:22:42 +00002051 int rc; /* Return code */
danielk1977aef0bf62005-12-30 16:28:01 +00002052 BtShared *pBt = pPage->pBt;
danielk1977687566d2004-11-02 12:56:41 +00002053 int isInitOrig = pPage->isInit;
2054 Pgno pgno = pPage->pgno;
2055
drh1fee73e2007-08-29 04:00:57 +00002056 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00002057 rc = sqlite3BtreeInitPage(pPage);
danielk19772df71c72007-05-24 07:22:42 +00002058 if( rc!=SQLITE_OK ){
2059 goto set_child_ptrmaps_out;
2060 }
danielk1977687566d2004-11-02 12:56:41 +00002061 nCell = pPage->nCell;
2062
2063 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002064 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002065
danielk197726836652005-01-17 01:33:13 +00002066 rc = ptrmapPutOvflPtr(pPage, pCell);
2067 if( rc!=SQLITE_OK ){
2068 goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002069 }
danielk197726836652005-01-17 01:33:13 +00002070
danielk1977687566d2004-11-02 12:56:41 +00002071 if( !pPage->leaf ){
2072 Pgno childPgno = get4byte(pCell);
2073 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
danielk197700a696d2008-09-29 16:41:31 +00002074 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002075 }
2076 }
2077
2078 if( !pPage->leaf ){
2079 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2080 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2081 }
2082
2083set_child_ptrmaps_out:
2084 pPage->isInit = isInitOrig;
2085 return rc;
2086}
2087
2088/*
2089** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2090** page, is a pointer to page iFrom. Modify this pointer so that it points to
2091** iTo. Parameter eType describes the type of pointer to be modified, as
2092** follows:
2093**
2094** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2095** page of pPage.
2096**
2097** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2098** page pointed to by one of the cells on pPage.
2099**
2100** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2101** overflow page in the list.
2102*/
danielk1977fdb7cdb2005-01-17 02:12:18 +00002103static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
drh1fee73e2007-08-29 04:00:57 +00002104 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk1977687566d2004-11-02 12:56:41 +00002105 if( eType==PTRMAP_OVERFLOW2 ){
danielk1977f78fc082004-11-02 14:40:32 +00002106 /* The pointer is always the first 4 bytes of the page in this case. */
danielk1977fdb7cdb2005-01-17 02:12:18 +00002107 if( get4byte(pPage->aData)!=iFrom ){
drh49285702005-09-17 15:20:26 +00002108 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002109 }
danielk1977f78fc082004-11-02 14:40:32 +00002110 put4byte(pPage->aData, iTo);
danielk1977687566d2004-11-02 12:56:41 +00002111 }else{
2112 int isInitOrig = pPage->isInit;
2113 int i;
2114 int nCell;
2115
danielk197771d5d2c2008-09-29 11:49:47 +00002116 sqlite3BtreeInitPage(pPage);
danielk1977687566d2004-11-02 12:56:41 +00002117 nCell = pPage->nCell;
2118
danielk1977687566d2004-11-02 12:56:41 +00002119 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002120 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002121 if( eType==PTRMAP_OVERFLOW1 ){
2122 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00002123 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
danielk1977687566d2004-11-02 12:56:41 +00002124 if( info.iOverflow ){
2125 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2126 put4byte(&pCell[info.iOverflow], iTo);
2127 break;
2128 }
2129 }
2130 }else{
2131 if( get4byte(pCell)==iFrom ){
2132 put4byte(pCell, iTo);
2133 break;
2134 }
2135 }
2136 }
2137
2138 if( i==nCell ){
danielk1977fdb7cdb2005-01-17 02:12:18 +00002139 if( eType!=PTRMAP_BTREE ||
2140 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
drh49285702005-09-17 15:20:26 +00002141 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002142 }
danielk1977687566d2004-11-02 12:56:41 +00002143 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2144 }
2145
2146 pPage->isInit = isInitOrig;
2147 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002148 return SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002149}
2150
danielk1977003ba062004-11-04 02:57:33 +00002151
danielk19777701e812005-01-10 12:59:51 +00002152/*
2153** Move the open database page pDbPage to location iFreePage in the
2154** database. The pDbPage reference remains valid.
2155*/
danielk1977003ba062004-11-04 02:57:33 +00002156static int relocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00002157 BtShared *pBt, /* Btree */
danielk19777701e812005-01-10 12:59:51 +00002158 MemPage *pDbPage, /* Open page to move */
2159 u8 eType, /* Pointer map 'type' entry for pDbPage */
2160 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
danielk19774c999992008-07-16 18:17:55 +00002161 Pgno iFreePage, /* The location to move pDbPage to */
2162 int isCommit
danielk1977003ba062004-11-04 02:57:33 +00002163){
2164 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2165 Pgno iDbPage = pDbPage->pgno;
2166 Pager *pPager = pBt->pPager;
2167 int rc;
2168
danielk1977a0bf2652004-11-04 14:30:04 +00002169 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2170 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
drh1fee73e2007-08-29 04:00:57 +00002171 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +00002172 assert( pDbPage->pBt==pBt );
danielk1977003ba062004-11-04 02:57:33 +00002173
drh85b623f2007-12-13 21:54:09 +00002174 /* Move page iDbPage from its current location to page number iFreePage */
danielk1977003ba062004-11-04 02:57:33 +00002175 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2176 iDbPage, iFreePage, iPtrPage, eType));
danielk19774c999992008-07-16 18:17:55 +00002177 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
danielk1977003ba062004-11-04 02:57:33 +00002178 if( rc!=SQLITE_OK ){
2179 return rc;
2180 }
2181 pDbPage->pgno = iFreePage;
2182
2183 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2184 ** that point to overflow pages. The pointer map entries for all these
2185 ** pages need to be changed.
2186 **
2187 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2188 ** pointer to a subsequent overflow page. If this is the case, then
2189 ** the pointer map needs to be updated for the subsequent overflow page.
2190 */
danielk1977a0bf2652004-11-04 14:30:04 +00002191 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00002192 rc = setChildPtrmaps(pDbPage);
2193 if( rc!=SQLITE_OK ){
2194 return rc;
2195 }
2196 }else{
2197 Pgno nextOvfl = get4byte(pDbPage->aData);
2198 if( nextOvfl!=0 ){
danielk1977003ba062004-11-04 02:57:33 +00002199 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2200 if( rc!=SQLITE_OK ){
2201 return rc;
2202 }
2203 }
2204 }
2205
2206 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2207 ** that it points at iFreePage. Also fix the pointer map entry for
2208 ** iPtrPage.
2209 */
danielk1977a0bf2652004-11-04 14:30:04 +00002210 if( eType!=PTRMAP_ROOTPAGE ){
drh16a9b832007-05-05 18:39:25 +00002211 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00002212 if( rc!=SQLITE_OK ){
2213 return rc;
2214 }
danielk19773b8a05f2007-03-19 17:44:26 +00002215 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
danielk1977a0bf2652004-11-04 14:30:04 +00002216 if( rc!=SQLITE_OK ){
2217 releasePage(pPtrPage);
2218 return rc;
2219 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002220 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
danielk1977003ba062004-11-04 02:57:33 +00002221 releasePage(pPtrPage);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002222 if( rc==SQLITE_OK ){
2223 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2224 }
danielk1977003ba062004-11-04 02:57:33 +00002225 }
danielk1977003ba062004-11-04 02:57:33 +00002226 return rc;
2227}
2228
danielk1977dddbcdc2007-04-26 14:42:34 +00002229/* Forward declaration required by incrVacuumStep(). */
drh4f0c5872007-03-26 22:05:01 +00002230static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
danielk1977687566d2004-11-02 12:56:41 +00002231
2232/*
danielk1977dddbcdc2007-04-26 14:42:34 +00002233** Perform a single step of an incremental-vacuum. If successful,
2234** return SQLITE_OK. If there is no work to do (and therefore no
2235** point in calling this function again), return SQLITE_DONE.
2236**
2237** More specificly, this function attempts to re-organize the
2238** database so that the last page of the file currently in use
2239** is no longer in use.
2240**
2241** If the nFin parameter is non-zero, the implementation assumes
2242** that the caller will keep calling incrVacuumStep() until
2243** it returns SQLITE_DONE or an error, and that nFin is the
2244** number of pages the database file will contain after this
2245** process is complete.
2246*/
2247static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2248 Pgno iLastPg; /* Last page in the database */
2249 Pgno nFreeList; /* Number of pages still on the free-list */
2250
drh1fee73e2007-08-29 04:00:57 +00002251 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977dddbcdc2007-04-26 14:42:34 +00002252 iLastPg = pBt->nTrunc;
2253 if( iLastPg==0 ){
danielk1977ad0132d2008-06-07 08:58:22 +00002254 iLastPg = pagerPagecount(pBt->pPager);
danielk1977dddbcdc2007-04-26 14:42:34 +00002255 }
2256
2257 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2258 int rc;
2259 u8 eType;
2260 Pgno iPtrPage;
2261
2262 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2263 if( nFreeList==0 || nFin==iLastPg ){
2264 return SQLITE_DONE;
2265 }
2266
2267 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2268 if( rc!=SQLITE_OK ){
2269 return rc;
2270 }
2271 if( eType==PTRMAP_ROOTPAGE ){
2272 return SQLITE_CORRUPT_BKPT;
2273 }
2274
2275 if( eType==PTRMAP_FREEPAGE ){
2276 if( nFin==0 ){
2277 /* Remove the page from the files free-list. This is not required
danielk19774ef24492007-05-23 09:52:41 +00002278 ** if nFin is non-zero. In that case, the free-list will be
danielk1977dddbcdc2007-04-26 14:42:34 +00002279 ** truncated to zero after this function returns, so it doesn't
2280 ** matter if it still contains some garbage entries.
2281 */
2282 Pgno iFreePg;
2283 MemPage *pFreePg;
2284 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2285 if( rc!=SQLITE_OK ){
2286 return rc;
2287 }
2288 assert( iFreePg==iLastPg );
2289 releasePage(pFreePg);
2290 }
2291 } else {
2292 Pgno iFreePg; /* Index of free page to move pLastPg to */
2293 MemPage *pLastPg;
2294
drh16a9b832007-05-05 18:39:25 +00002295 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002296 if( rc!=SQLITE_OK ){
2297 return rc;
2298 }
2299
danielk1977b4626a32007-04-28 15:47:43 +00002300 /* If nFin is zero, this loop runs exactly once and page pLastPg
2301 ** is swapped with the first free page pulled off the free list.
2302 **
2303 ** On the other hand, if nFin is greater than zero, then keep
2304 ** looping until a free-page located within the first nFin pages
2305 ** of the file is found.
2306 */
danielk1977dddbcdc2007-04-26 14:42:34 +00002307 do {
2308 MemPage *pFreePg;
2309 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2310 if( rc!=SQLITE_OK ){
2311 releasePage(pLastPg);
2312 return rc;
2313 }
2314 releasePage(pFreePg);
2315 }while( nFin!=0 && iFreePg>nFin );
2316 assert( iFreePg<iLastPg );
danielk1977b4626a32007-04-28 15:47:43 +00002317
2318 rc = sqlite3PagerWrite(pLastPg->pDbPage);
danielk1977662278e2007-11-05 15:30:12 +00002319 if( rc==SQLITE_OK ){
danielk19774c999992008-07-16 18:17:55 +00002320 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
danielk1977662278e2007-11-05 15:30:12 +00002321 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002322 releasePage(pLastPg);
2323 if( rc!=SQLITE_OK ){
2324 return rc;
danielk1977662278e2007-11-05 15:30:12 +00002325 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002326 }
2327 }
2328
2329 pBt->nTrunc = iLastPg - 1;
2330 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2331 pBt->nTrunc--;
2332 }
2333 return SQLITE_OK;
2334}
2335
2336/*
2337** A write-transaction must be opened before calling this function.
2338** It performs a single unit of work towards an incremental vacuum.
2339**
2340** If the incremental vacuum is finished after this function has run,
2341** SQLITE_DONE is returned. If it is not finished, but no error occured,
2342** SQLITE_OK is returned. Otherwise an SQLite error code.
2343*/
2344int sqlite3BtreeIncrVacuum(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00002345 int rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002346 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002347
2348 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002349 pBt->db = p->db;
danielk1977dddbcdc2007-04-26 14:42:34 +00002350 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2351 if( !pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00002352 rc = SQLITE_DONE;
2353 }else{
2354 invalidateAllOverflowCache(pBt);
2355 rc = incrVacuumStep(pBt, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002356 }
drhd677b3d2007-08-20 22:48:41 +00002357 sqlite3BtreeLeave(p);
2358 return rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002359}
2360
2361/*
danielk19773b8a05f2007-03-19 17:44:26 +00002362** This routine is called prior to sqlite3PagerCommit when a transaction
danielk1977687566d2004-11-02 12:56:41 +00002363** is commited for an auto-vacuum database.
danielk197724168722007-04-02 05:07:47 +00002364**
2365** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2366** the database file should be truncated to during the commit process.
2367** i.e. the database has been reorganized so that only the first *pnTrunc
2368** pages are in use.
danielk1977687566d2004-11-02 12:56:41 +00002369*/
danielk197724168722007-04-02 05:07:47 +00002370static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
danielk1977dddbcdc2007-04-26 14:42:34 +00002371 int rc = SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002372 Pager *pPager = pBt->pPager;
drhf94a1732008-09-30 17:18:17 +00002373 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002374
drh1fee73e2007-08-29 04:00:57 +00002375 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +00002376 invalidateAllOverflowCache(pBt);
danielk1977dddbcdc2007-04-26 14:42:34 +00002377 assert(pBt->autoVacuum);
2378 if( !pBt->incrVacuum ){
2379 Pgno nFin = 0;
danielk1977687566d2004-11-02 12:56:41 +00002380
danielk1977dddbcdc2007-04-26 14:42:34 +00002381 if( pBt->nTrunc==0 ){
2382 Pgno nFree;
2383 Pgno nPtrmap;
2384 const int pgsz = pBt->pageSize;
danielk1977ad0132d2008-06-07 08:58:22 +00002385 int nOrig = pagerPagecount(pBt->pPager);
danielk1977e5321f02007-04-27 07:05:44 +00002386
2387 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2388 return SQLITE_CORRUPT_BKPT;
2389 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002390 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2391 nOrig--;
danielk1977687566d2004-11-02 12:56:41 +00002392 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002393 nFree = get4byte(&pBt->pPage1->aData[36]);
2394 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2395 nFin = nOrig - nFree - nPtrmap;
2396 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2397 nFin--;
danielk1977ac11ee62005-01-15 12:45:51 +00002398 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002399 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2400 nFin--;
2401 }
2402 }
danielk1977687566d2004-11-02 12:56:41 +00002403
danielk1977dddbcdc2007-04-26 14:42:34 +00002404 while( rc==SQLITE_OK ){
2405 rc = incrVacuumStep(pBt, nFin);
2406 }
2407 if( rc==SQLITE_DONE ){
2408 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2409 rc = SQLITE_OK;
danielk19770ba32df2008-05-07 07:13:16 +00002410 if( pBt->nTrunc && nFin ){
drh67f80b62007-07-23 19:26:17 +00002411 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
danielk1977dddbcdc2007-04-26 14:42:34 +00002412 put4byte(&pBt->pPage1->aData[32], 0);
2413 put4byte(&pBt->pPage1->aData[36], 0);
2414 pBt->nTrunc = nFin;
2415 }
2416 }
2417 if( rc!=SQLITE_OK ){
2418 sqlite3PagerRollback(pPager);
2419 }
danielk1977687566d2004-11-02 12:56:41 +00002420 }
2421
danielk1977dddbcdc2007-04-26 14:42:34 +00002422 if( rc==SQLITE_OK ){
2423 *pnTrunc = pBt->nTrunc;
2424 pBt->nTrunc = 0;
2425 }
danielk19773b8a05f2007-03-19 17:44:26 +00002426 assert( nRef==sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002427 return rc;
2428}
danielk1977dddbcdc2007-04-26 14:42:34 +00002429
danielk1977687566d2004-11-02 12:56:41 +00002430#endif
2431
2432/*
drh80e35f42007-03-30 14:06:34 +00002433** This routine does the first phase of a two-phase commit. This routine
2434** causes a rollback journal to be created (if it does not already exist)
2435** and populated with enough information so that if a power loss occurs
2436** the database can be restored to its original state by playing back
2437** the journal. Then the contents of the journal are flushed out to
2438** the disk. After the journal is safely on oxide, the changes to the
2439** database are written into the database file and flushed to oxide.
2440** At the end of this call, the rollback journal still exists on the
2441** disk and we are still holding all locks, so the transaction has not
2442** committed. See sqlite3BtreeCommit() for the second phase of the
2443** commit process.
2444**
2445** This call is a no-op if no write-transaction is currently active on pBt.
2446**
2447** Otherwise, sync the database file for the btree pBt. zMaster points to
2448** the name of a master journal file that should be written into the
2449** individual journal file, or is NULL, indicating no master journal file
2450** (single database transaction).
2451**
2452** When this is called, the master journal should already have been
2453** created, populated with this journal pointer and synced to disk.
2454**
2455** Once this is routine has returned, the only thing required to commit
2456** the write-transaction for this database file is to delete the journal.
2457*/
2458int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2459 int rc = SQLITE_OK;
2460 if( p->inTrans==TRANS_WRITE ){
2461 BtShared *pBt = p->pBt;
2462 Pgno nTrunc = 0;
drhd677b3d2007-08-20 22:48:41 +00002463 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002464 pBt->db = p->db;
drh80e35f42007-03-30 14:06:34 +00002465#ifndef SQLITE_OMIT_AUTOVACUUM
2466 if( pBt->autoVacuum ){
2467 rc = autoVacuumCommit(pBt, &nTrunc);
2468 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002469 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002470 return rc;
2471 }
2472 }
2473#endif
danielk1977f653d782008-03-20 11:04:21 +00002474 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
drhd677b3d2007-08-20 22:48:41 +00002475 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002476 }
2477 return rc;
2478}
2479
2480/*
drh2aa679f2001-06-25 02:11:07 +00002481** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00002482**
drh6e345992007-03-30 11:12:08 +00002483** This routine implements the second phase of a 2-phase commit. The
2484** sqlite3BtreeSync() routine does the first phase and should be invoked
2485** prior to calling this routine. The sqlite3BtreeSync() routine did
2486** all the work of writing information out to disk and flushing the
2487** contents so that they are written onto the disk platter. All this
2488** routine has to do is delete or truncate the rollback journal
2489** (which causes the transaction to commit) and drop locks.
2490**
drh5e00f6c2001-09-13 13:46:56 +00002491** This will release the write lock on the database file. If there
2492** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002493*/
drh80e35f42007-03-30 14:06:34 +00002494int sqlite3BtreeCommitPhaseTwo(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00002495 BtShared *pBt = p->pBt;
2496
drhd677b3d2007-08-20 22:48:41 +00002497 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002498 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002499 btreeIntegrity(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002500
2501 /* If the handle has a write-transaction open, commit the shared-btrees
2502 ** transaction and set the shared state to TRANS_READ.
2503 */
2504 if( p->inTrans==TRANS_WRITE ){
danielk19777f7bc662006-01-23 13:47:47 +00002505 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002506 assert( pBt->inTransaction==TRANS_WRITE );
2507 assert( pBt->nTransaction>0 );
drh80e35f42007-03-30 14:06:34 +00002508 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
danielk19777f7bc662006-01-23 13:47:47 +00002509 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002510 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002511 return rc;
2512 }
danielk1977aef0bf62005-12-30 16:28:01 +00002513 pBt->inTransaction = TRANS_READ;
2514 pBt->inStmt = 0;
danielk1977ee5741e2004-05-31 10:01:34 +00002515 }
danielk19777f7bc662006-01-23 13:47:47 +00002516 unlockAllTables(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002517
2518 /* If the handle has any kind of transaction open, decrement the transaction
2519 ** count of the shared btree. If the transaction count reaches 0, set
2520 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2521 ** will unlock the pager.
2522 */
2523 if( p->inTrans!=TRANS_NONE ){
2524 pBt->nTransaction--;
2525 if( 0==pBt->nTransaction ){
2526 pBt->inTransaction = TRANS_NONE;
2527 }
2528 }
2529
2530 /* Set the handles current transaction state to TRANS_NONE and unlock
2531 ** the pager if this call closed the only read or write transaction.
2532 */
2533 p->inTrans = TRANS_NONE;
drh5e00f6c2001-09-13 13:46:56 +00002534 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002535
2536 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002537 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002538 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002539}
2540
drh80e35f42007-03-30 14:06:34 +00002541/*
2542** Do both phases of a commit.
2543*/
2544int sqlite3BtreeCommit(Btree *p){
2545 int rc;
drhd677b3d2007-08-20 22:48:41 +00002546 sqlite3BtreeEnter(p);
drh80e35f42007-03-30 14:06:34 +00002547 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2548 if( rc==SQLITE_OK ){
2549 rc = sqlite3BtreeCommitPhaseTwo(p);
2550 }
drhd677b3d2007-08-20 22:48:41 +00002551 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002552 return rc;
2553}
2554
danielk1977fbcd5852004-06-15 02:44:18 +00002555#ifndef NDEBUG
2556/*
2557** Return the number of write-cursors open on this handle. This is for use
2558** in assert() expressions, so it is only compiled if NDEBUG is not
2559** defined.
drhfb982642007-08-30 01:19:59 +00002560**
2561** For the purposes of this routine, a write-cursor is any cursor that
2562** is capable of writing to the databse. That means the cursor was
2563** originally opened for writing and the cursor has not be disabled
2564** by having its state changed to CURSOR_FAULT.
danielk1977fbcd5852004-06-15 02:44:18 +00002565*/
danielk1977aef0bf62005-12-30 16:28:01 +00002566static int countWriteCursors(BtShared *pBt){
danielk1977fbcd5852004-06-15 02:44:18 +00002567 BtCursor *pCur;
2568 int r = 0;
2569 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drhfb982642007-08-30 01:19:59 +00002570 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
danielk1977fbcd5852004-06-15 02:44:18 +00002571 }
2572 return r;
2573}
2574#endif
2575
drhc39e0002004-05-07 23:50:57 +00002576/*
drhfb982642007-08-30 01:19:59 +00002577** This routine sets the state to CURSOR_FAULT and the error
2578** code to errCode for every cursor on BtShared that pBtree
2579** references.
2580**
2581** Every cursor is tripped, including cursors that belong
2582** to other database connections that happen to be sharing
2583** the cache with pBtree.
2584**
2585** This routine gets called when a rollback occurs.
2586** All cursors using the same cache must be tripped
2587** to prevent them from trying to use the btree after
2588** the rollback. The rollback may have deleted tables
2589** or moved root pages, so it is not sufficient to
2590** save the state of the cursor. The cursor must be
2591** invalidated.
2592*/
2593void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2594 BtCursor *p;
2595 sqlite3BtreeEnter(pBtree);
2596 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
danielk1977be51a652008-10-08 17:58:48 +00002597 sqlite3BtreeClearCursor(p);
drhfb982642007-08-30 01:19:59 +00002598 p->eState = CURSOR_FAULT;
2599 p->skip = errCode;
2600 }
2601 sqlite3BtreeLeave(pBtree);
2602}
2603
2604/*
drhecdc7532001-09-23 02:35:53 +00002605** Rollback the transaction in progress. All cursors will be
2606** invalided by this operation. Any attempt to use a cursor
2607** that was open at the beginning of this operation will result
2608** in an error.
drh5e00f6c2001-09-13 13:46:56 +00002609**
2610** This will release the write lock on the database file. If there
2611** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002612*/
danielk1977aef0bf62005-12-30 16:28:01 +00002613int sqlite3BtreeRollback(Btree *p){
danielk19778d34dfd2006-01-24 16:37:57 +00002614 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002615 BtShared *pBt = p->pBt;
drh24cd67e2004-05-10 16:18:47 +00002616 MemPage *pPage1;
danielk1977aef0bf62005-12-30 16:28:01 +00002617
drhd677b3d2007-08-20 22:48:41 +00002618 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002619 pBt->db = p->db;
danielk19772b8c13e2006-01-24 14:21:24 +00002620 rc = saveAllCursors(pBt, 0, 0);
danielk19778d34dfd2006-01-24 16:37:57 +00002621#ifndef SQLITE_OMIT_SHARED_CACHE
danielk19772b8c13e2006-01-24 14:21:24 +00002622 if( rc!=SQLITE_OK ){
danielk19778d34dfd2006-01-24 16:37:57 +00002623 /* This is a horrible situation. An IO or malloc() error occured whilst
2624 ** trying to save cursor positions. If this is an automatic rollback (as
2625 ** the result of a constraint, malloc() failure or IO error) then
2626 ** the cache may be internally inconsistent (not contain valid trees) so
2627 ** we cannot simply return the error to the caller. Instead, abort
2628 ** all queries that may be using any of the cursors that failed to save.
2629 */
drhfb982642007-08-30 01:19:59 +00002630 sqlite3BtreeTripAllCursors(p, rc);
danielk19772b8c13e2006-01-24 14:21:24 +00002631 }
danielk19778d34dfd2006-01-24 16:37:57 +00002632#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002633 btreeIntegrity(p);
2634 unlockAllTables(p);
2635
2636 if( p->inTrans==TRANS_WRITE ){
danielk19778d34dfd2006-01-24 16:37:57 +00002637 int rc2;
danielk1977aef0bf62005-12-30 16:28:01 +00002638
danielk1977dddbcdc2007-04-26 14:42:34 +00002639#ifndef SQLITE_OMIT_AUTOVACUUM
2640 pBt->nTrunc = 0;
2641#endif
2642
danielk19778d34dfd2006-01-24 16:37:57 +00002643 assert( TRANS_WRITE==pBt->inTransaction );
danielk19773b8a05f2007-03-19 17:44:26 +00002644 rc2 = sqlite3PagerRollback(pBt->pPager);
danielk19778d34dfd2006-01-24 16:37:57 +00002645 if( rc2!=SQLITE_OK ){
2646 rc = rc2;
2647 }
2648
drh24cd67e2004-05-10 16:18:47 +00002649 /* The rollback may have destroyed the pPage1->aData value. So
drh16a9b832007-05-05 18:39:25 +00002650 ** call sqlite3BtreeGetPage() on page 1 again to make
2651 ** sure pPage1->aData is set correctly. */
2652 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
drh24cd67e2004-05-10 16:18:47 +00002653 releasePage(pPage1);
2654 }
danielk1977fbcd5852004-06-15 02:44:18 +00002655 assert( countWriteCursors(pBt)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00002656 pBt->inTransaction = TRANS_READ;
drh24cd67e2004-05-10 16:18:47 +00002657 }
danielk1977aef0bf62005-12-30 16:28:01 +00002658
2659 if( p->inTrans!=TRANS_NONE ){
2660 assert( pBt->nTransaction>0 );
2661 pBt->nTransaction--;
2662 if( 0==pBt->nTransaction ){
2663 pBt->inTransaction = TRANS_NONE;
2664 }
2665 }
2666
2667 p->inTrans = TRANS_NONE;
danielk1977ee5741e2004-05-31 10:01:34 +00002668 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00002669 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002670
2671 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002672 sqlite3BtreeLeave(p);
drha059ad02001-04-17 20:09:11 +00002673 return rc;
2674}
2675
2676/*
drhab01f612004-05-22 02:55:23 +00002677** Start a statement subtransaction. The subtransaction can
2678** can be rolled back independently of the main transaction.
2679** You must start a transaction before starting a subtransaction.
2680** The subtransaction is ended automatically if the main transaction
drh663fc632002-02-02 18:49:19 +00002681** commits or rolls back.
2682**
drhab01f612004-05-22 02:55:23 +00002683** Only one subtransaction may be active at a time. It is an error to try
2684** to start a new subtransaction if another subtransaction is already active.
2685**
2686** Statement subtransactions are used around individual SQL statements
2687** that are contained within a BEGIN...COMMIT block. If a constraint
2688** error occurs within the statement, the effect of that one statement
2689** can be rolled back without having to rollback the entire transaction.
drh663fc632002-02-02 18:49:19 +00002690*/
danielk1977aef0bf62005-12-30 16:28:01 +00002691int sqlite3BtreeBeginStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002692 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002693 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002694 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002695 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002696 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
drhd677b3d2007-08-20 22:48:41 +00002697 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2698 }else{
2699 assert( pBt->inTransaction==TRANS_WRITE );
2700 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2701 pBt->inStmt = 1;
drh0d65dc02002-02-03 00:56:09 +00002702 }
drhd677b3d2007-08-20 22:48:41 +00002703 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002704 return rc;
2705}
2706
2707
2708/*
drhab01f612004-05-22 02:55:23 +00002709** Commit the statment subtransaction currently in progress. If no
2710** subtransaction is active, this is a no-op.
drh663fc632002-02-02 18:49:19 +00002711*/
danielk1977aef0bf62005-12-30 16:28:01 +00002712int sqlite3BtreeCommitStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002713 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002714 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002715 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002716 pBt->db = p->db;
drh3aac2dd2004-04-26 14:10:20 +00002717 if( pBt->inStmt && !pBt->readOnly ){
danielk19773b8a05f2007-03-19 17:44:26 +00002718 rc = sqlite3PagerStmtCommit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00002719 }else{
2720 rc = SQLITE_OK;
2721 }
drh3aac2dd2004-04-26 14:10:20 +00002722 pBt->inStmt = 0;
drhd677b3d2007-08-20 22:48:41 +00002723 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002724 return rc;
2725}
2726
2727/*
drhab01f612004-05-22 02:55:23 +00002728** Rollback the active statement subtransaction. If no subtransaction
2729** is active this routine is a no-op.
drh663fc632002-02-02 18:49:19 +00002730**
drhab01f612004-05-22 02:55:23 +00002731** All cursors will be invalidated by this operation. Any attempt
drh663fc632002-02-02 18:49:19 +00002732** to use a cursor that was open at the beginning of this operation
2733** will result in an error.
2734*/
danielk1977aef0bf62005-12-30 16:28:01 +00002735int sqlite3BtreeRollbackStmt(Btree *p){
danielk197797a227c2006-01-20 16:32:04 +00002736 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002737 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002738 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002739 pBt->db = p->db;
danielk197797a227c2006-01-20 16:32:04 +00002740 if( pBt->inStmt && !pBt->readOnly ){
danielk19773b8a05f2007-03-19 17:44:26 +00002741 rc = sqlite3PagerStmtRollback(pBt->pPager);
danielk197797a227c2006-01-20 16:32:04 +00002742 pBt->inStmt = 0;
2743 }
drhd677b3d2007-08-20 22:48:41 +00002744 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002745 return rc;
2746}
2747
2748/*
drh8b2f49b2001-06-08 00:21:52 +00002749** Create a new cursor for the BTree whose root is on the page
2750** iTable. The act of acquiring a cursor gets a read lock on
2751** the database file.
drh1bee3d72001-10-15 00:44:35 +00002752**
2753** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00002754** If wrFlag==1, then the cursor can be used for reading or for
2755** writing if other conditions for writing are also met. These
2756** are the conditions that must be met in order for writing to
2757** be allowed:
drh6446c4d2001-12-15 14:22:18 +00002758**
drhf74b8d92002-09-01 23:20:45 +00002759** 1: The cursor must have been opened with wrFlag==1
2760**
drhfe5d71d2007-03-19 11:54:10 +00002761** 2: Other database connections that share the same pager cache
2762** but which are not in the READ_UNCOMMITTED state may not have
2763** cursors open with wrFlag==0 on the same table. Otherwise
2764** the changes made by this write cursor would be visible to
2765** the read cursors in the other database connection.
drhf74b8d92002-09-01 23:20:45 +00002766**
2767** 3: The database must be writable (not on read-only media)
2768**
2769** 4: There must be an active transaction.
2770**
drh6446c4d2001-12-15 14:22:18 +00002771** No checking is done to make sure that page iTable really is the
2772** root page of a b-tree. If it is not, then the cursor acquired
2773** will not work correctly.
danielk197771d5d2c2008-09-29 11:49:47 +00002774**
2775** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
2776** pointed to by pCur have been zeroed by the caller.
drha059ad02001-04-17 20:09:11 +00002777*/
drhd677b3d2007-08-20 22:48:41 +00002778static int btreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002779 Btree *p, /* The btree */
2780 int iTable, /* Root page of table to open */
2781 int wrFlag, /* 1 to write. 0 read-only */
2782 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2783 BtCursor *pCur /* Space for new cursor */
drh3aac2dd2004-04-26 14:10:20 +00002784){
drha059ad02001-04-17 20:09:11 +00002785 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002786 BtShared *pBt = p->pBt;
drhecdc7532001-09-23 02:35:53 +00002787
drh1fee73e2007-08-29 04:00:57 +00002788 assert( sqlite3BtreeHoldsMutex(p) );
drh8dcd7ca2004-08-08 19:43:29 +00002789 if( wrFlag ){
drh8dcd7ca2004-08-08 19:43:29 +00002790 if( pBt->readOnly ){
2791 return SQLITE_READONLY;
2792 }
danielk19773588ceb2008-06-10 17:30:26 +00002793 if( checkReadLocks(p, iTable, 0, 0) ){
drh8dcd7ca2004-08-08 19:43:29 +00002794 return SQLITE_LOCKED;
2795 }
drha0c9a112004-03-10 13:42:37 +00002796 }
danielk1977aef0bf62005-12-30 16:28:01 +00002797
drh4b70f112004-05-02 21:12:19 +00002798 if( pBt->pPage1==0 ){
danielk1977aef0bf62005-12-30 16:28:01 +00002799 rc = lockBtreeWithRetry(p);
drha059ad02001-04-17 20:09:11 +00002800 if( rc!=SQLITE_OK ){
drha059ad02001-04-17 20:09:11 +00002801 return rc;
2802 }
drh1831f182007-04-24 17:35:59 +00002803 if( pBt->readOnly && wrFlag ){
2804 return SQLITE_READONLY;
2805 }
drha059ad02001-04-17 20:09:11 +00002806 }
drh8b2f49b2001-06-08 00:21:52 +00002807 pCur->pgnoRoot = (Pgno)iTable;
danielk1977ad0132d2008-06-07 08:58:22 +00002808 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
drh24cd67e2004-05-10 16:18:47 +00002809 rc = SQLITE_EMPTY;
2810 goto create_cursor_exception;
2811 }
danielk197771d5d2c2008-09-29 11:49:47 +00002812 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
drhbd03cae2001-06-02 02:40:57 +00002813 if( rc!=SQLITE_OK ){
2814 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00002815 }
danielk1977aef0bf62005-12-30 16:28:01 +00002816
danielk1977aef0bf62005-12-30 16:28:01 +00002817 /* Now that no other errors can occur, finish filling in the BtCursor
2818 ** variables, link the cursor into the BtShared list and set *ppCur (the
2819 ** output argument to this function).
2820 */
drh1e968a02008-03-25 00:22:21 +00002821 pCur->pKeyInfo = pKeyInfo;
danielk1977aef0bf62005-12-30 16:28:01 +00002822 pCur->pBtree = p;
drhd0679ed2007-08-28 22:24:34 +00002823 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00002824 pCur->wrFlag = wrFlag;
drha059ad02001-04-17 20:09:11 +00002825 pCur->pNext = pBt->pCursor;
2826 if( pCur->pNext ){
2827 pCur->pNext->pPrev = pCur;
2828 }
2829 pBt->pCursor = pCur;
danielk1977da184232006-01-05 11:34:32 +00002830 pCur->eState = CURSOR_INVALID;
drhbd03cae2001-06-02 02:40:57 +00002831
danielk1977aef0bf62005-12-30 16:28:01 +00002832 return SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00002833
drhbd03cae2001-06-02 02:40:57 +00002834create_cursor_exception:
danielk197771d5d2c2008-09-29 11:49:47 +00002835 releasePage(pCur->apPage[0]);
drh5e00f6c2001-09-13 13:46:56 +00002836 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00002837 return rc;
drha059ad02001-04-17 20:09:11 +00002838}
drhd677b3d2007-08-20 22:48:41 +00002839int sqlite3BtreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002840 Btree *p, /* The btree */
2841 int iTable, /* Root page of table to open */
2842 int wrFlag, /* 1 to write. 0 read-only */
2843 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2844 BtCursor *pCur /* Write new cursor here */
drhd677b3d2007-08-20 22:48:41 +00002845){
2846 int rc;
2847 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002848 p->pBt->db = p->db;
danielk1977cd3e8f72008-03-25 09:47:35 +00002849 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
drhd677b3d2007-08-20 22:48:41 +00002850 sqlite3BtreeLeave(p);
2851 return rc;
2852}
danielk1977cd3e8f72008-03-25 09:47:35 +00002853int sqlite3BtreeCursorSize(){
2854 return sizeof(BtCursor);
2855}
2856
drhd677b3d2007-08-20 22:48:41 +00002857
drha059ad02001-04-17 20:09:11 +00002858
2859/*
drh5e00f6c2001-09-13 13:46:56 +00002860** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00002861** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00002862*/
drh3aac2dd2004-04-26 14:10:20 +00002863int sqlite3BtreeCloseCursor(BtCursor *pCur){
drhff0587c2007-08-29 17:43:19 +00002864 Btree *pBtree = pCur->pBtree;
danielk1977cd3e8f72008-03-25 09:47:35 +00002865 if( pBtree ){
danielk197771d5d2c2008-09-29 11:49:47 +00002866 int i;
danielk1977cd3e8f72008-03-25 09:47:35 +00002867 BtShared *pBt = pCur->pBt;
2868 sqlite3BtreeEnter(pBtree);
2869 pBt->db = pBtree->db;
danielk1977be51a652008-10-08 17:58:48 +00002870 sqlite3BtreeClearCursor(pCur);
danielk1977cd3e8f72008-03-25 09:47:35 +00002871 if( pCur->pPrev ){
2872 pCur->pPrev->pNext = pCur->pNext;
2873 }else{
2874 pBt->pCursor = pCur->pNext;
2875 }
2876 if( pCur->pNext ){
2877 pCur->pNext->pPrev = pCur->pPrev;
2878 }
danielk197771d5d2c2008-09-29 11:49:47 +00002879 for(i=0; i<=pCur->iPage; i++){
2880 releasePage(pCur->apPage[i]);
2881 }
danielk1977cd3e8f72008-03-25 09:47:35 +00002882 unlockBtreeIfUnused(pBt);
2883 invalidateOverflowCache(pCur);
2884 /* sqlite3_free(pCur); */
2885 sqlite3BtreeLeave(pBtree);
drha059ad02001-04-17 20:09:11 +00002886 }
drh8c42ca92001-06-22 19:15:00 +00002887 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002888}
2889
drh7e3b0a02001-04-28 16:52:40 +00002890/*
drh5e2f8b92001-05-28 00:41:15 +00002891** Make a temporary cursor by filling in the fields of pTempCur.
2892** The temporary cursor is not on the cursor list for the Btree.
2893*/
drh16a9b832007-05-05 18:39:25 +00002894void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002895 int i;
drh1fee73e2007-08-29 04:00:57 +00002896 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002897 memcpy(pTempCur, pCur, sizeof(BtCursor));
drh5e2f8b92001-05-28 00:41:15 +00002898 pTempCur->pNext = 0;
2899 pTempCur->pPrev = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00002900 for(i=0; i<=pTempCur->iPage; i++){
2901 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002902 }
drh5e2f8b92001-05-28 00:41:15 +00002903}
2904
2905/*
drhbd03cae2001-06-02 02:40:57 +00002906** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00002907** function above.
2908*/
drh16a9b832007-05-05 18:39:25 +00002909void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002910 int i;
drh1fee73e2007-08-29 04:00:57 +00002911 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002912 for(i=0; i<=pCur->iPage; i++){
2913 sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002914 }
drh5e2f8b92001-05-28 00:41:15 +00002915}
2916
2917/*
drh86057612007-06-26 01:04:48 +00002918** Make sure the BtCursor* given in the argument has a valid
2919** BtCursor.info structure. If it is not already valid, call
danielk19771cc5ed82007-05-16 17:28:43 +00002920** sqlite3BtreeParseCell() to fill it in.
drhab01f612004-05-22 02:55:23 +00002921**
2922** BtCursor.info is a cache of the information in the current cell.
drh16a9b832007-05-05 18:39:25 +00002923** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
drh86057612007-06-26 01:04:48 +00002924**
2925** 2007-06-25: There is a bug in some versions of MSVC that cause the
2926** compiler to crash when getCellInfo() is implemented as a macro.
2927** But there is a measureable speed advantage to using the macro on gcc
2928** (when less compiler optimizations like -Os or -O0 are used and the
2929** compiler is not doing agressive inlining.) So we use a real function
2930** for MSVC and a macro for everything else. Ticket #2457.
drh9188b382004-05-14 21:12:22 +00002931*/
drh9188b382004-05-14 21:12:22 +00002932#ifndef NDEBUG
danielk19771cc5ed82007-05-16 17:28:43 +00002933 static void assertCellInfo(BtCursor *pCur){
drh9188b382004-05-14 21:12:22 +00002934 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00002935 int iPage = pCur->iPage;
drh51c6d962004-06-06 00:42:25 +00002936 memset(&info, 0, sizeof(info));
danielk197771d5d2c2008-09-29 11:49:47 +00002937 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
drh9188b382004-05-14 21:12:22 +00002938 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
drh9188b382004-05-14 21:12:22 +00002939 }
danielk19771cc5ed82007-05-16 17:28:43 +00002940#else
2941 #define assertCellInfo(x)
2942#endif
drh86057612007-06-26 01:04:48 +00002943#ifdef _MSC_VER
2944 /* Use a real function in MSVC to work around bugs in that compiler. */
2945 static void getCellInfo(BtCursor *pCur){
2946 if( pCur->info.nSize==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00002947 int iPage = pCur->iPage;
2948 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
drha2c20e42008-03-29 16:01:04 +00002949 pCur->validNKey = 1;
drh86057612007-06-26 01:04:48 +00002950 }else{
2951 assertCellInfo(pCur);
2952 }
2953 }
2954#else /* if not _MSC_VER */
2955 /* Use a macro in all other compilers so that the function is inlined */
danielk197771d5d2c2008-09-29 11:49:47 +00002956#define getCellInfo(pCur) \
2957 if( pCur->info.nSize==0 ){ \
2958 int iPage = pCur->iPage; \
2959 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
2960 pCur->validNKey = 1; \
2961 }else{ \
2962 assertCellInfo(pCur); \
drh86057612007-06-26 01:04:48 +00002963 }
2964#endif /* _MSC_VER */
drh9188b382004-05-14 21:12:22 +00002965
2966/*
drh3aac2dd2004-04-26 14:10:20 +00002967** Set *pSize to the size of the buffer needed to hold the value of
2968** the key for the current entry. If the cursor is not pointing
2969** to a valid entry, *pSize is set to 0.
2970**
drh4b70f112004-05-02 21:12:19 +00002971** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00002972** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00002973*/
drh4a1c3802004-05-12 15:15:47 +00002974int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
drhd677b3d2007-08-20 22:48:41 +00002975 int rc;
2976
drh1fee73e2007-08-29 04:00:57 +00002977 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00002978 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00002979 if( rc==SQLITE_OK ){
2980 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2981 if( pCur->eState==CURSOR_INVALID ){
2982 *pSize = 0;
2983 }else{
drh86057612007-06-26 01:04:48 +00002984 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00002985 *pSize = pCur->info.nKey;
2986 }
drh72f82862001-05-24 21:06:34 +00002987 }
danielk1977da184232006-01-05 11:34:32 +00002988 return rc;
drha059ad02001-04-17 20:09:11 +00002989}
drh2af926b2001-05-15 00:39:25 +00002990
drh72f82862001-05-24 21:06:34 +00002991/*
drh0e1c19e2004-05-11 00:58:56 +00002992** Set *pSize to the number of bytes of data in the entry the
2993** cursor currently points to. Always return SQLITE_OK.
2994** Failure is not possible. If the cursor is not currently
2995** pointing to an entry (which can happen, for example, if
2996** the database is empty) then *pSize is set to 0.
2997*/
2998int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drhd677b3d2007-08-20 22:48:41 +00002999 int rc;
3000
drh1fee73e2007-08-29 04:00:57 +00003001 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003002 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003003 if( rc==SQLITE_OK ){
3004 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3005 if( pCur->eState==CURSOR_INVALID ){
3006 /* Not pointing at a valid entry - set *pSize to 0. */
3007 *pSize = 0;
3008 }else{
drh86057612007-06-26 01:04:48 +00003009 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00003010 *pSize = pCur->info.nData;
3011 }
drh0e1c19e2004-05-11 00:58:56 +00003012 }
danielk1977da184232006-01-05 11:34:32 +00003013 return rc;
drh0e1c19e2004-05-11 00:58:56 +00003014}
3015
3016/*
danielk1977d04417962007-05-02 13:16:30 +00003017** Given the page number of an overflow page in the database (parameter
3018** ovfl), this function finds the page number of the next page in the
3019** linked list of overflow pages. If possible, it uses the auto-vacuum
3020** pointer-map data instead of reading the content of page ovfl to do so.
3021**
3022** If an error occurs an SQLite error code is returned. Otherwise:
3023**
3024** Unless pPgnoNext is NULL, the page number of the next overflow
3025** page in the linked list is written to *pPgnoNext. If page ovfl
drh85b623f2007-12-13 21:54:09 +00003026** is the last page in its linked list, *pPgnoNext is set to zero.
danielk1977d04417962007-05-02 13:16:30 +00003027**
3028** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3029** for page ovfl. The underlying pager page may have been requested
3030** with the noContent flag set, so the page data accessable via
3031** this handle may not be trusted.
3032*/
3033static int getOverflowPage(
3034 BtShared *pBt,
3035 Pgno ovfl, /* Overflow page */
3036 MemPage **ppPage, /* OUT: MemPage handle */
3037 Pgno *pPgnoNext /* OUT: Next overflow page number */
3038){
3039 Pgno next = 0;
3040 int rc;
3041
drh1fee73e2007-08-29 04:00:57 +00003042 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977d04417962007-05-02 13:16:30 +00003043 /* One of these must not be NULL. Otherwise, why call this function? */
3044 assert(ppPage || pPgnoNext);
3045
3046 /* If pPgnoNext is NULL, then this function is being called to obtain
3047 ** a MemPage* reference only. No page-data is required in this case.
3048 */
3049 if( !pPgnoNext ){
drh16a9b832007-05-05 18:39:25 +00003050 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
danielk1977d04417962007-05-02 13:16:30 +00003051 }
3052
3053#ifndef SQLITE_OMIT_AUTOVACUUM
3054 /* Try to find the next page in the overflow list using the
3055 ** autovacuum pointer-map pages. Guess that the next page in
3056 ** the overflow list is page number (ovfl+1). If that guess turns
3057 ** out to be wrong, fall back to loading the data of page
3058 ** number ovfl to determine the next page number.
3059 */
3060 if( pBt->autoVacuum ){
3061 Pgno pgno;
3062 Pgno iGuess = ovfl+1;
3063 u8 eType;
3064
3065 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3066 iGuess++;
3067 }
3068
danielk1977ad0132d2008-06-07 08:58:22 +00003069 if( iGuess<=pagerPagecount(pBt->pPager) ){
danielk1977d04417962007-05-02 13:16:30 +00003070 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3071 if( rc!=SQLITE_OK ){
3072 return rc;
3073 }
3074 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3075 next = iGuess;
3076 }
3077 }
3078 }
3079#endif
3080
3081 if( next==0 || ppPage ){
3082 MemPage *pPage = 0;
3083
drh16a9b832007-05-05 18:39:25 +00003084 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
danielk1977d04417962007-05-02 13:16:30 +00003085 assert(rc==SQLITE_OK || pPage==0);
3086 if( next==0 && rc==SQLITE_OK ){
3087 next = get4byte(pPage->aData);
3088 }
3089
3090 if( ppPage ){
3091 *ppPage = pPage;
3092 }else{
3093 releasePage(pPage);
3094 }
3095 }
3096 *pPgnoNext = next;
3097
3098 return rc;
3099}
3100
danielk1977da107192007-05-04 08:32:13 +00003101/*
3102** Copy data from a buffer to a page, or from a page to a buffer.
3103**
3104** pPayload is a pointer to data stored on database page pDbPage.
3105** If argument eOp is false, then nByte bytes of data are copied
3106** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3107** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3108** of data are copied from the buffer pBuf to pPayload.
3109**
3110** SQLITE_OK is returned on success, otherwise an error code.
3111*/
3112static int copyPayload(
3113 void *pPayload, /* Pointer to page data */
3114 void *pBuf, /* Pointer to buffer */
3115 int nByte, /* Number of bytes to copy */
3116 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3117 DbPage *pDbPage /* Page containing pPayload */
3118){
3119 if( eOp ){
3120 /* Copy data from buffer to page (a write operation) */
3121 int rc = sqlite3PagerWrite(pDbPage);
3122 if( rc!=SQLITE_OK ){
3123 return rc;
3124 }
3125 memcpy(pPayload, pBuf, nByte);
3126 }else{
3127 /* Copy data from page to buffer (a read operation) */
3128 memcpy(pBuf, pPayload, nByte);
3129 }
3130 return SQLITE_OK;
3131}
danielk1977d04417962007-05-02 13:16:30 +00003132
3133/*
danielk19779f8d6402007-05-02 17:48:45 +00003134** This function is used to read or overwrite payload information
3135** for the entry that the pCur cursor is pointing to. If the eOp
3136** parameter is 0, this is a read operation (data copied into
3137** buffer pBuf). If it is non-zero, a write (data copied from
3138** buffer pBuf).
3139**
3140** A total of "amt" bytes are read or written beginning at "offset".
3141** Data is read to or from the buffer pBuf.
drh72f82862001-05-24 21:06:34 +00003142**
3143** This routine does not make a distinction between key and data.
danielk19779f8d6402007-05-02 17:48:45 +00003144** It just reads or writes bytes from the payload area. Data might
3145** appear on the main page or be scattered out on multiple overflow
3146** pages.
danielk1977da107192007-05-04 08:32:13 +00003147**
danielk1977dcbb5d32007-05-04 18:36:44 +00003148** If the BtCursor.isIncrblobHandle flag is set, and the current
danielk1977da107192007-05-04 08:32:13 +00003149** cursor entry uses one or more overflow pages, this function
3150** allocates space for and lazily popluates the overflow page-list
3151** cache array (BtCursor.aOverflow). Subsequent calls use this
3152** cache to make seeking to the supplied offset more efficient.
3153**
3154** Once an overflow page-list cache has been allocated, it may be
3155** invalidated if some other cursor writes to the same table, or if
3156** the cursor is moved to a different row. Additionally, in auto-vacuum
3157** mode, the following events may invalidate an overflow page-list cache.
3158**
3159** * An incremental vacuum,
3160** * A commit in auto_vacuum="full" mode,
3161** * Creating a table (may require moving an overflow page).
drh72f82862001-05-24 21:06:34 +00003162*/
danielk19779f8d6402007-05-02 17:48:45 +00003163static int accessPayload(
drh3aac2dd2004-04-26 14:10:20 +00003164 BtCursor *pCur, /* Cursor pointing to entry to read from */
3165 int offset, /* Begin reading this far into payload */
3166 int amt, /* Read this many bytes */
3167 unsigned char *pBuf, /* Write the bytes into this buffer */
danielk19779f8d6402007-05-02 17:48:45 +00003168 int skipKey, /* offset begins at data if this is true */
3169 int eOp /* zero to read. non-zero to write. */
drh3aac2dd2004-04-26 14:10:20 +00003170){
3171 unsigned char *aPayload;
danielk1977da107192007-05-04 08:32:13 +00003172 int rc = SQLITE_OK;
drhfa1a98a2004-05-14 19:08:17 +00003173 u32 nKey;
danielk19772dec9702007-05-02 16:48:37 +00003174 int iIdx = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003175 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3176 BtShared *pBt; /* Btree this cursor belongs to */
drh3aac2dd2004-04-26 14:10:20 +00003177
danielk1977da107192007-05-04 08:32:13 +00003178 assert( pPage );
danielk1977da184232006-01-05 11:34:32 +00003179 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003180 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
danielk1977da107192007-05-04 08:32:13 +00003181 assert( offset>=0 );
drh1fee73e2007-08-29 04:00:57 +00003182 assert( cursorHoldsMutex(pCur) );
danielk1977da107192007-05-04 08:32:13 +00003183
drh86057612007-06-26 01:04:48 +00003184 getCellInfo(pCur);
drh366fda62006-01-13 02:35:09 +00003185 aPayload = pCur->info.pCell + pCur->info.nHeader;
danielk1977da107192007-05-04 08:32:13 +00003186 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3187
drh3aac2dd2004-04-26 14:10:20 +00003188 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003189 offset += nKey;
drh3aac2dd2004-04-26 14:10:20 +00003190 }
drhfa1a98a2004-05-14 19:08:17 +00003191 if( offset+amt > nKey+pCur->info.nData ){
danielk1977da107192007-05-04 08:32:13 +00003192 /* Trying to read or write past the end of the data is an error */
danielk197767fd7a92008-09-10 17:53:35 +00003193 return SQLITE_CORRUPT_BKPT;
drh3aac2dd2004-04-26 14:10:20 +00003194 }
danielk1977da107192007-05-04 08:32:13 +00003195
3196 /* Check if data must be read/written to/from the btree page itself. */
drhfa1a98a2004-05-14 19:08:17 +00003197 if( offset<pCur->info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00003198 int a = amt;
drhfa1a98a2004-05-14 19:08:17 +00003199 if( a+offset>pCur->info.nLocal ){
3200 a = pCur->info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00003201 }
danielk1977da107192007-05-04 08:32:13 +00003202 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
drh2aa679f2001-06-25 02:11:07 +00003203 offset = 0;
drha34b6762004-05-07 13:30:42 +00003204 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00003205 amt -= a;
drhdd793422001-06-28 01:54:48 +00003206 }else{
drhfa1a98a2004-05-14 19:08:17 +00003207 offset -= pCur->info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00003208 }
danielk1977da107192007-05-04 08:32:13 +00003209
drh51f015e2007-10-16 19:45:29 +00003210 pBt = pCur->pBt;
danielk1977da107192007-05-04 08:32:13 +00003211 if( rc==SQLITE_OK && amt>0 ){
3212 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3213 Pgno nextPage;
3214
drhfa1a98a2004-05-14 19:08:17 +00003215 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
danielk1977da107192007-05-04 08:32:13 +00003216
danielk19772dec9702007-05-02 16:48:37 +00003217#ifndef SQLITE_OMIT_INCRBLOB
danielk1977dcbb5d32007-05-04 18:36:44 +00003218 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
danielk1977da107192007-05-04 08:32:13 +00003219 ** has not been allocated, allocate it now. The array is sized at
3220 ** one entry for each overflow page in the overflow chain. The
3221 ** page number of the first overflow page is stored in aOverflow[0],
3222 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3223 ** (the cache is lazily populated).
3224 */
danielk1977dcbb5d32007-05-04 18:36:44 +00003225 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
danielk19772dec9702007-05-02 16:48:37 +00003226 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
drh17435752007-08-16 04:30:38 +00003227 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
danielk19772dec9702007-05-02 16:48:37 +00003228 if( nOvfl && !pCur->aOverflow ){
danielk1977da107192007-05-04 08:32:13 +00003229 rc = SQLITE_NOMEM;
danielk19772dec9702007-05-02 16:48:37 +00003230 }
3231 }
danielk1977da107192007-05-04 08:32:13 +00003232
3233 /* If the overflow page-list cache has been allocated and the
3234 ** entry for the first required overflow page is valid, skip
3235 ** directly to it.
3236 */
danielk19772dec9702007-05-02 16:48:37 +00003237 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3238 iIdx = (offset/ovflSize);
3239 nextPage = pCur->aOverflow[iIdx];
3240 offset = (offset%ovflSize);
3241 }
3242#endif
danielk1977da107192007-05-04 08:32:13 +00003243
3244 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3245
3246#ifndef SQLITE_OMIT_INCRBLOB
3247 /* If required, populate the overflow page-list cache. */
3248 if( pCur->aOverflow ){
3249 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3250 pCur->aOverflow[iIdx] = nextPage;
3251 }
3252#endif
3253
danielk1977d04417962007-05-02 13:16:30 +00003254 if( offset>=ovflSize ){
3255 /* The only reason to read this page is to obtain the page
danielk1977da107192007-05-04 08:32:13 +00003256 ** number for the next page in the overflow chain. The page
drhfd131da2007-08-07 17:13:03 +00003257 ** data is not required. So first try to lookup the overflow
3258 ** page-list cache, if any, then fall back to the getOverflowPage()
danielk1977da107192007-05-04 08:32:13 +00003259 ** function.
danielk1977d04417962007-05-02 13:16:30 +00003260 */
danielk19772dec9702007-05-02 16:48:37 +00003261#ifndef SQLITE_OMIT_INCRBLOB
danielk1977da107192007-05-04 08:32:13 +00003262 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3263 nextPage = pCur->aOverflow[iIdx+1];
3264 } else
danielk19772dec9702007-05-02 16:48:37 +00003265#endif
danielk1977da107192007-05-04 08:32:13 +00003266 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
danielk1977da107192007-05-04 08:32:13 +00003267 offset -= ovflSize;
danielk1977d04417962007-05-02 13:16:30 +00003268 }else{
danielk19779f8d6402007-05-02 17:48:45 +00003269 /* Need to read this page properly. It contains some of the
3270 ** range of data that is being read (eOp==0) or written (eOp!=0).
danielk1977d04417962007-05-02 13:16:30 +00003271 */
3272 DbPage *pDbPage;
danielk1977cfe9a692004-06-16 12:00:29 +00003273 int a = amt;
danielk1977d04417962007-05-02 13:16:30 +00003274 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
danielk1977da107192007-05-04 08:32:13 +00003275 if( rc==SQLITE_OK ){
3276 aPayload = sqlite3PagerGetData(pDbPage);
3277 nextPage = get4byte(aPayload);
3278 if( a + offset > ovflSize ){
3279 a = ovflSize - offset;
danielk19779f8d6402007-05-02 17:48:45 +00003280 }
danielk1977da107192007-05-04 08:32:13 +00003281 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3282 sqlite3PagerUnref(pDbPage);
3283 offset = 0;
3284 amt -= a;
3285 pBuf += a;
danielk19779f8d6402007-05-02 17:48:45 +00003286 }
danielk1977cfe9a692004-06-16 12:00:29 +00003287 }
drh2af926b2001-05-15 00:39:25 +00003288 }
drh2af926b2001-05-15 00:39:25 +00003289 }
danielk1977cfe9a692004-06-16 12:00:29 +00003290
danielk1977da107192007-05-04 08:32:13 +00003291 if( rc==SQLITE_OK && amt>0 ){
drh49285702005-09-17 15:20:26 +00003292 return SQLITE_CORRUPT_BKPT;
drha7fcb052001-12-14 15:09:55 +00003293 }
danielk1977da107192007-05-04 08:32:13 +00003294 return rc;
drh2af926b2001-05-15 00:39:25 +00003295}
3296
drh72f82862001-05-24 21:06:34 +00003297/*
drh3aac2dd2004-04-26 14:10:20 +00003298** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003299** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003300** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00003301**
drh3aac2dd2004-04-26 14:10:20 +00003302** Return SQLITE_OK on success or an error code if anything goes
3303** wrong. An error is returned if "offset+amt" is larger than
3304** the available payload.
drh72f82862001-05-24 21:06:34 +00003305*/
drha34b6762004-05-07 13:30:42 +00003306int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003307 int rc;
3308
drh1fee73e2007-08-29 04:00:57 +00003309 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003310 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003311 if( rc==SQLITE_OK ){
3312 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003313 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3314 if( pCur->apPage[0]->intKey ){
danielk1977da184232006-01-05 11:34:32 +00003315 return SQLITE_CORRUPT_BKPT;
3316 }
danielk197771d5d2c2008-09-29 11:49:47 +00003317 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003318 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
drh6575a222005-03-10 17:06:34 +00003319 }
danielk1977da184232006-01-05 11:34:32 +00003320 return rc;
drh3aac2dd2004-04-26 14:10:20 +00003321}
3322
3323/*
drh3aac2dd2004-04-26 14:10:20 +00003324** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003325** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003326** begins at "offset".
3327**
3328** Return SQLITE_OK on success or an error code if anything goes
3329** wrong. An error is returned if "offset+amt" is larger than
3330** the available payload.
drh72f82862001-05-24 21:06:34 +00003331*/
drh3aac2dd2004-04-26 14:10:20 +00003332int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003333 int rc;
3334
danielk19773588ceb2008-06-10 17:30:26 +00003335#ifndef SQLITE_OMIT_INCRBLOB
3336 if ( pCur->eState==CURSOR_INVALID ){
3337 return SQLITE_ABORT;
3338 }
3339#endif
3340
drh1fee73e2007-08-29 04:00:57 +00003341 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003342 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003343 if( rc==SQLITE_OK ){
3344 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003345 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3346 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003347 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
danielk1977da184232006-01-05 11:34:32 +00003348 }
3349 return rc;
drh2af926b2001-05-15 00:39:25 +00003350}
3351
drh72f82862001-05-24 21:06:34 +00003352/*
drh0e1c19e2004-05-11 00:58:56 +00003353** Return a pointer to payload information from the entry that the
3354** pCur cursor is pointing to. The pointer is to the beginning of
3355** the key if skipKey==0 and it points to the beginning of data if
drhe51c44f2004-05-30 20:46:09 +00003356** skipKey==1. The number of bytes of available key/data is written
3357** into *pAmt. If *pAmt==0, then the value returned will not be
3358** a valid pointer.
drh0e1c19e2004-05-11 00:58:56 +00003359**
3360** This routine is an optimization. It is common for the entire key
3361** and data to fit on the local page and for there to be no overflow
3362** pages. When that is so, this routine can be used to access the
3363** key and data without making a copy. If the key and/or data spills
drh16a9b832007-05-05 18:39:25 +00003364** onto overflow pages, then accessPayload() must be used to reassembly
drh0e1c19e2004-05-11 00:58:56 +00003365** the key/data and copy it into a preallocated buffer.
3366**
3367** The pointer returned by this routine looks directly into the cached
3368** page of the database. The data might change or move the next time
3369** any btree routine is called.
3370*/
3371static const unsigned char *fetchPayload(
3372 BtCursor *pCur, /* Cursor pointing to entry to read from */
drhe51c44f2004-05-30 20:46:09 +00003373 int *pAmt, /* Write the number of available bytes here */
drh0e1c19e2004-05-11 00:58:56 +00003374 int skipKey /* read beginning at data if this is true */
3375){
3376 unsigned char *aPayload;
3377 MemPage *pPage;
drhfa1a98a2004-05-14 19:08:17 +00003378 u32 nKey;
3379 int nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003380
danielk197771d5d2c2008-09-29 11:49:47 +00003381 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
danielk1977da184232006-01-05 11:34:32 +00003382 assert( pCur->eState==CURSOR_VALID );
drh1fee73e2007-08-29 04:00:57 +00003383 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00003384 pPage = pCur->apPage[pCur->iPage];
3385 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drh86057612007-06-26 01:04:48 +00003386 getCellInfo(pCur);
drh43605152004-05-29 21:46:49 +00003387 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00003388 aPayload += pCur->info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00003389 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00003390 nKey = 0;
3391 }else{
3392 nKey = pCur->info.nKey;
drh0e1c19e2004-05-11 00:58:56 +00003393 }
drh0e1c19e2004-05-11 00:58:56 +00003394 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003395 aPayload += nKey;
3396 nLocal = pCur->info.nLocal - nKey;
drh0e1c19e2004-05-11 00:58:56 +00003397 }else{
drhfa1a98a2004-05-14 19:08:17 +00003398 nLocal = pCur->info.nLocal;
drhe51c44f2004-05-30 20:46:09 +00003399 if( nLocal>nKey ){
3400 nLocal = nKey;
3401 }
drh0e1c19e2004-05-11 00:58:56 +00003402 }
drhe51c44f2004-05-30 20:46:09 +00003403 *pAmt = nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003404 return aPayload;
3405}
3406
3407
3408/*
drhe51c44f2004-05-30 20:46:09 +00003409** For the entry that cursor pCur is point to, return as
3410** many bytes of the key or data as are available on the local
3411** b-tree page. Write the number of available bytes into *pAmt.
drh0e1c19e2004-05-11 00:58:56 +00003412**
3413** The pointer returned is ephemeral. The key/data may move
drhd677b3d2007-08-20 22:48:41 +00003414** or be destroyed on the next call to any Btree routine,
3415** including calls from other threads against the same cache.
3416** Hence, a mutex on the BtShared should be held prior to calling
3417** this routine.
drh0e1c19e2004-05-11 00:58:56 +00003418**
3419** These routines is used to get quick access to key and data
3420** in the common case where no overflow pages are used.
drh0e1c19e2004-05-11 00:58:56 +00003421*/
drhe51c44f2004-05-30 20:46:09 +00003422const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003423 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003424 if( pCur->eState==CURSOR_VALID ){
3425 return (const void*)fetchPayload(pCur, pAmt, 0);
3426 }
3427 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003428}
drhe51c44f2004-05-30 20:46:09 +00003429const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003430 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003431 if( pCur->eState==CURSOR_VALID ){
3432 return (const void*)fetchPayload(pCur, pAmt, 1);
3433 }
3434 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003435}
3436
3437
3438/*
drh8178a752003-01-05 21:41:40 +00003439** Move the cursor down to a new child page. The newPgno argument is the
drhab01f612004-05-22 02:55:23 +00003440** page number of the child page to move to.
drh72f82862001-05-24 21:06:34 +00003441*/
drh3aac2dd2004-04-26 14:10:20 +00003442static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00003443 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003444 int i = pCur->iPage;
drh72f82862001-05-24 21:06:34 +00003445 MemPage *pNewPage;
drhd0679ed2007-08-28 22:24:34 +00003446 BtShared *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00003447
drh1fee73e2007-08-29 04:00:57 +00003448 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003449 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003450 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3451 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3452 return SQLITE_CORRUPT_BKPT;
3453 }
3454 rc = getAndInitPage(pBt, newPgno, &pNewPage);
drh6019e162001-07-02 17:51:45 +00003455 if( rc ) return rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003456 pCur->apPage[i+1] = pNewPage;
3457 pCur->aiIdx[i+1] = 0;
3458 pCur->iPage++;
3459
drh271efa52004-05-30 19:19:05 +00003460 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003461 pCur->validNKey = 0;
drh4be295b2003-12-16 03:44:47 +00003462 if( pNewPage->nCell<1 ){
drh49285702005-09-17 15:20:26 +00003463 return SQLITE_CORRUPT_BKPT;
drh4be295b2003-12-16 03:44:47 +00003464 }
drh72f82862001-05-24 21:06:34 +00003465 return SQLITE_OK;
3466}
3467
danielk1977bf93c562008-09-29 15:53:25 +00003468#ifndef NDEBUG
3469/*
3470** Page pParent is an internal (non-leaf) tree page. This function
3471** asserts that page number iChild is the left-child if the iIdx'th
3472** cell in page pParent. Or, if iIdx is equal to the total number of
3473** cells in pParent, that page number iChild is the right-child of
3474** the page.
3475*/
3476static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3477 assert( iIdx<=pParent->nCell );
3478 if( iIdx==pParent->nCell ){
3479 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3480 }else{
3481 assert( get4byte(findCell(pParent, iIdx))==iChild );
3482 }
3483}
3484#else
3485# define assertParentIndex(x,y,z)
3486#endif
3487
drh72f82862001-05-24 21:06:34 +00003488/*
drh5e2f8b92001-05-28 00:41:15 +00003489** Move the cursor up to the parent page.
3490**
3491** pCur->idx is set to the cell index that contains the pointer
3492** to the page we are coming from. If we are coming from the
3493** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00003494** the largest cell index.
drh72f82862001-05-24 21:06:34 +00003495*/
drh16a9b832007-05-05 18:39:25 +00003496void sqlite3BtreeMoveToParent(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00003497 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003498 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003499 assert( pCur->iPage>0 );
3500 assert( pCur->apPage[pCur->iPage] );
danielk1977bf93c562008-09-29 15:53:25 +00003501 assertParentIndex(
3502 pCur->apPage[pCur->iPage-1],
3503 pCur->aiIdx[pCur->iPage-1],
3504 pCur->apPage[pCur->iPage]->pgno
3505 );
danielk197771d5d2c2008-09-29 11:49:47 +00003506 releasePage(pCur->apPage[pCur->iPage]);
3507 pCur->iPage--;
drh271efa52004-05-30 19:19:05 +00003508 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003509 pCur->validNKey = 0;
drh72f82862001-05-24 21:06:34 +00003510}
3511
3512/*
3513** Move the cursor to the root page
3514*/
drh5e2f8b92001-05-28 00:41:15 +00003515static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00003516 MemPage *pRoot;
drh777e4c42006-01-13 04:31:58 +00003517 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00003518 Btree *p = pCur->pBtree;
3519 BtShared *pBt = p->pBt;
drhbd03cae2001-06-02 02:40:57 +00003520
drh1fee73e2007-08-29 04:00:57 +00003521 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +00003522 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3523 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3524 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3525 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3526 if( pCur->eState==CURSOR_FAULT ){
3527 return pCur->skip;
3528 }
danielk1977be51a652008-10-08 17:58:48 +00003529 sqlite3BtreeClearCursor(pCur);
drhbf700f32007-03-31 02:36:44 +00003530 }
danielk197771d5d2c2008-09-29 11:49:47 +00003531
3532 if( pCur->iPage>=0 ){
3533 int i;
3534 for(i=1; i<=pCur->iPage; i++){
3535 releasePage(pCur->apPage[i]);
danielk1977d9f6c532008-09-19 16:39:38 +00003536 }
drh777e4c42006-01-13 04:31:58 +00003537 }else{
3538 if(
danielk197771d5d2c2008-09-29 11:49:47 +00003539 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
drh777e4c42006-01-13 04:31:58 +00003540 ){
3541 pCur->eState = CURSOR_INVALID;
3542 return rc;
3543 }
drhc39e0002004-05-07 23:50:57 +00003544 }
danielk197771d5d2c2008-09-29 11:49:47 +00003545
3546 pRoot = pCur->apPage[0];
3547 assert( pRoot->pgno==pCur->pgnoRoot );
3548 pCur->iPage = 0;
3549 pCur->aiIdx[0] = 0;
drh271efa52004-05-30 19:19:05 +00003550 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003551 pCur->atLast = 0;
3552 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003553
drh8856d6a2004-04-29 14:42:46 +00003554 if( pRoot->nCell==0 && !pRoot->leaf ){
3555 Pgno subpage;
3556 assert( pRoot->pgno==1 );
drh43605152004-05-29 21:46:49 +00003557 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
drh8856d6a2004-04-29 14:42:46 +00003558 assert( subpage>0 );
danielk1977da184232006-01-05 11:34:32 +00003559 pCur->eState = CURSOR_VALID;
drh4b70f112004-05-02 21:12:19 +00003560 rc = moveToChild(pCur, subpage);
danielk197771d5d2c2008-09-29 11:49:47 +00003561 }else{
3562 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
drh8856d6a2004-04-29 14:42:46 +00003563 }
3564 return rc;
drh72f82862001-05-24 21:06:34 +00003565}
drh2af926b2001-05-15 00:39:25 +00003566
drh5e2f8b92001-05-28 00:41:15 +00003567/*
3568** Move the cursor down to the left-most leaf entry beneath the
3569** entry to which it is currently pointing.
drh777e4c42006-01-13 04:31:58 +00003570**
3571** The left-most leaf is the one with the smallest key - the first
3572** in ascending order.
drh5e2f8b92001-05-28 00:41:15 +00003573*/
3574static int moveToLeftmost(BtCursor *pCur){
3575 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003576 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00003577 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00003578
drh1fee73e2007-08-29 04:00:57 +00003579 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003580 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003581 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3582 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3583 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
drh8178a752003-01-05 21:41:40 +00003584 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00003585 }
drhd677b3d2007-08-20 22:48:41 +00003586 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003587}
3588
drh2dcc9aa2002-12-04 13:40:25 +00003589/*
3590** Move the cursor down to the right-most leaf entry beneath the
3591** page to which it is currently pointing. Notice the difference
3592** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3593** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3594** finds the right-most entry beneath the *page*.
drh777e4c42006-01-13 04:31:58 +00003595**
3596** The right-most entry is the one with the largest key - the last
3597** key in ascending order.
drh2dcc9aa2002-12-04 13:40:25 +00003598*/
3599static int moveToRightmost(BtCursor *pCur){
3600 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003601 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00003602 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00003603
drh1fee73e2007-08-29 04:00:57 +00003604 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003605 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003606 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
drh43605152004-05-29 21:46:49 +00003607 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
danielk197771d5d2c2008-09-29 11:49:47 +00003608 pCur->aiIdx[pCur->iPage] = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00003609 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00003610 }
drhd677b3d2007-08-20 22:48:41 +00003611 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00003612 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
drhd677b3d2007-08-20 22:48:41 +00003613 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003614 pCur->validNKey = 0;
drhd677b3d2007-08-20 22:48:41 +00003615 }
danielk1977518002e2008-09-05 05:02:46 +00003616 return rc;
drh2dcc9aa2002-12-04 13:40:25 +00003617}
3618
drh5e00f6c2001-09-13 13:46:56 +00003619/* Move the cursor to the first entry in the table. Return SQLITE_OK
3620** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003621** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00003622*/
drh3aac2dd2004-04-26 14:10:20 +00003623int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00003624 int rc;
drhd677b3d2007-08-20 22:48:41 +00003625
drh1fee73e2007-08-29 04:00:57 +00003626 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003627 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh5e00f6c2001-09-13 13:46:56 +00003628 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003629 if( rc==SQLITE_OK ){
3630 if( pCur->eState==CURSOR_INVALID ){
danielk197771d5d2c2008-09-29 11:49:47 +00003631 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003632 *pRes = 1;
3633 rc = SQLITE_OK;
3634 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003635 assert( pCur->apPage[pCur->iPage]->nCell>0 );
drhd677b3d2007-08-20 22:48:41 +00003636 *pRes = 0;
3637 rc = moveToLeftmost(pCur);
3638 }
drh5e00f6c2001-09-13 13:46:56 +00003639 }
drh5e00f6c2001-09-13 13:46:56 +00003640 return rc;
3641}
drh5e2f8b92001-05-28 00:41:15 +00003642
drh9562b552002-02-19 15:00:07 +00003643/* Move the cursor to the last entry in the table. Return SQLITE_OK
3644** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003645** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00003646*/
drh3aac2dd2004-04-26 14:10:20 +00003647int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00003648 int rc;
drhd677b3d2007-08-20 22:48:41 +00003649
drh1fee73e2007-08-29 04:00:57 +00003650 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003651 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh9562b552002-02-19 15:00:07 +00003652 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003653 if( rc==SQLITE_OK ){
3654 if( CURSOR_INVALID==pCur->eState ){
danielk197771d5d2c2008-09-29 11:49:47 +00003655 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003656 *pRes = 1;
3657 }else{
3658 assert( pCur->eState==CURSOR_VALID );
3659 *pRes = 0;
3660 rc = moveToRightmost(pCur);
drha2c20e42008-03-29 16:01:04 +00003661 getCellInfo(pCur);
3662 pCur->atLast = rc==SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00003663 }
drh9562b552002-02-19 15:00:07 +00003664 }
drh9562b552002-02-19 15:00:07 +00003665 return rc;
3666}
3667
drhe14006d2008-03-25 17:23:32 +00003668/* Move the cursor so that it points to an entry near the key
drhe63d9992008-08-13 19:11:48 +00003669** specified by pIdxKey or intKey. Return a success code.
drh72f82862001-05-24 21:06:34 +00003670**
drhe63d9992008-08-13 19:11:48 +00003671** For INTKEY tables, the intKey parameter is used. pIdxKey
3672** must be NULL. For index tables, pIdxKey is used and intKey
3673** is ignored.
drh3aac2dd2004-04-26 14:10:20 +00003674**
drh5e2f8b92001-05-28 00:41:15 +00003675** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00003676** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00003677** were present. The cursor might point to an entry that comes
3678** before or after the key.
3679**
drhbd03cae2001-06-02 02:40:57 +00003680** The result of comparing the key with the entry to which the
drhab01f612004-05-22 02:55:23 +00003681** cursor is written to *pRes if pRes!=NULL. The meaning of
drhbd03cae2001-06-02 02:40:57 +00003682** this value is as follows:
3683**
3684** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00003685** is smaller than pKey or if the table is empty
3686** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00003687**
3688** *pRes==0 The cursor is left pointing at an entry that
3689** exactly matches pKey.
3690**
3691** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00003692** is larger than pKey.
drhd677b3d2007-08-20 22:48:41 +00003693**
drha059ad02001-04-17 20:09:11 +00003694*/
drhe63d9992008-08-13 19:11:48 +00003695int sqlite3BtreeMovetoUnpacked(
3696 BtCursor *pCur, /* The cursor to be moved */
3697 UnpackedRecord *pIdxKey, /* Unpacked index key */
3698 i64 intKey, /* The table key */
3699 int biasRight, /* If true, bias the search to the high end */
3700 int *pRes /* Write search results here */
drhe4d90812007-03-29 05:51:49 +00003701){
drh72f82862001-05-24 21:06:34 +00003702 int rc;
drhd677b3d2007-08-20 22:48:41 +00003703
drh1fee73e2007-08-29 04:00:57 +00003704 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003705 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drha2c20e42008-03-29 16:01:04 +00003706
3707 /* If the cursor is already positioned at the point we are trying
3708 ** to move to, then just return without doing any work */
danielk197771d5d2c2008-09-29 11:49:47 +00003709 if( pCur->eState==CURSOR_VALID && pCur->validNKey
3710 && pCur->apPage[0]->intKey
3711 ){
drhe63d9992008-08-13 19:11:48 +00003712 if( pCur->info.nKey==intKey ){
drha2c20e42008-03-29 16:01:04 +00003713 *pRes = 0;
3714 return SQLITE_OK;
3715 }
drhe63d9992008-08-13 19:11:48 +00003716 if( pCur->atLast && pCur->info.nKey<intKey ){
drha2c20e42008-03-29 16:01:04 +00003717 *pRes = -1;
3718 return SQLITE_OK;
3719 }
3720 }
3721
drh5e2f8b92001-05-28 00:41:15 +00003722 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003723 if( rc ){
3724 return rc;
3725 }
danielk197771d5d2c2008-09-29 11:49:47 +00003726 assert( pCur->apPage[pCur->iPage] );
3727 assert( pCur->apPage[pCur->iPage]->isInit );
danielk1977da184232006-01-05 11:34:32 +00003728 if( pCur->eState==CURSOR_INVALID ){
drhf328bc82004-05-10 23:29:49 +00003729 *pRes = -1;
danielk197771d5d2c2008-09-29 11:49:47 +00003730 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhc39e0002004-05-07 23:50:57 +00003731 return SQLITE_OK;
3732 }
danielk197771d5d2c2008-09-29 11:49:47 +00003733 assert( pCur->apPage[0]->intKey || pIdxKey );
drh14684382006-11-30 13:05:29 +00003734 for(;;){
drh72f82862001-05-24 21:06:34 +00003735 int lwr, upr;
3736 Pgno chldPg;
danielk197771d5d2c2008-09-29 11:49:47 +00003737 MemPage *pPage = pCur->apPage[pCur->iPage];
drh1a844c32002-12-04 22:29:28 +00003738 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00003739 lwr = 0;
3740 upr = pPage->nCell-1;
drhe63d9992008-08-13 19:11:48 +00003741 if( !pPage->intKey && pIdxKey==0 ){
drh1e968a02008-03-25 00:22:21 +00003742 rc = SQLITE_CORRUPT_BKPT;
3743 goto moveto_finish;
drh4eec4c12005-01-21 00:22:37 +00003744 }
drhe4d90812007-03-29 05:51:49 +00003745 if( biasRight ){
danielk197771d5d2c2008-09-29 11:49:47 +00003746 pCur->aiIdx[pCur->iPage] = upr;
drhe4d90812007-03-29 05:51:49 +00003747 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003748 pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
drhe4d90812007-03-29 05:51:49 +00003749 }
drhf1d68b32007-03-29 04:43:26 +00003750 if( lwr<=upr ) for(;;){
danielk197713adf8a2004-06-03 16:08:41 +00003751 void *pCellKey;
drh4a1c3802004-05-12 15:15:47 +00003752 i64 nCellKey;
danielk197771d5d2c2008-09-29 11:49:47 +00003753 int idx = pCur->aiIdx[pCur->iPage];
drh366fda62006-01-13 02:35:09 +00003754 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003755 pCur->validNKey = 1;
drh3aac2dd2004-04-26 14:10:20 +00003756 if( pPage->intKey ){
drh777e4c42006-01-13 04:31:58 +00003757 u8 *pCell;
danielk197771d5d2c2008-09-29 11:49:47 +00003758 pCell = findCell(pPage, idx) + pPage->childPtrSize;
drhd172f862006-01-12 15:01:15 +00003759 if( pPage->hasData ){
danielk1977bab45c62006-01-16 15:14:27 +00003760 u32 dummy;
shane3f8d5cf2008-04-24 19:15:09 +00003761 pCell += getVarint32(pCell, dummy);
drhd172f862006-01-12 15:01:15 +00003762 }
drha2c20e42008-03-29 16:01:04 +00003763 getVarint(pCell, (u64*)&nCellKey);
drhe63d9992008-08-13 19:11:48 +00003764 if( nCellKey==intKey ){
drh3aac2dd2004-04-26 14:10:20 +00003765 c = 0;
drhe63d9992008-08-13 19:11:48 +00003766 }else if( nCellKey<intKey ){
drh41eb9e92008-04-02 18:33:07 +00003767 c = -1;
3768 }else{
drhe63d9992008-08-13 19:11:48 +00003769 assert( nCellKey>intKey );
drh41eb9e92008-04-02 18:33:07 +00003770 c = +1;
drh3aac2dd2004-04-26 14:10:20 +00003771 }
drh3aac2dd2004-04-26 14:10:20 +00003772 }else{
drhe51c44f2004-05-30 20:46:09 +00003773 int available;
danielk197713adf8a2004-06-03 16:08:41 +00003774 pCellKey = (void *)fetchPayload(pCur, &available, 0);
drh366fda62006-01-13 02:35:09 +00003775 nCellKey = pCur->info.nKey;
drhe51c44f2004-05-30 20:46:09 +00003776 if( available>=nCellKey ){
drhe63d9992008-08-13 19:11:48 +00003777 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
drhe51c44f2004-05-30 20:46:09 +00003778 }else{
drhfacf0302008-06-17 15:12:00 +00003779 pCellKey = sqlite3Malloc( nCellKey );
danielk19776507ecb2008-03-25 09:56:44 +00003780 if( pCellKey==0 ){
3781 rc = SQLITE_NOMEM;
3782 goto moveto_finish;
3783 }
danielk197713adf8a2004-06-03 16:08:41 +00003784 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
drhe63d9992008-08-13 19:11:48 +00003785 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
drhfacf0302008-06-17 15:12:00 +00003786 sqlite3_free(pCellKey);
drh1e968a02008-03-25 00:22:21 +00003787 if( rc ) goto moveto_finish;
drhe51c44f2004-05-30 20:46:09 +00003788 }
drh3aac2dd2004-04-26 14:10:20 +00003789 }
drh72f82862001-05-24 21:06:34 +00003790 if( c==0 ){
drha2c20e42008-03-29 16:01:04 +00003791 pCur->info.nKey = nCellKey;
drh44845222008-07-17 18:39:57 +00003792 if( pPage->intKey && !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00003793 lwr = idx;
drhfc70e6f2004-05-12 21:11:27 +00003794 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00003795 break;
3796 }else{
drh8b18dd42004-05-12 19:18:15 +00003797 if( pRes ) *pRes = 0;
drh1e968a02008-03-25 00:22:21 +00003798 rc = SQLITE_OK;
3799 goto moveto_finish;
drh8b18dd42004-05-12 19:18:15 +00003800 }
drh72f82862001-05-24 21:06:34 +00003801 }
3802 if( c<0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003803 lwr = idx+1;
drh72f82862001-05-24 21:06:34 +00003804 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003805 upr = idx-1;
drh72f82862001-05-24 21:06:34 +00003806 }
drhf1d68b32007-03-29 04:43:26 +00003807 if( lwr>upr ){
drha2c20e42008-03-29 16:01:04 +00003808 pCur->info.nKey = nCellKey;
drhf1d68b32007-03-29 04:43:26 +00003809 break;
3810 }
danielk197771d5d2c2008-09-29 11:49:47 +00003811 pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
drh72f82862001-05-24 21:06:34 +00003812 }
3813 assert( lwr==upr+1 );
danielk197771d5d2c2008-09-29 11:49:47 +00003814 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00003815 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003816 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00003817 }else if( lwr>=pPage->nCell ){
drh43605152004-05-29 21:46:49 +00003818 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh72f82862001-05-24 21:06:34 +00003819 }else{
danielk19771cc5ed82007-05-16 17:28:43 +00003820 chldPg = get4byte(findCell(pPage, lwr));
drh72f82862001-05-24 21:06:34 +00003821 }
3822 if( chldPg==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003823 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh72f82862001-05-24 21:06:34 +00003824 if( pRes ) *pRes = c;
drh1e968a02008-03-25 00:22:21 +00003825 rc = SQLITE_OK;
3826 goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003827 }
danielk197771d5d2c2008-09-29 11:49:47 +00003828 pCur->aiIdx[pCur->iPage] = lwr;
drh271efa52004-05-30 19:19:05 +00003829 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003830 pCur->validNKey = 0;
drh8178a752003-01-05 21:41:40 +00003831 rc = moveToChild(pCur, chldPg);
drh1e968a02008-03-25 00:22:21 +00003832 if( rc ) goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003833 }
drh1e968a02008-03-25 00:22:21 +00003834moveto_finish:
drhe63d9992008-08-13 19:11:48 +00003835 return rc;
3836}
3837
3838/*
3839** In this version of BtreeMoveto, pKey is a packed index record
3840** such as is generated by the OP_MakeRecord opcode. Unpack the
3841** record and then call BtreeMovetoUnpacked() to do the work.
3842*/
3843int sqlite3BtreeMoveto(
3844 BtCursor *pCur, /* Cursor open on the btree to be searched */
3845 const void *pKey, /* Packed key if the btree is an index */
3846 i64 nKey, /* Integer key for tables. Size of pKey for indices */
3847 int bias, /* Bias search to the high end */
3848 int *pRes /* Write search results here */
3849){
3850 int rc; /* Status code */
3851 UnpackedRecord *pIdxKey; /* Unpacked index key */
drh23f79d02008-08-20 22:06:47 +00003852 UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
drhe63d9992008-08-13 19:11:48 +00003853
drhe14006d2008-03-25 17:23:32 +00003854 if( pKey ){
drhe63d9992008-08-13 19:11:48 +00003855 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
drh23f79d02008-08-20 22:06:47 +00003856 aSpace, sizeof(aSpace));
drhe63d9992008-08-13 19:11:48 +00003857 if( pIdxKey==0 ) return SQLITE_NOMEM;
3858 }else{
3859 pIdxKey = 0;
3860 }
3861 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
3862 if( pKey ){
3863 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
drhe14006d2008-03-25 17:23:32 +00003864 }
drh1e968a02008-03-25 00:22:21 +00003865 return rc;
drh72f82862001-05-24 21:06:34 +00003866}
3867
drhd677b3d2007-08-20 22:48:41 +00003868
drh72f82862001-05-24 21:06:34 +00003869/*
drhc39e0002004-05-07 23:50:57 +00003870** Return TRUE if the cursor is not pointing at an entry of the table.
3871**
3872** TRUE will be returned after a call to sqlite3BtreeNext() moves
3873** past the last entry in the table or sqlite3BtreePrev() moves past
3874** the first entry. TRUE is also returned if the table is empty.
3875*/
3876int sqlite3BtreeEof(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00003877 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3878 ** have been deleted? This API will need to change to return an error code
3879 ** as well as the boolean result value.
3880 */
3881 return (CURSOR_VALID!=pCur->eState);
drhc39e0002004-05-07 23:50:57 +00003882}
3883
3884/*
drhb21c8cd2007-08-21 19:33:56 +00003885** Return the database connection handle for a cursor.
3886*/
3887sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
drhe5fe6902007-12-07 18:55:28 +00003888 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3889 return pCur->pBtree->db;
drhb21c8cd2007-08-21 19:33:56 +00003890}
3891
3892/*
drhbd03cae2001-06-02 02:40:57 +00003893** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00003894** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00003895** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00003896** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00003897*/
drhd094db12008-04-03 21:46:57 +00003898int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00003899 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003900 int idx;
danielk197797a227c2006-01-20 16:32:04 +00003901 MemPage *pPage;
drh8b18dd42004-05-12 19:18:15 +00003902
drh1fee73e2007-08-29 04:00:57 +00003903 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003904 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003905 if( rc!=SQLITE_OK ){
3906 return rc;
3907 }
drh8c4d3a62007-04-06 01:03:32 +00003908 assert( pRes!=0 );
drh8c4d3a62007-04-06 01:03:32 +00003909 if( CURSOR_INVALID==pCur->eState ){
3910 *pRes = 1;
3911 return SQLITE_OK;
3912 }
danielk1977da184232006-01-05 11:34:32 +00003913 if( pCur->skip>0 ){
3914 pCur->skip = 0;
3915 *pRes = 0;
3916 return SQLITE_OK;
3917 }
3918 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00003919
danielk197771d5d2c2008-09-29 11:49:47 +00003920 pPage = pCur->apPage[pCur->iPage];
3921 idx = ++pCur->aiIdx[pCur->iPage];
3922 assert( pPage->isInit );
3923 assert( idx<=pPage->nCell );
danielk19776a43f9b2004-11-16 04:57:24 +00003924
drh271efa52004-05-30 19:19:05 +00003925 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003926 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003927 if( idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00003928 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00003929 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
drh5e2f8b92001-05-28 00:41:15 +00003930 if( rc ) return rc;
3931 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003932 *pRes = 0;
3933 return rc;
drh72f82862001-05-24 21:06:34 +00003934 }
drh5e2f8b92001-05-28 00:41:15 +00003935 do{
danielk197771d5d2c2008-09-29 11:49:47 +00003936 if( pCur->iPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00003937 *pRes = 1;
danielk1977da184232006-01-05 11:34:32 +00003938 pCur->eState = CURSOR_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00003939 return SQLITE_OK;
3940 }
drh16a9b832007-05-05 18:39:25 +00003941 sqlite3BtreeMoveToParent(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00003942 pPage = pCur->apPage[pCur->iPage];
3943 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00003944 *pRes = 0;
drh44845222008-07-17 18:39:57 +00003945 if( pPage->intKey ){
drh8b18dd42004-05-12 19:18:15 +00003946 rc = sqlite3BtreeNext(pCur, pRes);
3947 }else{
3948 rc = SQLITE_OK;
3949 }
3950 return rc;
drh8178a752003-01-05 21:41:40 +00003951 }
3952 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00003953 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00003954 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00003955 }
drh5e2f8b92001-05-28 00:41:15 +00003956 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003957 return rc;
drh72f82862001-05-24 21:06:34 +00003958}
drhd677b3d2007-08-20 22:48:41 +00003959
drh72f82862001-05-24 21:06:34 +00003960
drh3b7511c2001-05-26 13:15:44 +00003961/*
drh2dcc9aa2002-12-04 13:40:25 +00003962** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00003963** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00003964** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00003965** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00003966*/
drhd094db12008-04-03 21:46:57 +00003967int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00003968 int rc;
drh8178a752003-01-05 21:41:40 +00003969 MemPage *pPage;
danielk1977da184232006-01-05 11:34:32 +00003970
drh1fee73e2007-08-29 04:00:57 +00003971 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003972 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003973 if( rc!=SQLITE_OK ){
3974 return rc;
3975 }
drha2c20e42008-03-29 16:01:04 +00003976 pCur->atLast = 0;
drh8c4d3a62007-04-06 01:03:32 +00003977 if( CURSOR_INVALID==pCur->eState ){
3978 *pRes = 1;
3979 return SQLITE_OK;
3980 }
danielk1977da184232006-01-05 11:34:32 +00003981 if( pCur->skip<0 ){
3982 pCur->skip = 0;
3983 *pRes = 0;
3984 return SQLITE_OK;
3985 }
3986 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00003987
danielk197771d5d2c2008-09-29 11:49:47 +00003988 pPage = pCur->apPage[pCur->iPage];
3989 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003990 if( !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00003991 int idx = pCur->aiIdx[pCur->iPage];
3992 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
drhd677b3d2007-08-20 22:48:41 +00003993 if( rc ){
3994 return rc;
3995 }
drh2dcc9aa2002-12-04 13:40:25 +00003996 rc = moveToRightmost(pCur);
3997 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003998 while( pCur->aiIdx[pCur->iPage]==0 ){
3999 if( pCur->iPage==0 ){
danielk1977da184232006-01-05 11:34:32 +00004000 pCur->eState = CURSOR_INVALID;
drhc39e0002004-05-07 23:50:57 +00004001 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00004002 return SQLITE_OK;
4003 }
drh16a9b832007-05-05 18:39:25 +00004004 sqlite3BtreeMoveToParent(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00004005 }
drh271efa52004-05-30 19:19:05 +00004006 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004007 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004008
4009 pCur->aiIdx[pCur->iPage]--;
4010 pPage = pCur->apPage[pCur->iPage];
drh44845222008-07-17 18:39:57 +00004011 if( pPage->intKey && !pPage->leaf ){
drh8b18dd42004-05-12 19:18:15 +00004012 rc = sqlite3BtreePrevious(pCur, pRes);
4013 }else{
4014 rc = SQLITE_OK;
4015 }
drh2dcc9aa2002-12-04 13:40:25 +00004016 }
drh8178a752003-01-05 21:41:40 +00004017 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00004018 return rc;
4019}
4020
4021/*
drh3b7511c2001-05-26 13:15:44 +00004022** Allocate a new page from the database file.
4023**
danielk19773b8a05f2007-03-19 17:44:26 +00004024** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
drh3b7511c2001-05-26 13:15:44 +00004025** has already been called on the new page.) The new page has also
4026** been referenced and the calling routine is responsible for calling
danielk19773b8a05f2007-03-19 17:44:26 +00004027** sqlite3PagerUnref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00004028**
4029** SQLITE_OK is returned on success. Any other return value indicates
4030** an error. *ppPage and *pPgno are undefined in the event of an error.
danielk19773b8a05f2007-03-19 17:44:26 +00004031** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00004032**
drh199e3cf2002-07-18 11:01:47 +00004033** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4034** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00004035** attempt to keep related pages close to each other in the database file,
4036** which in turn can make database access faster.
danielk1977cb1a7eb2004-11-05 12:27:02 +00004037**
4038** If the "exact" parameter is not 0, and the page-number nearby exists
4039** anywhere on the free-list, then it is guarenteed to be returned. This
4040** is only used by auto-vacuum databases when allocating a new table.
drh3b7511c2001-05-26 13:15:44 +00004041*/
drh4f0c5872007-03-26 22:05:01 +00004042static int allocateBtreePage(
danielk1977aef0bf62005-12-30 16:28:01 +00004043 BtShared *pBt,
danielk1977cb1a7eb2004-11-05 12:27:02 +00004044 MemPage **ppPage,
4045 Pgno *pPgno,
4046 Pgno nearby,
4047 u8 exact
4048){
drh3aac2dd2004-04-26 14:10:20 +00004049 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00004050 int rc;
drh3aac2dd2004-04-26 14:10:20 +00004051 int n; /* Number of pages on the freelist */
4052 int k; /* Number of leaves on the trunk of the freelist */
drhd3627af2006-12-18 18:34:51 +00004053 MemPage *pTrunk = 0;
4054 MemPage *pPrevTrunk = 0;
drh30e58752002-03-02 20:41:57 +00004055
drh1fee73e2007-08-29 04:00:57 +00004056 assert( sqlite3_mutex_held(pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004057 pPage1 = pBt->pPage1;
4058 n = get4byte(&pPage1->aData[36]);
4059 if( n>0 ){
drh91025292004-05-03 19:49:32 +00004060 /* There are pages on the freelist. Reuse one of those pages. */
danielk1977cb1a7eb2004-11-05 12:27:02 +00004061 Pgno iTrunk;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004062 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4063
4064 /* If the 'exact' parameter was true and a query of the pointer-map
4065 ** shows that the page 'nearby' is somewhere on the free-list, then
4066 ** the entire-list will be searched for that page.
4067 */
4068#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977ad0132d2008-06-07 08:58:22 +00004069 if( exact && nearby<=pagerPagecount(pBt->pPager) ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004070 u8 eType;
4071 assert( nearby>0 );
4072 assert( pBt->autoVacuum );
4073 rc = ptrmapGet(pBt, nearby, &eType, 0);
4074 if( rc ) return rc;
4075 if( eType==PTRMAP_FREEPAGE ){
4076 searchList = 1;
4077 }
4078 *pPgno = nearby;
4079 }
4080#endif
4081
4082 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4083 ** first free-list trunk page. iPrevTrunk is initially 1.
4084 */
danielk19773b8a05f2007-03-19 17:44:26 +00004085 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3b7511c2001-05-26 13:15:44 +00004086 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004087 put4byte(&pPage1->aData[36], n-1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004088
4089 /* The code within this loop is run only once if the 'searchList' variable
4090 ** is not true. Otherwise, it runs once for each trunk-page on the
4091 ** free-list until the page 'nearby' is located.
4092 */
4093 do {
4094 pPrevTrunk = pTrunk;
4095 if( pPrevTrunk ){
4096 iTrunk = get4byte(&pPrevTrunk->aData[0]);
drhbea00b92002-07-08 10:59:50 +00004097 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00004098 iTrunk = get4byte(&pPage1->aData[32]);
drhbea00b92002-07-08 10:59:50 +00004099 }
drh16a9b832007-05-05 18:39:25 +00004100 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004101 if( rc ){
drhd3627af2006-12-18 18:34:51 +00004102 pTrunk = 0;
4103 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004104 }
4105
4106 k = get4byte(&pTrunk->aData[4]);
4107 if( k==0 && !searchList ){
4108 /* The trunk has no leaves and the list is not being searched.
4109 ** So extract the trunk page itself and use it as the newly
4110 ** allocated page */
4111 assert( pPrevTrunk==0 );
danielk19773b8a05f2007-03-19 17:44:26 +00004112 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004113 if( rc ){
4114 goto end_allocate_page;
4115 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004116 *pPgno = iTrunk;
4117 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4118 *ppPage = pTrunk;
4119 pTrunk = 0;
4120 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh45b1fac2008-07-04 17:52:42 +00004121 }else if( k>pBt->usableSize/4 - 2 ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004122 /* Value of k is out of range. Database corruption */
drhd3627af2006-12-18 18:34:51 +00004123 rc = SQLITE_CORRUPT_BKPT;
4124 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004125#ifndef SQLITE_OMIT_AUTOVACUUM
4126 }else if( searchList && nearby==iTrunk ){
4127 /* The list is being searched and this trunk page is the page
4128 ** to allocate, regardless of whether it has leaves.
4129 */
4130 assert( *pPgno==iTrunk );
4131 *ppPage = pTrunk;
4132 searchList = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00004133 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004134 if( rc ){
4135 goto end_allocate_page;
4136 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004137 if( k==0 ){
4138 if( !pPrevTrunk ){
4139 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4140 }else{
4141 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4142 }
4143 }else{
4144 /* The trunk page is required by the caller but it contains
4145 ** pointers to free-list leaves. The first leaf becomes a trunk
4146 ** page in this case.
4147 */
4148 MemPage *pNewTrunk;
4149 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
drh16a9b832007-05-05 18:39:25 +00004150 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004151 if( rc!=SQLITE_OK ){
drhd3627af2006-12-18 18:34:51 +00004152 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004153 }
danielk19773b8a05f2007-03-19 17:44:26 +00004154 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004155 if( rc!=SQLITE_OK ){
4156 releasePage(pNewTrunk);
drhd3627af2006-12-18 18:34:51 +00004157 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004158 }
4159 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4160 put4byte(&pNewTrunk->aData[4], k-1);
4161 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
drhd3627af2006-12-18 18:34:51 +00004162 releasePage(pNewTrunk);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004163 if( !pPrevTrunk ){
4164 put4byte(&pPage1->aData[32], iNewTrunk);
4165 }else{
danielk19773b8a05f2007-03-19 17:44:26 +00004166 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004167 if( rc ){
4168 goto end_allocate_page;
4169 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004170 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4171 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004172 }
4173 pTrunk = 0;
4174 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4175#endif
4176 }else{
4177 /* Extract a leaf from the trunk */
4178 int closest;
4179 Pgno iPage;
4180 unsigned char *aData = pTrunk->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00004181 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004182 if( rc ){
4183 goto end_allocate_page;
4184 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004185 if( nearby>0 ){
4186 int i, dist;
4187 closest = 0;
4188 dist = get4byte(&aData[8]) - nearby;
4189 if( dist<0 ) dist = -dist;
4190 for(i=1; i<k; i++){
4191 int d2 = get4byte(&aData[8+i*4]) - nearby;
4192 if( d2<0 ) d2 = -d2;
4193 if( d2<dist ){
4194 closest = i;
4195 dist = d2;
4196 }
4197 }
4198 }else{
4199 closest = 0;
4200 }
4201
4202 iPage = get4byte(&aData[8+closest*4]);
4203 if( !searchList || iPage==nearby ){
danielk1977ad0132d2008-06-07 08:58:22 +00004204 int nPage;
shane1f9e6aa2008-06-09 19:27:11 +00004205 *pPgno = iPage;
danielk1977ad0132d2008-06-07 08:58:22 +00004206 nPage = pagerPagecount(pBt->pPager);
4207 if( *pPgno>nPage ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004208 /* Free page off the end of the file */
danielk197743e377a2008-05-05 12:09:32 +00004209 rc = SQLITE_CORRUPT_BKPT;
4210 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004211 }
4212 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4213 ": %d more free pages\n",
4214 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4215 if( closest<k-1 ){
4216 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4217 }
4218 put4byte(&aData[4], k-1);
drh16a9b832007-05-05 18:39:25 +00004219 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004220 if( rc==SQLITE_OK ){
drh538f5702007-04-13 02:14:30 +00004221 sqlite3PagerDontRollback((*ppPage)->pDbPage);
danielk19773b8a05f2007-03-19 17:44:26 +00004222 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004223 if( rc!=SQLITE_OK ){
4224 releasePage(*ppPage);
4225 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004226 }
4227 searchList = 0;
4228 }
drhee696e22004-08-30 16:52:17 +00004229 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004230 releasePage(pPrevTrunk);
drhd3627af2006-12-18 18:34:51 +00004231 pPrevTrunk = 0;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004232 }while( searchList );
drh3b7511c2001-05-26 13:15:44 +00004233 }else{
drh3aac2dd2004-04-26 14:10:20 +00004234 /* There are no pages on the freelist, so create a new page at the
4235 ** end of the file */
danielk1977ad0132d2008-06-07 08:58:22 +00004236 int nPage = pagerPagecount(pBt->pPager);
4237 *pPgno = nPage + 1;
danielk1977afcdd022004-10-31 16:25:42 +00004238
4239#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00004240 if( pBt->nTrunc ){
4241 /* An incr-vacuum has already run within this transaction. So the
4242 ** page to allocate is not from the physical end of the file, but
4243 ** at pBt->nTrunc.
4244 */
4245 *pPgno = pBt->nTrunc+1;
4246 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4247 (*pPgno)++;
4248 }
4249 }
danielk1977266664d2006-02-10 08:24:21 +00004250 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
danielk1977afcdd022004-10-31 16:25:42 +00004251 /* If *pPgno refers to a pointer-map page, allocate two new pages
4252 ** at the end of the file instead of one. The first allocated page
4253 ** becomes a new pointer-map page, the second is used by the caller.
4254 */
4255 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
danielk1977599fcba2004-11-08 07:13:13 +00004256 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk1977afcdd022004-10-31 16:25:42 +00004257 (*pPgno)++;
drh72190432008-01-31 14:54:43 +00004258 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
danielk1977afcdd022004-10-31 16:25:42 +00004259 }
danielk1977dddbcdc2007-04-26 14:42:34 +00004260 if( pBt->nTrunc ){
4261 pBt->nTrunc = *pPgno;
4262 }
danielk1977afcdd022004-10-31 16:25:42 +00004263#endif
4264
danielk1977599fcba2004-11-08 07:13:13 +00004265 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drh16a9b832007-05-05 18:39:25 +00004266 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
drh3b7511c2001-05-26 13:15:44 +00004267 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00004268 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004269 if( rc!=SQLITE_OK ){
4270 releasePage(*ppPage);
4271 }
drh3a4c1412004-05-09 20:40:11 +00004272 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00004273 }
danielk1977599fcba2004-11-08 07:13:13 +00004274
4275 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drhd3627af2006-12-18 18:34:51 +00004276
4277end_allocate_page:
4278 releasePage(pTrunk);
4279 releasePage(pPrevTrunk);
danielk197771d5d2c2008-09-29 11:49:47 +00004280 if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4281 releasePage(*ppPage);
4282 return SQLITE_CORRUPT_BKPT;
danielk1977eaa06f62008-09-18 17:34:44 +00004283 }
drh3b7511c2001-05-26 13:15:44 +00004284 return rc;
4285}
4286
4287/*
drh3aac2dd2004-04-26 14:10:20 +00004288** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00004289**
danielk19773b8a05f2007-03-19 17:44:26 +00004290** sqlite3PagerUnref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00004291*/
drh3aac2dd2004-04-26 14:10:20 +00004292static int freePage(MemPage *pPage){
danielk1977aef0bf62005-12-30 16:28:01 +00004293 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004294 MemPage *pPage1 = pBt->pPage1;
4295 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00004296
drh3aac2dd2004-04-26 14:10:20 +00004297 /* Prepare the page for freeing */
drh1fee73e2007-08-29 04:00:57 +00004298 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004299 assert( pPage->pgno>1 );
4300 pPage->isInit = 0;
drh3aac2dd2004-04-26 14:10:20 +00004301
drha34b6762004-05-07 13:30:42 +00004302 /* Increment the free page count on pPage1 */
danielk19773b8a05f2007-03-19 17:44:26 +00004303 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00004304 if( rc ) return rc;
4305 n = get4byte(&pPage1->aData[36]);
4306 put4byte(&pPage1->aData[36], n+1);
4307
drhfcce93f2006-02-22 03:08:32 +00004308#ifdef SQLITE_SECURE_DELETE
4309 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4310 ** always fully overwrite deleted information with zeros.
4311 */
danielk19773b8a05f2007-03-19 17:44:26 +00004312 rc = sqlite3PagerWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004313 if( rc ) return rc;
4314 memset(pPage->aData, 0, pPage->pBt->pageSize);
4315#endif
4316
danielk1977687566d2004-11-02 12:56:41 +00004317 /* If the database supports auto-vacuum, write an entry in the pointer-map
danielk1977cb1a7eb2004-11-05 12:27:02 +00004318 ** to indicate that the page is free.
danielk1977687566d2004-11-02 12:56:41 +00004319 */
danielk197785d90ca2008-07-19 14:25:15 +00004320 if( ISAUTOVACUUM ){
danielk1977687566d2004-11-02 12:56:41 +00004321 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
danielk1977a64a0352004-11-05 01:45:13 +00004322 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00004323 }
danielk1977687566d2004-11-02 12:56:41 +00004324
drh3aac2dd2004-04-26 14:10:20 +00004325 if( n==0 ){
4326 /* This is the first free page */
danielk19773b8a05f2007-03-19 17:44:26 +00004327 rc = sqlite3PagerWrite(pPage->pDbPage);
drhda200cc2004-05-09 11:51:38 +00004328 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004329 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00004330 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00004331 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004332 }else{
4333 /* Other free pages already exist. Retrive the first trunk page
4334 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00004335 MemPage *pTrunk;
drh16a9b832007-05-05 18:39:25 +00004336 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
drh3b7511c2001-05-26 13:15:44 +00004337 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004338 k = get4byte(&pTrunk->aData[4]);
drhee696e22004-08-30 16:52:17 +00004339 if( k>=pBt->usableSize/4 - 8 ){
drh3aac2dd2004-04-26 14:10:20 +00004340 /* The trunk is full. Turn the page being freed into a new
drh45b1fac2008-07-04 17:52:42 +00004341 ** trunk page with no leaves.
4342 **
4343 ** Note that the trunk page is not really full until it contains
4344 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4345 ** coded. But due to a coding error in versions of SQLite prior to
4346 ** 3.6.0, databases with freelist trunk pages holding more than
4347 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4348 ** to maintain backwards compatibility with older versions of SQLite,
4349 ** we will contain to restrict the number of entries to usableSize/4 - 8
4350 ** for now. At some point in the future (once everyone has upgraded
4351 ** to 3.6.0 or later) we should consider fixing the conditional above
4352 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4353 */
danielk19773b8a05f2007-03-19 17:44:26 +00004354 rc = sqlite3PagerWrite(pPage->pDbPage);
drhb9ee4932007-09-07 14:32:06 +00004355 if( rc==SQLITE_OK ){
4356 put4byte(pPage->aData, pTrunk->pgno);
4357 put4byte(&pPage->aData[4], 0);
4358 put4byte(&pPage1->aData[32], pPage->pgno);
4359 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4360 pPage->pgno, pTrunk->pgno));
4361 }
4362 }else if( k<0 ){
4363 rc = SQLITE_CORRUPT;
drh3aac2dd2004-04-26 14:10:20 +00004364 }else{
4365 /* Add the newly freed page as a leaf on the current trunk */
danielk19773b8a05f2007-03-19 17:44:26 +00004366 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhf5345442007-04-09 12:45:02 +00004367 if( rc==SQLITE_OK ){
4368 put4byte(&pTrunk->aData[4], k+1);
4369 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drhfcce93f2006-02-22 03:08:32 +00004370#ifndef SQLITE_SECURE_DELETE
danielk1977a1fa00d2008-08-27 15:16:33 +00004371 rc = sqlite3PagerDontWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004372#endif
drhf5345442007-04-09 12:45:02 +00004373 }
drh3a4c1412004-05-09 20:40:11 +00004374 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004375 }
4376 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00004377 }
drh3b7511c2001-05-26 13:15:44 +00004378 return rc;
4379}
4380
4381/*
drh3aac2dd2004-04-26 14:10:20 +00004382** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00004383*/
drh3aac2dd2004-04-26 14:10:20 +00004384static int clearCell(MemPage *pPage, unsigned char *pCell){
danielk1977aef0bf62005-12-30 16:28:01 +00004385 BtShared *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00004386 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00004387 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00004388 int rc;
drh94440812007-03-06 11:42:19 +00004389 int nOvfl;
4390 int ovflPageSize;
drh3b7511c2001-05-26 13:15:44 +00004391
drh1fee73e2007-08-29 04:00:57 +00004392 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh16a9b832007-05-05 18:39:25 +00004393 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004394 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00004395 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00004396 }
drh6f11bef2004-05-13 01:12:56 +00004397 ovflPgno = get4byte(&pCell[info.iOverflow]);
drh94440812007-03-06 11:42:19 +00004398 ovflPageSize = pBt->usableSize - 4;
drh72365832007-03-06 15:53:44 +00004399 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4400 assert( ovflPgno==0 || nOvfl>0 );
4401 while( nOvfl-- ){
drh3aac2dd2004-04-26 14:10:20 +00004402 MemPage *pOvfl;
danielk1977ad0132d2008-06-07 08:58:22 +00004403 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00004404 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00004405 }
danielk19778c0a9592007-04-30 16:55:00 +00004406
4407 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
drh3b7511c2001-05-26 13:15:44 +00004408 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00004409 rc = freePage(pOvfl);
danielk19773b8a05f2007-03-19 17:44:26 +00004410 sqlite3PagerUnref(pOvfl->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00004411 if( rc ) return rc;
drh3b7511c2001-05-26 13:15:44 +00004412 }
drh5e2f8b92001-05-28 00:41:15 +00004413 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00004414}
4415
4416/*
drh91025292004-05-03 19:49:32 +00004417** Create the byte sequence used to represent a cell on page pPage
4418** and write that byte sequence into pCell[]. Overflow pages are
4419** allocated and filled in as necessary. The calling procedure
4420** is responsible for making sure sufficient space has been allocated
4421** for pCell[].
4422**
4423** Note that pCell does not necessary need to point to the pPage->aData
4424** area. pCell might point to some temporary storage. The cell will
4425** be constructed in this temporary area then copied into pPage->aData
4426** later.
drh3b7511c2001-05-26 13:15:44 +00004427*/
4428static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00004429 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00004430 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00004431 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00004432 const void *pData,int nData, /* The data */
drhb026e052007-05-02 01:34:31 +00004433 int nZero, /* Extra zero bytes to append to pData */
drh4b70f112004-05-02 21:12:19 +00004434 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00004435){
drh3b7511c2001-05-26 13:15:44 +00004436 int nPayload;
drh8c6fa9b2004-05-26 00:01:53 +00004437 const u8 *pSrc;
drha34b6762004-05-07 13:30:42 +00004438 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00004439 int spaceLeft;
4440 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00004441 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00004442 unsigned char *pPrior;
4443 unsigned char *pPayload;
danielk1977aef0bf62005-12-30 16:28:01 +00004444 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004445 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00004446 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00004447 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00004448
drh1fee73e2007-08-29 04:00:57 +00004449 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004450
drh91025292004-05-03 19:49:32 +00004451 /* Fill in the header. */
drh43605152004-05-29 21:46:49 +00004452 nHeader = 0;
drh91025292004-05-03 19:49:32 +00004453 if( !pPage->leaf ){
4454 nHeader += 4;
4455 }
drh8b18dd42004-05-12 19:18:15 +00004456 if( pPage->hasData ){
drhb026e052007-05-02 01:34:31 +00004457 nHeader += putVarint(&pCell[nHeader], nData+nZero);
drh6f11bef2004-05-13 01:12:56 +00004458 }else{
drhb026e052007-05-02 01:34:31 +00004459 nData = nZero = 0;
drh91025292004-05-03 19:49:32 +00004460 }
drh6f11bef2004-05-13 01:12:56 +00004461 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
drh16a9b832007-05-05 18:39:25 +00004462 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004463 assert( info.nHeader==nHeader );
4464 assert( info.nKey==nKey );
drhb026e052007-05-02 01:34:31 +00004465 assert( info.nData==nData+nZero );
drh6f11bef2004-05-13 01:12:56 +00004466
4467 /* Fill in the payload */
drhb026e052007-05-02 01:34:31 +00004468 nPayload = nData + nZero;
drh3aac2dd2004-04-26 14:10:20 +00004469 if( pPage->intKey ){
4470 pSrc = pData;
4471 nSrc = nData;
drh91025292004-05-03 19:49:32 +00004472 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00004473 }else{
4474 nPayload += nKey;
4475 pSrc = pKey;
4476 nSrc = nKey;
4477 }
drh6f11bef2004-05-13 01:12:56 +00004478 *pnSize = info.nSize;
4479 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00004480 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00004481 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00004482
drh3b7511c2001-05-26 13:15:44 +00004483 while( nPayload>0 ){
4484 if( spaceLeft==0 ){
danielk1977b39f70b2007-05-17 18:28:11 +00004485 int isExact = 0;
danielk1977afcdd022004-10-31 16:25:42 +00004486#ifndef SQLITE_OMIT_AUTOVACUUM
4487 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
danielk1977b39f70b2007-05-17 18:28:11 +00004488 if( pBt->autoVacuum ){
4489 do{
4490 pgnoOvfl++;
4491 } while(
4492 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4493 );
danielk197789a4be82007-05-23 13:34:32 +00004494 if( pgnoOvfl>1 ){
danielk1977b39f70b2007-05-17 18:28:11 +00004495 /* isExact = 1; */
4496 }
4497 }
danielk1977afcdd022004-10-31 16:25:42 +00004498#endif
danielk1977b39f70b2007-05-17 18:28:11 +00004499 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
danielk1977afcdd022004-10-31 16:25:42 +00004500#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977a19df672004-11-03 11:37:07 +00004501 /* If the database supports auto-vacuum, and the second or subsequent
4502 ** overflow page is being allocated, add an entry to the pointer-map
danielk19774ef24492007-05-23 09:52:41 +00004503 ** for that page now.
4504 **
4505 ** If this is the first overflow page, then write a partial entry
4506 ** to the pointer-map. If we write nothing to this pointer-map slot,
4507 ** then the optimistic overflow chain processing in clearCell()
4508 ** may misinterpret the uninitialised values and delete the
4509 ** wrong pages from the database.
danielk1977afcdd022004-10-31 16:25:42 +00004510 */
danielk19774ef24492007-05-23 09:52:41 +00004511 if( pBt->autoVacuum && rc==SQLITE_OK ){
4512 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4513 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
danielk197789a4be82007-05-23 13:34:32 +00004514 if( rc ){
4515 releasePage(pOvfl);
4516 }
danielk1977afcdd022004-10-31 16:25:42 +00004517 }
4518#endif
drh3b7511c2001-05-26 13:15:44 +00004519 if( rc ){
drh9b171272004-05-08 02:03:22 +00004520 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004521 return rc;
4522 }
drh3aac2dd2004-04-26 14:10:20 +00004523 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00004524 releasePage(pToRelease);
4525 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00004526 pPrior = pOvfl->aData;
4527 put4byte(pPrior, 0);
4528 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00004529 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00004530 }
4531 n = nPayload;
4532 if( n>spaceLeft ) n = spaceLeft;
drhb026e052007-05-02 01:34:31 +00004533 if( nSrc>0 ){
4534 if( n>nSrc ) n = nSrc;
4535 assert( pSrc );
4536 memcpy(pPayload, pSrc, n);
4537 }else{
4538 memset(pPayload, 0, n);
4539 }
drh3b7511c2001-05-26 13:15:44 +00004540 nPayload -= n;
drhde647132004-05-07 17:57:49 +00004541 pPayload += n;
drh9b171272004-05-08 02:03:22 +00004542 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00004543 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00004544 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00004545 if( nSrc==0 ){
4546 nSrc = nData;
4547 pSrc = pData;
4548 }
drhdd793422001-06-28 01:54:48 +00004549 }
drh9b171272004-05-08 02:03:22 +00004550 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004551 return SQLITE_OK;
4552}
4553
drh14acc042001-06-10 19:56:58 +00004554/*
4555** Remove the i-th cell from pPage. This routine effects pPage only.
4556** The cell content is not freed or deallocated. It is assumed that
4557** the cell content has been copied someplace else. This routine just
4558** removes the reference to the cell from pPage.
4559**
4560** "sz" must be the number of bytes in the cell.
drh14acc042001-06-10 19:56:58 +00004561*/
drh4b70f112004-05-02 21:12:19 +00004562static void dropCell(MemPage *pPage, int idx, int sz){
drh43605152004-05-29 21:46:49 +00004563 int i; /* Loop counter */
4564 int pc; /* Offset to cell content of cell being deleted */
4565 u8 *data; /* pPage->aData */
4566 u8 *ptr; /* Used to move bytes around within data[] */
4567
drh8c42ca92001-06-22 19:15:00 +00004568 assert( idx>=0 && idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00004569 assert( sz==cellSize(pPage, idx) );
danielk19773b8a05f2007-03-19 17:44:26 +00004570 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00004571 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhda200cc2004-05-09 11:51:38 +00004572 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00004573 ptr = &data[pPage->cellOffset + 2*idx];
4574 pc = get2byte(ptr);
4575 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
drhde647132004-05-07 17:57:49 +00004576 freeSpace(pPage, pc, sz);
drh43605152004-05-29 21:46:49 +00004577 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4578 ptr[0] = ptr[2];
4579 ptr[1] = ptr[3];
drh14acc042001-06-10 19:56:58 +00004580 }
4581 pPage->nCell--;
drh43605152004-05-29 21:46:49 +00004582 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4583 pPage->nFree += 2;
drh14acc042001-06-10 19:56:58 +00004584}
4585
4586/*
4587** Insert a new cell on pPage at cell index "i". pCell points to the
4588** content of the cell.
4589**
4590** If the cell content will fit on the page, then put it there. If it
drh43605152004-05-29 21:46:49 +00004591** will not fit, then make a copy of the cell content into pTemp if
4592** pTemp is not null. Regardless of pTemp, allocate a new entry
4593** in pPage->aOvfl[] and make it point to the cell content (either
4594** in pTemp or the original pCell) and also record its index.
4595** Allocating a new entry in pPage->aCell[] implies that
4596** pPage->nOverflow is incremented.
danielk1977a3ad5e72005-01-07 08:56:44 +00004597**
4598** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4599** cell. The caller will overwrite them after this function returns. If
drh4b238df2005-01-08 15:43:18 +00004600** nSkip is non-zero, then pCell may not point to an invalid memory location
danielk1977a3ad5e72005-01-07 08:56:44 +00004601** (but pCell+nSkip is always valid).
drh14acc042001-06-10 19:56:58 +00004602*/
danielk1977e80463b2004-11-03 03:01:16 +00004603static int insertCell(
drh24cd67e2004-05-10 16:18:47 +00004604 MemPage *pPage, /* Page into which we are copying */
drh43605152004-05-29 21:46:49 +00004605 int i, /* New cell becomes the i-th cell of the page */
4606 u8 *pCell, /* Content of the new cell */
4607 int sz, /* Bytes of content in pCell */
danielk1977a3ad5e72005-01-07 08:56:44 +00004608 u8 *pTemp, /* Temp storage space for pCell, if needed */
4609 u8 nSkip /* Do not write the first nSkip bytes of the cell */
drh24cd67e2004-05-10 16:18:47 +00004610){
drh43605152004-05-29 21:46:49 +00004611 int idx; /* Where to write new cell content in data[] */
4612 int j; /* Loop counter */
4613 int top; /* First byte of content for any cell in data[] */
4614 int end; /* First byte past the last cell pointer in data[] */
4615 int ins; /* Index in data[] where new cell pointer is inserted */
4616 int hdr; /* Offset into data[] of the page header */
4617 int cellOffset; /* Address of first cell pointer in data[] */
4618 u8 *data; /* The content of the whole page */
4619 u8 *ptr; /* Used for moving information around in data[] */
4620
4621 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4622 assert( sz==cellSizePtr(pPage, pCell) );
drh1fee73e2007-08-29 04:00:57 +00004623 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +00004624 if( pPage->nOverflow || sz+2>pPage->nFree ){
drh24cd67e2004-05-10 16:18:47 +00004625 if( pTemp ){
danielk1977a3ad5e72005-01-07 08:56:44 +00004626 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004627 pCell = pTemp;
drh24cd67e2004-05-10 16:18:47 +00004628 }
drh43605152004-05-29 21:46:49 +00004629 j = pPage->nOverflow++;
4630 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4631 pPage->aOvfl[j].pCell = pCell;
4632 pPage->aOvfl[j].idx = i;
4633 pPage->nFree = 0;
drh14acc042001-06-10 19:56:58 +00004634 }else{
danielk19776e465eb2007-08-21 13:11:00 +00004635 int rc = sqlite3PagerWrite(pPage->pDbPage);
4636 if( rc!=SQLITE_OK ){
4637 return rc;
4638 }
4639 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +00004640 data = pPage->aData;
4641 hdr = pPage->hdrOffset;
4642 top = get2byte(&data[hdr+5]);
4643 cellOffset = pPage->cellOffset;
4644 end = cellOffset + 2*pPage->nCell + 2;
4645 ins = cellOffset + 2*i;
4646 if( end > top - sz ){
danielk1977474b7cc2008-07-09 11:49:46 +00004647 defragmentPage(pPage);
drh43605152004-05-29 21:46:49 +00004648 top = get2byte(&data[hdr+5]);
4649 assert( end + sz <= top );
4650 }
4651 idx = allocateSpace(pPage, sz);
4652 assert( idx>0 );
4653 assert( end <= get2byte(&data[hdr+5]) );
4654 pPage->nCell++;
4655 pPage->nFree -= 2;
danielk1977a3ad5e72005-01-07 08:56:44 +00004656 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004657 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4658 ptr[0] = ptr[-2];
4659 ptr[1] = ptr[-1];
drhda200cc2004-05-09 11:51:38 +00004660 }
drh43605152004-05-29 21:46:49 +00004661 put2byte(&data[ins], idx);
4662 put2byte(&data[hdr+3], pPage->nCell);
danielk1977a19df672004-11-03 11:37:07 +00004663#ifndef SQLITE_OMIT_AUTOVACUUM
4664 if( pPage->pBt->autoVacuum ){
4665 /* The cell may contain a pointer to an overflow page. If so, write
4666 ** the entry for the overflow page into the pointer map.
4667 */
4668 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00004669 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh72365832007-03-06 15:53:44 +00004670 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
danielk1977a19df672004-11-03 11:37:07 +00004671 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4672 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
danielk19776e465eb2007-08-21 13:11:00 +00004673 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977a19df672004-11-03 11:37:07 +00004674 if( rc!=SQLITE_OK ) return rc;
4675 }
4676 }
4677#endif
drh14acc042001-06-10 19:56:58 +00004678 }
danielk1977e80463b2004-11-03 03:01:16 +00004679
danielk1977e80463b2004-11-03 03:01:16 +00004680 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00004681}
4682
4683/*
drhfa1a98a2004-05-14 19:08:17 +00004684** Add a list of cells to a page. The page should be initially empty.
4685** The cells are guaranteed to fit on the page.
4686*/
4687static void assemblePage(
4688 MemPage *pPage, /* The page to be assemblied */
4689 int nCell, /* The number of cells to add to this page */
drh43605152004-05-29 21:46:49 +00004690 u8 **apCell, /* Pointers to cell bodies */
drha9121e42008-02-19 14:59:35 +00004691 u16 *aSize /* Sizes of the cells */
drhfa1a98a2004-05-14 19:08:17 +00004692){
4693 int i; /* Loop counter */
4694 int totalSize; /* Total size of all cells */
4695 int hdr; /* Index of page header */
drh43605152004-05-29 21:46:49 +00004696 int cellptr; /* Address of next cell pointer */
4697 int cellbody; /* Address of next cell body */
drhfa1a98a2004-05-14 19:08:17 +00004698 u8 *data; /* Data for the page */
4699
drh43605152004-05-29 21:46:49 +00004700 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +00004701 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa1a98a2004-05-14 19:08:17 +00004702 totalSize = 0;
4703 for(i=0; i<nCell; i++){
4704 totalSize += aSize[i];
4705 }
drh43605152004-05-29 21:46:49 +00004706 assert( totalSize+2*nCell<=pPage->nFree );
drhfa1a98a2004-05-14 19:08:17 +00004707 assert( pPage->nCell==0 );
drh43605152004-05-29 21:46:49 +00004708 cellptr = pPage->cellOffset;
drhfa1a98a2004-05-14 19:08:17 +00004709 data = pPage->aData;
4710 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00004711 put2byte(&data[hdr+3], nCell);
drh09d0deb2005-08-02 17:13:09 +00004712 if( nCell ){
4713 cellbody = allocateSpace(pPage, totalSize);
4714 assert( cellbody>0 );
4715 assert( pPage->nFree >= 2*nCell );
4716 pPage->nFree -= 2*nCell;
4717 for(i=0; i<nCell; i++){
4718 put2byte(&data[cellptr], cellbody);
4719 memcpy(&data[cellbody], apCell[i], aSize[i]);
4720 cellptr += 2;
4721 cellbody += aSize[i];
4722 }
4723 assert( cellbody==pPage->pBt->usableSize );
drhfa1a98a2004-05-14 19:08:17 +00004724 }
4725 pPage->nCell = nCell;
drhfa1a98a2004-05-14 19:08:17 +00004726}
4727
drh14acc042001-06-10 19:56:58 +00004728/*
drhc3b70572003-01-04 19:44:07 +00004729** The following parameters determine how many adjacent pages get involved
4730** in a balancing operation. NN is the number of neighbors on either side
4731** of the page that participate in the balancing operation. NB is the
4732** total number of pages that participate, including the target page and
4733** NN neighbors on either side.
4734**
4735** The minimum value of NN is 1 (of course). Increasing NN above 1
4736** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4737** in exchange for a larger degradation in INSERT and UPDATE performance.
4738** The current value of NN (1) appears to give the best results overall.
4739*/
4740#define NN 1 /* Number of neighbors on either side of pPage */
4741#define NB (NN*2+1) /* Total pages involved in the balance */
4742
drh43605152004-05-29 21:46:49 +00004743/* Forward reference */
danielk197771d5d2c2008-09-29 11:49:47 +00004744static int balance(BtCursor*, int);
danielk1977ac245ec2005-01-14 13:50:11 +00004745
drh615ae552005-01-16 23:21:00 +00004746#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004747/*
4748** This version of balance() handles the common special case where
4749** a new entry is being inserted on the extreme right-end of the
4750** tree, in other words, when the new entry will become the largest
4751** entry in the tree.
4752**
4753** Instead of trying balance the 3 right-most leaf pages, just add
4754** a new page to the right-hand side and put the one new entry in
4755** that page. This leaves the right side of the tree somewhat
4756** unbalanced. But odds are that we will be inserting new entries
4757** at the end soon afterwards so the nearly empty page will quickly
4758** fill up. On average.
4759**
4760** pPage is the leaf page which is the right-most page in the tree.
4761** pParent is its parent. pPage must have a single overflow entry
4762** which is also the right-most entry on the page.
4763*/
danielk197771d5d2c2008-09-29 11:49:47 +00004764static int balance_quick(BtCursor *pCur){
danielk1977ac245ec2005-01-14 13:50:11 +00004765 int rc;
danielk1977eaa06f62008-09-18 17:34:44 +00004766 MemPage *pNew = 0;
danielk1977ac245ec2005-01-14 13:50:11 +00004767 Pgno pgnoNew;
4768 u8 *pCell;
drha9121e42008-02-19 14:59:35 +00004769 u16 szCell;
danielk1977ac245ec2005-01-14 13:50:11 +00004770 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00004771 MemPage *pPage = pCur->apPage[pCur->iPage];
4772 MemPage *pParent = pCur->apPage[pCur->iPage-1];
danielk1977aef0bf62005-12-30 16:28:01 +00004773 BtShared *pBt = pPage->pBt;
danielk197779a40da2005-01-16 08:00:01 +00004774 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4775 int parentSize; /* Size of new divider cell */
4776 u8 parentCell[64]; /* Space for the new divider cell */
danielk1977ac245ec2005-01-14 13:50:11 +00004777
drh1fee73e2007-08-29 04:00:57 +00004778 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004779
danielk1977ac245ec2005-01-14 13:50:11 +00004780 /* Allocate a new page. Insert the overflow cell from pPage
4781 ** into it. Then remove the overflow cell from pPage.
4782 */
drh4f0c5872007-03-26 22:05:01 +00004783 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004784 if( rc==SQLITE_OK ){
4785 pCell = pPage->aOvfl[0].pCell;
4786 szCell = cellSizePtr(pPage, pCell);
4787 zeroPage(pNew, pPage->aData[0]);
4788 assemblePage(pNew, 1, &pCell, &szCell);
4789 pPage->nOverflow = 0;
4790
danielk1977eaa06f62008-09-18 17:34:44 +00004791 /* pPage is currently the right-child of pParent. Change this
4792 ** so that the right-child is the new page allocated above and
4793 ** pPage is the next-to-right child.
4794 **
4795 ** Ignore the return value of the call to fillInCell(). fillInCell()
4796 ** may only return other than SQLITE_OK if it is required to allocate
4797 ** one or more overflow pages. Since an internal table B-Tree cell
4798 ** may never spill over onto an overflow page (it is a maximum of
4799 ** 13 bytes in size), it is not neccessary to check the return code.
4800 **
4801 ** Similarly, the insertCell() function cannot fail if the page
4802 ** being inserted into is already writable and the cell does not
4803 ** contain an overflow pointer. So ignore this return code too.
4804 */
4805 assert( pPage->nCell>0 );
4806 pCell = findCell(pPage, pPage->nCell-1);
4807 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4808 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4809 assert( parentSize<64 );
4810 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4811 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4812 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4813 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4814
4815 /* If this is an auto-vacuum database, update the pointer map
4816 ** with entries for the new page, and any pointer from the
4817 ** cell on the page to an overflow page.
4818 */
4819 if( ISAUTOVACUUM ){
4820 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4821 if( rc==SQLITE_OK ){
4822 rc = ptrmapPutOvfl(pNew, 0);
4823 }
danielk1977ac11ee62005-01-15 12:45:51 +00004824 }
danielk1977e08a3c42008-09-18 18:17:03 +00004825
4826 /* Release the reference to the new page. */
4827 releasePage(pNew);
danielk1977ac11ee62005-01-15 12:45:51 +00004828 }
4829
danielk1977eaa06f62008-09-18 17:34:44 +00004830 /* At this point the pPage->nFree variable is not set correctly with
4831 ** respect to the content of the page (because it was set to 0 by
4832 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
4833 ** correct.
4834 **
4835 ** This has to be done even if an error will be returned. Normally, if
4836 ** an error occurs during tree balancing, the contents of MemPage are
4837 ** not important, as they will be recalculated when the page is rolled
4838 ** back. But here, in balance_quick(), it is possible that pPage has
4839 ** not yet been marked dirty or written into the journal file. Therefore
4840 ** it will not be rolled back and so it is important to make sure that
4841 ** the page data and contents of MemPage are consistent.
4842 */
4843 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004844 sqlite3BtreeInitPage(pPage);
danielk1977eaa06f62008-09-18 17:34:44 +00004845
danielk1977e08a3c42008-09-18 18:17:03 +00004846 /* If everything else succeeded, balance the parent page, in
4847 ** case the divider cell inserted caused it to become overfull.
danielk197779a40da2005-01-16 08:00:01 +00004848 */
danielk1977eaa06f62008-09-18 17:34:44 +00004849 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00004850 releasePage(pPage);
4851 pCur->iPage--;
4852 rc = balance(pCur, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004853 }
4854 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00004855}
drh615ae552005-01-16 23:21:00 +00004856#endif /* SQLITE_OMIT_QUICKBALANCE */
drh43605152004-05-29 21:46:49 +00004857
drhc3b70572003-01-04 19:44:07 +00004858/*
drhab01f612004-05-22 02:55:23 +00004859** This routine redistributes Cells on pPage and up to NN*2 siblings
drh8b2f49b2001-06-08 00:21:52 +00004860** of pPage so that all pages have about the same amount of free space.
drh0c6cc4e2004-06-15 02:13:26 +00004861** Usually NN siblings on either side of pPage is used in the balancing,
4862** though more siblings might come from one side if pPage is the first
drhab01f612004-05-22 02:55:23 +00004863** or last child of its parent. If pPage has fewer than 2*NN siblings
drh8b2f49b2001-06-08 00:21:52 +00004864** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00004865** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00004866**
drh0c6cc4e2004-06-15 02:13:26 +00004867** The number of siblings of pPage might be increased or decreased by one or
4868** two in an effort to keep pages nearly full but not over full. The root page
drhab01f612004-05-22 02:55:23 +00004869** is special and is allowed to be nearly empty. If pPage is
drh8c42ca92001-06-22 19:15:00 +00004870** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00004871** or decreased by one, as necessary, to keep the root page from being
drhab01f612004-05-22 02:55:23 +00004872** overfull or completely empty.
drh14acc042001-06-10 19:56:58 +00004873**
drh8b2f49b2001-06-08 00:21:52 +00004874** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00004875** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00004876** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00004877** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00004878**
drh8c42ca92001-06-22 19:15:00 +00004879** In the course of balancing the siblings of pPage, the parent of pPage
4880** might become overfull or underfull. If that happens, then this routine
4881** is called recursively on the parent.
4882**
drh5e00f6c2001-09-13 13:46:56 +00004883** If this routine fails for any reason, it might leave the database
4884** in a corrupted state. So if this routine fails, the database should
4885** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00004886*/
danielk197771d5d2c2008-09-29 11:49:47 +00004887static int balance_nonroot(BtCursor *pCur){
4888 MemPage *pPage; /* The over or underfull page to balance */
drh8b2f49b2001-06-08 00:21:52 +00004889 MemPage *pParent; /* The parent of pPage */
drh16a9b832007-05-05 18:39:25 +00004890 BtShared *pBt; /* The whole database */
danielk1977634f2982005-03-28 08:44:07 +00004891 int nCell = 0; /* Number of cells in apCell[] */
4892 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
drh8b2f49b2001-06-08 00:21:52 +00004893 int nOld; /* Number of pages in apOld[] */
4894 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00004895 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00004896 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00004897 int idx; /* Index of pPage in pParent->aCell[] */
4898 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00004899 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00004900 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00004901 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00004902 int usableSpace; /* Bytes in pPage beyond the header */
4903 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00004904 int subtotal; /* Subtotal of bytes in cells on one page */
drhe5ae5732008-06-15 02:51:47 +00004905 int iSpace1 = 0; /* First unused byte of aSpace1[] */
4906 int iSpace2 = 0; /* First unused byte of aSpace2[] */
drhfacf0302008-06-17 15:12:00 +00004907 int szScratch; /* Size of scratch memory requested */
drhc3b70572003-01-04 19:44:07 +00004908 MemPage *apOld[NB]; /* pPage and up to two siblings */
4909 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00004910 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drha2fce642004-06-05 00:01:44 +00004911 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4912 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
drh4b70f112004-05-02 21:12:19 +00004913 u8 *apDiv[NB]; /* Divider cells in pParent */
drha2fce642004-06-05 00:01:44 +00004914 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4915 int szNew[NB+2]; /* Combined size of cells place on i-th page */
danielk197750f059b2005-03-29 02:54:03 +00004916 u8 **apCell = 0; /* All cells begin balanced */
drha9121e42008-02-19 14:59:35 +00004917 u16 *szCell; /* Local size of all cells in apCell[] */
drhe5ae5732008-06-15 02:51:47 +00004918 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4919 u8 *aSpace1; /* Space for copies of dividers cells before balance */
4920 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
danielk1977ac11ee62005-01-15 12:45:51 +00004921 u8 *aFrom = 0;
drh8b2f49b2001-06-08 00:21:52 +00004922
danielk197771d5d2c2008-09-29 11:49:47 +00004923 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00004924 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf94a1732008-09-30 17:18:17 +00004925 VVA_ONLY( pCur->pagesShuffled = 1 );
drhd677b3d2007-08-20 22:48:41 +00004926
drh14acc042001-06-10 19:56:58 +00004927 /*
drh43605152004-05-29 21:46:49 +00004928 ** Find the parent page.
drh8b2f49b2001-06-08 00:21:52 +00004929 */
danielk197771d5d2c2008-09-29 11:49:47 +00004930 assert( pCur->iPage>0 );
4931 assert( pPage->isInit );
danielk19776e465eb2007-08-21 13:11:00 +00004932 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
drh4b70f112004-05-02 21:12:19 +00004933 pBt = pPage->pBt;
danielk197771d5d2c2008-09-29 11:49:47 +00004934 pParent = pCur->apPage[pCur->iPage-1];
drh43605152004-05-29 21:46:49 +00004935 assert( pParent );
danielk19773b8a05f2007-03-19 17:44:26 +00004936 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
danielk197707cb5602006-01-20 10:55:05 +00004937 return rc;
4938 }
danielk1977474b7cc2008-07-09 11:49:46 +00004939
drh43605152004-05-29 21:46:49 +00004940 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
drh2e38c322004-09-03 18:38:44 +00004941
drh615ae552005-01-16 23:21:00 +00004942#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004943 /*
4944 ** A special case: If a new entry has just been inserted into a
4945 ** table (that is, a btree with integer keys and all data at the leaves)
drh09d0deb2005-08-02 17:13:09 +00004946 ** and the new entry is the right-most entry in the tree (it has the
drhf222e712005-01-14 22:55:49 +00004947 ** largest key) then use the special balance_quick() routine for
4948 ** balancing. balance_quick() is much faster and results in a tighter
4949 ** packing of data in the common case.
4950 */
danielk1977ac245ec2005-01-14 13:50:11 +00004951 if( pPage->leaf &&
4952 pPage->intKey &&
danielk1977ac245ec2005-01-14 13:50:11 +00004953 pPage->nOverflow==1 &&
4954 pPage->aOvfl[0].idx==pPage->nCell &&
danielk197771d5d2c2008-09-29 11:49:47 +00004955 pParent->pgno!=1 &&
danielk1977ac245ec2005-01-14 13:50:11 +00004956 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4957 ){
drh44845222008-07-17 18:39:57 +00004958 assert( pPage->intKey );
danielk1977ac11ee62005-01-15 12:45:51 +00004959 /*
4960 ** TODO: Check the siblings to the left of pPage. It may be that
4961 ** they are not full and no new page is required.
4962 */
danielk197771d5d2c2008-09-29 11:49:47 +00004963 return balance_quick(pCur);
danielk1977ac245ec2005-01-14 13:50:11 +00004964 }
4965#endif
4966
danielk19776e465eb2007-08-21 13:11:00 +00004967 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4968 return rc;
4969 }
4970
drh2e38c322004-09-03 18:38:44 +00004971 /*
drh4b70f112004-05-02 21:12:19 +00004972 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00004973 ** to pPage. The "idx" variable is the index of that cell. If pPage
4974 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00004975 */
danielk1977bf93c562008-09-29 15:53:25 +00004976 idx = pCur->aiIdx[pCur->iPage-1];
4977 assertParentIndex(pParent, idx, pPage->pgno);
drh8b2f49b2001-06-08 00:21:52 +00004978
4979 /*
drh14acc042001-06-10 19:56:58 +00004980 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00004981 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00004982 */
drh14acc042001-06-10 19:56:58 +00004983 nOld = nNew = 0;
drh14acc042001-06-10 19:56:58 +00004984
4985 /*
drh4b70f112004-05-02 21:12:19 +00004986 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00004987 ** the siblings. An attempt is made to find NN siblings on either
4988 ** side of pPage. More siblings are taken from one side, however, if
4989 ** pPage there are fewer than NN siblings on the other side. If pParent
4990 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00004991 */
drhc3b70572003-01-04 19:44:07 +00004992 nxDiv = idx - NN;
4993 if( nxDiv + NB > pParent->nCell ){
4994 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00004995 }
drhc3b70572003-01-04 19:44:07 +00004996 if( nxDiv<0 ){
4997 nxDiv = 0;
4998 }
drh8b2f49b2001-06-08 00:21:52 +00004999 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00005000 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00005001 if( k<pParent->nCell ){
danielk19771cc5ed82007-05-16 17:28:43 +00005002 apDiv[i] = findCell(pParent, k);
drh8b2f49b2001-06-08 00:21:52 +00005003 nDiv++;
drha34b6762004-05-07 13:30:42 +00005004 assert( !pParent->leaf );
drh43605152004-05-29 21:46:49 +00005005 pgnoOld[i] = get4byte(apDiv[i]);
drh14acc042001-06-10 19:56:58 +00005006 }else if( k==pParent->nCell ){
drh43605152004-05-29 21:46:49 +00005007 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
drh14acc042001-06-10 19:56:58 +00005008 }else{
5009 break;
drh8b2f49b2001-06-08 00:21:52 +00005010 }
danielk197771d5d2c2008-09-29 11:49:47 +00005011 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
drh6019e162001-07-02 17:51:45 +00005012 if( rc ) goto balance_cleanup;
danielk197771d5d2c2008-09-29 11:49:47 +00005013 /* apOld[i]->idxParent = k; */
drh91025292004-05-03 19:49:32 +00005014 apCopy[i] = 0;
5015 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00005016 nOld++;
danielk1977634f2982005-03-28 08:44:07 +00005017 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
drh8b2f49b2001-06-08 00:21:52 +00005018 }
5019
drha9121e42008-02-19 14:59:35 +00005020 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
drh8d97f1f2005-05-05 18:14:13 +00005021 ** alignment */
drha9121e42008-02-19 14:59:35 +00005022 nMaxCells = (nMaxCells + 3)&~3;
drh8d97f1f2005-05-05 18:14:13 +00005023
drh8b2f49b2001-06-08 00:21:52 +00005024 /*
danielk1977634f2982005-03-28 08:44:07 +00005025 ** Allocate space for memory structures
5026 */
drhfacf0302008-06-17 15:12:00 +00005027 szScratch =
drha9121e42008-02-19 14:59:35 +00005028 nMaxCells*sizeof(u8*) /* apCell */
5029 + nMaxCells*sizeof(u16) /* szCell */
5030 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
drhe5ae5732008-06-15 02:51:47 +00005031 + pBt->pageSize /* aSpace1 */
drhfacf0302008-06-17 15:12:00 +00005032 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5033 apCell = sqlite3ScratchMalloc( szScratch );
danielk1977634f2982005-03-28 08:44:07 +00005034 if( apCell==0 ){
5035 rc = SQLITE_NOMEM;
5036 goto balance_cleanup;
5037 }
drha9121e42008-02-19 14:59:35 +00005038 szCell = (u16*)&apCell[nMaxCells];
danielk1977634f2982005-03-28 08:44:07 +00005039 aCopy[0] = (u8*)&szCell[nMaxCells];
drhc96d8532005-05-03 12:30:33 +00005040 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005041 for(i=1; i<NB; i++){
drhc96d8532005-05-03 12:30:33 +00005042 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5043 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005044 }
drhe5ae5732008-06-15 02:51:47 +00005045 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5046 assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk197785d90ca2008-07-19 14:25:15 +00005047 if( ISAUTOVACUUM ){
drhe5ae5732008-06-15 02:51:47 +00005048 aFrom = &aSpace1[pBt->pageSize];
danielk1977634f2982005-03-28 08:44:07 +00005049 }
drhfacf0302008-06-17 15:12:00 +00005050 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
drhe5ae5732008-06-15 02:51:47 +00005051 if( aSpace2==0 ){
5052 rc = SQLITE_NOMEM;
5053 goto balance_cleanup;
5054 }
danielk1977634f2982005-03-28 08:44:07 +00005055
5056 /*
drh14acc042001-06-10 19:56:58 +00005057 ** Make copies of the content of pPage and its siblings into aOld[].
5058 ** The rest of this function will use data from the copies rather
5059 ** that the original pages since the original pages will be in the
5060 ** process of being overwritten.
5061 */
5062 for(i=0; i<nOld; i++){
drhbf4bca52007-09-06 22:19:14 +00005063 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5064 memcpy(p, apOld[i], sizeof(MemPage));
5065 p->aData = (void*)&p[1];
5066 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
drh14acc042001-06-10 19:56:58 +00005067 }
5068
5069 /*
5070 ** Load pointers to all cells on sibling pages and the divider cells
5071 ** into the local apCell[] array. Make copies of the divider cells
drhe5ae5732008-06-15 02:51:47 +00005072 ** into space obtained form aSpace1[] and remove the the divider Cells
drhb6f41482004-05-14 01:58:11 +00005073 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00005074 **
5075 ** If the siblings are on leaf pages, then the child pointers of the
5076 ** divider cells are stripped from the cells before they are copied
drhe5ae5732008-06-15 02:51:47 +00005077 ** into aSpace1[]. In this way, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00005078 ** child pointers. If siblings are not leaves, then all cell in
5079 ** apCell[] include child pointers. Either way, all cells in apCell[]
5080 ** are alike.
drh96f5b762004-05-16 16:24:36 +00005081 **
5082 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5083 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
drh8b2f49b2001-06-08 00:21:52 +00005084 */
5085 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00005086 leafCorrection = pPage->leaf*4;
drh44845222008-07-17 18:39:57 +00005087 leafData = pPage->hasData;
drh8b2f49b2001-06-08 00:21:52 +00005088 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00005089 MemPage *pOld = apCopy[i];
drh43605152004-05-29 21:46:49 +00005090 int limit = pOld->nCell+pOld->nOverflow;
5091 for(j=0; j<limit; j++){
danielk1977634f2982005-03-28 08:44:07 +00005092 assert( nCell<nMaxCells );
drh43605152004-05-29 21:46:49 +00005093 apCell[nCell] = findOverflowCell(pOld, j);
5094 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
danielk197785d90ca2008-07-19 14:25:15 +00005095 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005096 int a;
5097 aFrom[nCell] = i;
5098 for(a=0; a<pOld->nOverflow; a++){
5099 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5100 aFrom[nCell] = 0xFF;
5101 break;
5102 }
5103 }
5104 }
drh14acc042001-06-10 19:56:58 +00005105 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00005106 }
5107 if( i<nOld-1 ){
drha9121e42008-02-19 14:59:35 +00005108 u16 sz = cellSizePtr(pParent, apDiv[i]);
drh8b18dd42004-05-12 19:18:15 +00005109 if( leafData ){
drh96f5b762004-05-16 16:24:36 +00005110 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5111 ** are duplicates of keys on the child pages. We need to remove
5112 ** the divider cells from pParent, but the dividers cells are not
5113 ** added to apCell[] because they are duplicates of child cells.
5114 */
drh8b18dd42004-05-12 19:18:15 +00005115 dropCell(pParent, nxDiv, sz);
drh4b70f112004-05-02 21:12:19 +00005116 }else{
drhb6f41482004-05-14 01:58:11 +00005117 u8 *pTemp;
danielk1977634f2982005-03-28 08:44:07 +00005118 assert( nCell<nMaxCells );
drhb6f41482004-05-14 01:58:11 +00005119 szCell[nCell] = sz;
drhe5ae5732008-06-15 02:51:47 +00005120 pTemp = &aSpace1[iSpace1];
5121 iSpace1 += sz;
5122 assert( sz<=pBt->pageSize/4 );
5123 assert( iSpace1<=pBt->pageSize );
drhb6f41482004-05-14 01:58:11 +00005124 memcpy(pTemp, apDiv[i], sz);
5125 apCell[nCell] = pTemp+leafCorrection;
danielk197785d90ca2008-07-19 14:25:15 +00005126 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005127 aFrom[nCell] = 0xFF;
5128 }
drhb6f41482004-05-14 01:58:11 +00005129 dropCell(pParent, nxDiv, sz);
drh8b18dd42004-05-12 19:18:15 +00005130 szCell[nCell] -= leafCorrection;
drh43605152004-05-29 21:46:49 +00005131 assert( get4byte(pTemp)==pgnoOld[i] );
drh8b18dd42004-05-12 19:18:15 +00005132 if( !pOld->leaf ){
5133 assert( leafCorrection==0 );
5134 /* The right pointer of the child page pOld becomes the left
5135 ** pointer of the divider cell */
drh43605152004-05-29 21:46:49 +00005136 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
drh8b18dd42004-05-12 19:18:15 +00005137 }else{
5138 assert( leafCorrection==4 );
danielk197739c96042007-05-12 10:41:47 +00005139 if( szCell[nCell]<4 ){
5140 /* Do not allow any cells smaller than 4 bytes. */
5141 szCell[nCell] = 4;
5142 }
drh8b18dd42004-05-12 19:18:15 +00005143 }
5144 nCell++;
drh4b70f112004-05-02 21:12:19 +00005145 }
drh8b2f49b2001-06-08 00:21:52 +00005146 }
5147 }
5148
5149 /*
drh6019e162001-07-02 17:51:45 +00005150 ** Figure out the number of pages needed to hold all nCell cells.
5151 ** Store this number in "k". Also compute szNew[] which is the total
5152 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00005153 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00005154 ** cntNew[k] should equal nCell.
5155 **
drh96f5b762004-05-16 16:24:36 +00005156 ** Values computed by this block:
5157 **
5158 ** k: The total number of sibling pages
5159 ** szNew[i]: Space used on the i-th sibling page.
5160 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5161 ** the right of the i-th sibling page.
5162 ** usableSpace: Number of bytes of space available on each sibling.
5163 **
drh8b2f49b2001-06-08 00:21:52 +00005164 */
drh43605152004-05-29 21:46:49 +00005165 usableSpace = pBt->usableSize - 12 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00005166 for(subtotal=k=i=0; i<nCell; i++){
danielk1977634f2982005-03-28 08:44:07 +00005167 assert( i<nMaxCells );
drh43605152004-05-29 21:46:49 +00005168 subtotal += szCell[i] + 2;
drh4b70f112004-05-02 21:12:19 +00005169 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00005170 szNew[k] = subtotal - szCell[i];
5171 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00005172 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00005173 subtotal = 0;
5174 k++;
5175 }
5176 }
5177 szNew[k] = subtotal;
5178 cntNew[k] = nCell;
5179 k++;
drh96f5b762004-05-16 16:24:36 +00005180
5181 /*
5182 ** The packing computed by the previous block is biased toward the siblings
5183 ** on the left side. The left siblings are always nearly full, while the
5184 ** right-most sibling might be nearly empty. This block of code attempts
5185 ** to adjust the packing of siblings to get a better balance.
5186 **
5187 ** This adjustment is more than an optimization. The packing above might
5188 ** be so out of balance as to be illegal. For example, the right-most
5189 ** sibling might be completely empty. This adjustment is not optional.
5190 */
drh6019e162001-07-02 17:51:45 +00005191 for(i=k-1; i>0; i--){
drh96f5b762004-05-16 16:24:36 +00005192 int szRight = szNew[i]; /* Size of sibling on the right */
5193 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5194 int r; /* Index of right-most cell in left sibling */
5195 int d; /* Index of first cell to the left of right sibling */
5196
5197 r = cntNew[i-1] - 1;
5198 d = r + 1 - leafData;
danielk1977634f2982005-03-28 08:44:07 +00005199 assert( d<nMaxCells );
5200 assert( r<nMaxCells );
drh43605152004-05-29 21:46:49 +00005201 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5202 szRight += szCell[d] + 2;
5203 szLeft -= szCell[r] + 2;
drh6019e162001-07-02 17:51:45 +00005204 cntNew[i-1]--;
drh96f5b762004-05-16 16:24:36 +00005205 r = cntNew[i-1] - 1;
5206 d = r + 1 - leafData;
drh6019e162001-07-02 17:51:45 +00005207 }
drh96f5b762004-05-16 16:24:36 +00005208 szNew[i] = szRight;
5209 szNew[i-1] = szLeft;
drh6019e162001-07-02 17:51:45 +00005210 }
drh09d0deb2005-08-02 17:13:09 +00005211
5212 /* Either we found one or more cells (cntNew[0]>0) or we are
5213 ** a virtual root page. A virtual root page is when the real root
5214 ** page is page 1 and we are the only child of that page.
5215 */
5216 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
drh8b2f49b2001-06-08 00:21:52 +00005217
5218 /*
drh6b308672002-07-08 02:16:37 +00005219 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00005220 */
drh4b70f112004-05-02 21:12:19 +00005221 assert( pPage->pgno>1 );
5222 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00005223 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00005224 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00005225 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00005226 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00005227 pgnoNew[i] = pgnoOld[i];
5228 apOld[i] = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00005229 rc = sqlite3PagerWrite(pNew->pDbPage);
drhf5345442007-04-09 12:45:02 +00005230 nNew++;
danielk197728129562005-01-11 10:25:06 +00005231 if( rc ) goto balance_cleanup;
drh6b308672002-07-08 02:16:37 +00005232 }else{
drh7aa8f852006-03-28 00:24:44 +00005233 assert( i>0 );
drh4f0c5872007-03-26 22:05:01 +00005234 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
drh6b308672002-07-08 02:16:37 +00005235 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00005236 apNew[i] = pNew;
drhf5345442007-04-09 12:45:02 +00005237 nNew++;
drh6b308672002-07-08 02:16:37 +00005238 }
drh8b2f49b2001-06-08 00:21:52 +00005239 }
5240
danielk1977299b1872004-11-22 10:02:10 +00005241 /* Free any old pages that were not reused as new pages.
5242 */
5243 while( i<nOld ){
5244 rc = freePage(apOld[i]);
5245 if( rc ) goto balance_cleanup;
5246 releasePage(apOld[i]);
5247 apOld[i] = 0;
5248 i++;
5249 }
5250
drh8b2f49b2001-06-08 00:21:52 +00005251 /*
drhf9ffac92002-03-02 19:00:31 +00005252 ** Put the new pages in ascending order. This helps to
5253 ** keep entries in the disk file in order so that a scan
5254 ** of the table is a linear scan through the file. That
5255 ** in turn helps the operating system to deliver pages
5256 ** from the disk more rapidly.
5257 **
5258 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00005259 ** n is never more than NB (a small constant), that should
5260 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00005261 **
drhc3b70572003-01-04 19:44:07 +00005262 ** When NB==3, this one optimization makes the database
5263 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00005264 */
5265 for(i=0; i<k-1; i++){
5266 int minV = pgnoNew[i];
5267 int minI = i;
5268 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00005269 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00005270 minI = j;
5271 minV = pgnoNew[j];
5272 }
5273 }
5274 if( minI>i ){
5275 int t;
5276 MemPage *pT;
5277 t = pgnoNew[i];
5278 pT = apNew[i];
5279 pgnoNew[i] = pgnoNew[minI];
5280 apNew[i] = apNew[minI];
5281 pgnoNew[minI] = t;
5282 apNew[minI] = pT;
5283 }
5284 }
drha2fce642004-06-05 00:01:44 +00005285 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
drh24cd67e2004-05-10 16:18:47 +00005286 pgnoOld[0],
5287 nOld>=2 ? pgnoOld[1] : 0,
5288 nOld>=3 ? pgnoOld[2] : 0,
drh10c0fa62004-05-18 12:50:17 +00005289 pgnoNew[0], szNew[0],
5290 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5291 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
drha2fce642004-06-05 00:01:44 +00005292 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5293 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
drh24cd67e2004-05-10 16:18:47 +00005294
drhf9ffac92002-03-02 19:00:31 +00005295 /*
drh14acc042001-06-10 19:56:58 +00005296 ** Evenly distribute the data in apCell[] across the new pages.
5297 ** Insert divider cells into pParent as necessary.
5298 */
5299 j = 0;
5300 for(i=0; i<nNew; i++){
danielk1977ac11ee62005-01-15 12:45:51 +00005301 /* Assemble the new sibling page. */
drh14acc042001-06-10 19:56:58 +00005302 MemPage *pNew = apNew[i];
drh19642e52005-03-29 13:17:45 +00005303 assert( j<nMaxCells );
drh4b70f112004-05-02 21:12:19 +00005304 assert( pNew->pgno==pgnoNew[i] );
drh10131482008-07-11 03:34:09 +00005305 zeroPage(pNew, pageFlags);
drhfa1a98a2004-05-14 19:08:17 +00005306 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
drh09d0deb2005-08-02 17:13:09 +00005307 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
drh43605152004-05-29 21:46:49 +00005308 assert( pNew->nOverflow==0 );
danielk1977ac11ee62005-01-15 12:45:51 +00005309
danielk1977ac11ee62005-01-15 12:45:51 +00005310 /* If this is an auto-vacuum database, update the pointer map entries
5311 ** that point to the siblings that were rearranged. These can be: left
5312 ** children of cells, the right-child of the page, or overflow pages
5313 ** pointed to by cells.
5314 */
danielk197785d90ca2008-07-19 14:25:15 +00005315 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005316 for(k=j; k<cntNew[i]; k++){
danielk1977634f2982005-03-28 08:44:07 +00005317 assert( k<nMaxCells );
danielk1977ac11ee62005-01-15 12:45:51 +00005318 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
danielk197779a40da2005-01-16 08:00:01 +00005319 rc = ptrmapPutOvfl(pNew, k-j);
danielk197787c52b52008-07-19 11:49:07 +00005320 if( rc==SQLITE_OK && leafCorrection==0 ){
5321 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5322 }
danielk197779a40da2005-01-16 08:00:01 +00005323 if( rc!=SQLITE_OK ){
5324 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005325 }
5326 }
5327 }
5328 }
danielk1977ac11ee62005-01-15 12:45:51 +00005329
5330 j = cntNew[i];
5331
5332 /* If the sibling page assembled above was not the right-most sibling,
5333 ** insert a divider cell into the parent page.
5334 */
drh14acc042001-06-10 19:56:58 +00005335 if( i<nNew-1 && j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00005336 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00005337 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00005338 int sz;
danielk1977634f2982005-03-28 08:44:07 +00005339
5340 assert( j<nMaxCells );
drh8b18dd42004-05-12 19:18:15 +00005341 pCell = apCell[j];
5342 sz = szCell[j] + leafCorrection;
drhe5ae5732008-06-15 02:51:47 +00005343 pTemp = &aSpace2[iSpace2];
drh4b70f112004-05-02 21:12:19 +00005344 if( !pNew->leaf ){
drh43605152004-05-29 21:46:49 +00005345 memcpy(&pNew->aData[8], pCell, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005346 if( ISAUTOVACUUM
danielk197787c52b52008-07-19 11:49:07 +00005347 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5348 ){
5349 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5350 if( rc!=SQLITE_OK ){
5351 goto balance_cleanup;
5352 }
5353 }
drh8b18dd42004-05-12 19:18:15 +00005354 }else if( leafData ){
drhfd131da2007-08-07 17:13:03 +00005355 /* If the tree is a leaf-data tree, and the siblings are leaves,
danielk1977ac11ee62005-01-15 12:45:51 +00005356 ** then there is no divider cell in apCell[]. Instead, the divider
5357 ** cell consists of the integer key for the right-most cell of
5358 ** the sibling-page assembled above only.
5359 */
drh6f11bef2004-05-13 01:12:56 +00005360 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00005361 j--;
drh16a9b832007-05-05 18:39:25 +00005362 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
drhe5ae5732008-06-15 02:51:47 +00005363 pCell = pTemp;
drhb026e052007-05-02 01:34:31 +00005364 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
drh8b18dd42004-05-12 19:18:15 +00005365 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00005366 }else{
5367 pCell -= 4;
danielk19774aeff622007-05-12 09:30:47 +00005368 /* Obscure case for non-leaf-data trees: If the cell at pCell was
drh85b623f2007-12-13 21:54:09 +00005369 ** previously stored on a leaf node, and its reported size was 4
danielk19774aeff622007-05-12 09:30:47 +00005370 ** bytes, then it may actually be smaller than this
5371 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
drh85b623f2007-12-13 21:54:09 +00005372 ** any cell). But it is important to pass the correct size to
danielk19774aeff622007-05-12 09:30:47 +00005373 ** insertCell(), so reparse the cell now.
5374 **
5375 ** Note that this can never happen in an SQLite data file, as all
5376 ** cells are at least 4 bytes. It only happens in b-trees used
5377 ** to evaluate "IN (SELECT ...)" and similar clauses.
5378 */
5379 if( szCell[j]==4 ){
5380 assert(leafCorrection==4);
5381 sz = cellSizePtr(pParent, pCell);
5382 }
drh4b70f112004-05-02 21:12:19 +00005383 }
drhe5ae5732008-06-15 02:51:47 +00005384 iSpace2 += sz;
5385 assert( sz<=pBt->pageSize/4 );
5386 assert( iSpace2<=pBt->pageSize );
danielk1977a3ad5e72005-01-07 08:56:44 +00005387 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
danielk1977e80463b2004-11-03 03:01:16 +00005388 if( rc!=SQLITE_OK ) goto balance_cleanup;
drh43605152004-05-29 21:46:49 +00005389 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
danielk197785d90ca2008-07-19 14:25:15 +00005390
danielk1977ac11ee62005-01-15 12:45:51 +00005391 /* If this is an auto-vacuum database, and not a leaf-data tree,
5392 ** then update the pointer map with an entry for the overflow page
5393 ** that the cell just inserted points to (if any).
5394 */
danielk197785d90ca2008-07-19 14:25:15 +00005395 if( ISAUTOVACUUM && !leafData ){
danielk197779a40da2005-01-16 08:00:01 +00005396 rc = ptrmapPutOvfl(pParent, nxDiv);
5397 if( rc!=SQLITE_OK ){
5398 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005399 }
5400 }
drh14acc042001-06-10 19:56:58 +00005401 j++;
5402 nxDiv++;
5403 }
danielk197787c52b52008-07-19 11:49:07 +00005404
danielk197787c52b52008-07-19 11:49:07 +00005405 /* Set the pointer-map entry for the new sibling page. */
danielk197785d90ca2008-07-19 14:25:15 +00005406 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005407 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5408 if( rc!=SQLITE_OK ){
5409 goto balance_cleanup;
5410 }
5411 }
drh14acc042001-06-10 19:56:58 +00005412 }
drh6019e162001-07-02 17:51:45 +00005413 assert( j==nCell );
drh7aa8f852006-03-28 00:24:44 +00005414 assert( nOld>0 );
5415 assert( nNew>0 );
drh4b70f112004-05-02 21:12:19 +00005416 if( (pageFlags & PTF_LEAF)==0 ){
danielk197787c52b52008-07-19 11:49:07 +00005417 u8 *zChild = &apCopy[nOld-1]->aData[8];
5418 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005419 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005420 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5421 if( rc!=SQLITE_OK ){
5422 goto balance_cleanup;
5423 }
5424 }
drh14acc042001-06-10 19:56:58 +00005425 }
drh43605152004-05-29 21:46:49 +00005426 if( nxDiv==pParent->nCell+pParent->nOverflow ){
drh4b70f112004-05-02 21:12:19 +00005427 /* Right-most sibling is the right-most child of pParent */
drh43605152004-05-29 21:46:49 +00005428 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
drh4b70f112004-05-02 21:12:19 +00005429 }else{
5430 /* Right-most sibling is the left child of the first entry in pParent
5431 ** past the right-most divider entry */
drh43605152004-05-29 21:46:49 +00005432 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00005433 }
5434
5435 /*
drh3a4c1412004-05-09 20:40:11 +00005436 ** Balance the parent page. Note that the current page (pPage) might
danielk1977ac11ee62005-01-15 12:45:51 +00005437 ** have been added to the freelist so it might no longer be initialized.
drh3a4c1412004-05-09 20:40:11 +00005438 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00005439 */
danielk197771d5d2c2008-09-29 11:49:47 +00005440 assert( pParent->isInit );
drhfacf0302008-06-17 15:12:00 +00005441 sqlite3ScratchFree(apCell);
drhe5ae5732008-06-15 02:51:47 +00005442 apCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005443 releasePage(pPage);
5444 pCur->iPage--;
5445 rc = balance(pCur, 0);
drhda200cc2004-05-09 11:51:38 +00005446
drh8b2f49b2001-06-08 00:21:52 +00005447 /*
drh14acc042001-06-10 19:56:58 +00005448 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00005449 */
drh14acc042001-06-10 19:56:58 +00005450balance_cleanup:
drhfacf0302008-06-17 15:12:00 +00005451 sqlite3PageFree(aSpace2);
5452 sqlite3ScratchFree(apCell);
drh8b2f49b2001-06-08 00:21:52 +00005453 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00005454 releasePage(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00005455 }
drh14acc042001-06-10 19:56:58 +00005456 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00005457 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00005458 }
danielk1977eaa06f62008-09-18 17:34:44 +00005459
danielk197771d5d2c2008-09-29 11:49:47 +00005460 /* releasePage(pParent); */
drh3a4c1412004-05-09 20:40:11 +00005461 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5462 pPage->pgno, nOld, nNew, nCell));
danielk1977eaa06f62008-09-18 17:34:44 +00005463
drh8b2f49b2001-06-08 00:21:52 +00005464 return rc;
5465}
5466
5467/*
drh43605152004-05-29 21:46:49 +00005468** This routine is called for the root page of a btree when the root
5469** page contains no cells. This is an opportunity to make the tree
5470** shallower by one level.
5471*/
danielk197771d5d2c2008-09-29 11:49:47 +00005472static int balance_shallower(BtCursor *pCur){
5473 MemPage *pPage; /* Root page of B-Tree */
drh43605152004-05-29 21:46:49 +00005474 MemPage *pChild; /* The only child page of pPage */
5475 Pgno pgnoChild; /* Page number for pChild */
drh2e38c322004-09-03 18:38:44 +00005476 int rc = SQLITE_OK; /* Return code from subprocedures */
danielk1977aef0bf62005-12-30 16:28:01 +00005477 BtShared *pBt; /* The main BTree structure */
drh2e38c322004-09-03 18:38:44 +00005478 int mxCellPerPage; /* Maximum number of cells per page */
5479 u8 **apCell; /* All cells from pages being balanced */
drha9121e42008-02-19 14:59:35 +00005480 u16 *szCell; /* Local size of all cells */
drh43605152004-05-29 21:46:49 +00005481
danielk197771d5d2c2008-09-29 11:49:47 +00005482 assert( pCur->iPage==0 );
5483 pPage = pCur->apPage[0];
5484
drh43605152004-05-29 21:46:49 +00005485 assert( pPage->nCell==0 );
drh1fee73e2007-08-29 04:00:57 +00005486 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh2e38c322004-09-03 18:38:44 +00005487 pBt = pPage->pBt;
5488 mxCellPerPage = MX_CELL(pBt);
drhe5ae5732008-06-15 02:51:47 +00005489 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
drh2e38c322004-09-03 18:38:44 +00005490 if( apCell==0 ) return SQLITE_NOMEM;
drha9121e42008-02-19 14:59:35 +00005491 szCell = (u16*)&apCell[mxCellPerPage];
drh43605152004-05-29 21:46:49 +00005492 if( pPage->leaf ){
5493 /* The table is completely empty */
5494 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5495 }else{
5496 /* The root page is empty but has one child. Transfer the
5497 ** information from that one child into the root page if it
5498 ** will fit. This reduces the depth of the tree by one.
5499 **
5500 ** If the root page is page 1, it has less space available than
5501 ** its child (due to the 100 byte header that occurs at the beginning
5502 ** of the database fle), so it might not be able to hold all of the
5503 ** information currently contained in the child. If this is the
5504 ** case, then do not do the transfer. Leave page 1 empty except
5505 ** for the right-pointer to the child page. The child page becomes
5506 ** the virtual root of the tree.
5507 */
drhf94a1732008-09-30 17:18:17 +00005508 VVA_ONLY( pCur->pagesShuffled = 1 );
drh43605152004-05-29 21:46:49 +00005509 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5510 assert( pgnoChild>0 );
danielk1977ad0132d2008-06-07 08:58:22 +00005511 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
drh16a9b832007-05-05 18:39:25 +00005512 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
drh2e38c322004-09-03 18:38:44 +00005513 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005514 if( pPage->pgno==1 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005515 rc = sqlite3BtreeInitPage(pChild);
drh2e38c322004-09-03 18:38:44 +00005516 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005517 assert( pChild->nOverflow==0 );
5518 if( pChild->nFree>=100 ){
5519 /* The child information will fit on the root page, so do the
5520 ** copy */
5521 int i;
5522 zeroPage(pPage, pChild->aData[0]);
5523 for(i=0; i<pChild->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00005524 apCell[i] = findCell(pChild,i);
drh43605152004-05-29 21:46:49 +00005525 szCell[i] = cellSizePtr(pChild, apCell[i]);
5526 }
5527 assemblePage(pPage, pChild->nCell, apCell, szCell);
danielk1977ae825582004-11-23 09:06:55 +00005528 /* Copy the right-pointer of the child to the parent. */
5529 put4byte(&pPage->aData[pPage->hdrOffset+8],
5530 get4byte(&pChild->aData[pChild->hdrOffset+8]));
drh43605152004-05-29 21:46:49 +00005531 freePage(pChild);
5532 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5533 }else{
5534 /* The child has more information that will fit on the root.
5535 ** The tree is already balanced. Do nothing. */
5536 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5537 }
5538 }else{
5539 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5540 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005541 rc = sqlite3BtreeInitPage(pPage);
drh43605152004-05-29 21:46:49 +00005542 assert( rc==SQLITE_OK );
5543 freePage(pChild);
5544 TRACE(("BALANCE: transfer child %d into root %d\n",
5545 pChild->pgno, pPage->pgno));
5546 }
danielk1977ac11ee62005-01-15 12:45:51 +00005547 assert( pPage->nOverflow==0 );
danielk197785d90ca2008-07-19 14:25:15 +00005548 if( ISAUTOVACUUM ){
danielk197700a696d2008-09-29 16:41:31 +00005549 rc = setChildPtrmaps(pPage);
danielk1977ac11ee62005-01-15 12:45:51 +00005550 }
drh43605152004-05-29 21:46:49 +00005551 releasePage(pChild);
5552 }
drh2e38c322004-09-03 18:38:44 +00005553end_shallow_balance:
drh17435752007-08-16 04:30:38 +00005554 sqlite3_free(apCell);
drh2e38c322004-09-03 18:38:44 +00005555 return rc;
drh43605152004-05-29 21:46:49 +00005556}
5557
5558
5559/*
5560** The root page is overfull
5561**
5562** When this happens, Create a new child page and copy the
5563** contents of the root into the child. Then make the root
5564** page an empty page with rightChild pointing to the new
5565** child. Finally, call balance_internal() on the new child
5566** to cause it to split.
5567*/
danielk197771d5d2c2008-09-29 11:49:47 +00005568static int balance_deeper(BtCursor *pCur){
drh43605152004-05-29 21:46:49 +00005569 int rc; /* Return value from subprocedures */
danielk197771d5d2c2008-09-29 11:49:47 +00005570 MemPage *pPage; /* Pointer to the root page */
drh43605152004-05-29 21:46:49 +00005571 MemPage *pChild; /* Pointer to a new child page */
5572 Pgno pgnoChild; /* Page number of the new child page */
danielk1977aef0bf62005-12-30 16:28:01 +00005573 BtShared *pBt; /* The BTree */
drh43605152004-05-29 21:46:49 +00005574 int usableSize; /* Total usable size of a page */
5575 u8 *data; /* Content of the parent page */
5576 u8 *cdata; /* Content of the child page */
5577 int hdr; /* Offset to page header in parent */
drh281b21d2008-08-22 12:57:08 +00005578 int cbrk; /* Offset to content of first cell in parent */
drh43605152004-05-29 21:46:49 +00005579
danielk197771d5d2c2008-09-29 11:49:47 +00005580 assert( pCur->iPage==0 );
5581 assert( pCur->apPage[0]->nOverflow>0 );
5582
drhf94a1732008-09-30 17:18:17 +00005583 VVA_ONLY( pCur->pagesShuffled = 1 );
danielk197771d5d2c2008-09-29 11:49:47 +00005584 pPage = pCur->apPage[0];
drh43605152004-05-29 21:46:49 +00005585 pBt = pPage->pBt;
drh1fee73e2007-08-29 04:00:57 +00005586 assert( sqlite3_mutex_held(pBt->mutex) );
drh4f0c5872007-03-26 22:05:01 +00005587 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
drh43605152004-05-29 21:46:49 +00005588 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00005589 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
drh43605152004-05-29 21:46:49 +00005590 usableSize = pBt->usableSize;
5591 data = pPage->aData;
5592 hdr = pPage->hdrOffset;
drh281b21d2008-08-22 12:57:08 +00005593 cbrk = get2byte(&data[hdr+5]);
drh43605152004-05-29 21:46:49 +00005594 cdata = pChild->aData;
5595 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
drh281b21d2008-08-22 12:57:08 +00005596 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
danielk197771d5d2c2008-09-29 11:49:47 +00005597
5598 rc = sqlite3BtreeInitPage(pChild);
5599 if( rc==SQLITE_OK ){
5600 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5601 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5602 pChild->nOverflow = pPage->nOverflow;
5603 if( pChild->nOverflow ){
5604 pChild->nFree = 0;
5605 }
5606 assert( pChild->nCell==pPage->nCell );
5607 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5608 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5609 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5610 if( ISAUTOVACUUM ){
danielk197771d5d2c2008-09-29 11:49:47 +00005611 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
danielk197771d5d2c2008-09-29 11:49:47 +00005612 if( rc==SQLITE_OK ){
danielk197700a696d2008-09-29 16:41:31 +00005613 rc = setChildPtrmaps(pChild);
danielk1977ac11ee62005-01-15 12:45:51 +00005614 }
5615 }
danielk197787c52b52008-07-19 11:49:07 +00005616 }
danielk19776b456a22005-03-21 04:04:02 +00005617
danielk197771d5d2c2008-09-29 11:49:47 +00005618 if( rc==SQLITE_OK ){
5619 pCur->iPage++;
5620 pCur->apPage[1] = pChild;
danielk1977bf93c562008-09-29 15:53:25 +00005621 pCur->aiIdx[0] = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005622 rc = balance_nonroot(pCur);
5623 }else{
5624 releasePage(pChild);
5625 }
5626
drh43605152004-05-29 21:46:49 +00005627 return rc;
5628}
5629
5630/*
danielk197771d5d2c2008-09-29 11:49:47 +00005631** The page that pCur currently points to has just been modified in
5632** some way. This function figures out if this modification means the
5633** tree needs to be balanced, and if so calls the appropriate balancing
5634** routine.
5635**
5636** Parameter isInsert is true if a new cell was just inserted into the
5637** page, or false otherwise.
drh43605152004-05-29 21:46:49 +00005638*/
danielk197771d5d2c2008-09-29 11:49:47 +00005639static int balance(BtCursor *pCur, int isInsert){
drh43605152004-05-29 21:46:49 +00005640 int rc = SQLITE_OK;
danielk197771d5d2c2008-09-29 11:49:47 +00005641 MemPage *pPage = pCur->apPage[pCur->iPage];
5642
drh1fee73e2007-08-29 04:00:57 +00005643 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00005644 if( pCur->iPage==0 ){
danielk19776e465eb2007-08-21 13:11:00 +00005645 rc = sqlite3PagerWrite(pPage->pDbPage);
5646 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005647 rc = balance_deeper(pCur);
drh43605152004-05-29 21:46:49 +00005648 }
danielk1977687566d2004-11-02 12:56:41 +00005649 if( rc==SQLITE_OK && pPage->nCell==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005650 rc = balance_shallower(pCur);
drh43605152004-05-29 21:46:49 +00005651 }
5652 }else{
danielk1977ac245ec2005-01-14 13:50:11 +00005653 if( pPage->nOverflow>0 ||
danielk197771d5d2c2008-09-29 11:49:47 +00005654 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5655 rc = balance_nonroot(pCur);
drh43605152004-05-29 21:46:49 +00005656 }
5657 }
5658 return rc;
5659}
5660
5661/*
drh8dcd7ca2004-08-08 19:43:29 +00005662** This routine checks all cursors that point to table pgnoRoot.
drh980b1a72006-08-16 16:42:48 +00005663** If any of those cursors were opened with wrFlag==0 in a different
5664** database connection (a database connection that shares the pager
5665** cache with the current connection) and that other connection
5666** is not in the ReadUncommmitted state, then this routine returns
5667** SQLITE_LOCKED.
danielk1977299b1872004-11-22 10:02:10 +00005668**
danielk19773588ceb2008-06-10 17:30:26 +00005669** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5670** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5671** blob cursors are used for both reading and writing.
5672**
5673** When pgnoRoot is the root page of an intkey table, this function is also
5674** responsible for invalidating incremental blob cursors when the table row
5675** on which they are opened is deleted or modified. Cursors are invalidated
5676** according to the following rules:
5677**
5678** 1) When BtreeClearTable() is called to completely delete the contents
5679** of a B-Tree table, pExclude is set to zero and parameter iRow is
5680** set to non-zero. In this case all incremental blob cursors open
5681** on the table rooted at pgnoRoot are invalidated.
5682**
5683** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5684** modify a table row via an SQL statement, pExclude is set to the
5685** write cursor used to do the modification and parameter iRow is set
5686** to the integer row id of the B-Tree entry being modified. Unless
5687** pExclude is itself an incremental blob cursor, then all incremental
5688** blob cursors open on row iRow of the B-Tree are invalidated.
5689**
5690** 3) If both pExclude and iRow are set to zero, no incremental blob
5691** cursors are invalidated.
drhf74b8d92002-09-01 23:20:45 +00005692*/
danielk19773588ceb2008-06-10 17:30:26 +00005693static int checkReadLocks(
5694 Btree *pBtree,
5695 Pgno pgnoRoot,
5696 BtCursor *pExclude,
5697 i64 iRow
5698){
danielk1977299b1872004-11-22 10:02:10 +00005699 BtCursor *p;
drh980b1a72006-08-16 16:42:48 +00005700 BtShared *pBt = pBtree->pBt;
drhe5fe6902007-12-07 18:55:28 +00005701 sqlite3 *db = pBtree->db;
drh1fee73e2007-08-29 04:00:57 +00005702 assert( sqlite3BtreeHoldsMutex(pBtree) );
danielk1977299b1872004-11-22 10:02:10 +00005703 for(p=pBt->pCursor; p; p=p->pNext){
drh980b1a72006-08-16 16:42:48 +00005704 if( p==pExclude ) continue;
drh980b1a72006-08-16 16:42:48 +00005705 if( p->pgnoRoot!=pgnoRoot ) continue;
danielk19773588ceb2008-06-10 17:30:26 +00005706#ifndef SQLITE_OMIT_INCRBLOB
5707 if( p->isIncrblobHandle && (
5708 (!pExclude && iRow)
5709 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5710 )){
5711 p->eState = CURSOR_INVALID;
5712 }
5713#endif
5714 if( p->eState!=CURSOR_VALID ) continue;
5715 if( p->wrFlag==0
5716#ifndef SQLITE_OMIT_INCRBLOB
5717 || p->isIncrblobHandle
5718#endif
5719 ){
drhe5fe6902007-12-07 18:55:28 +00005720 sqlite3 *dbOther = p->pBtree->db;
drh980b1a72006-08-16 16:42:48 +00005721 if( dbOther==0 ||
5722 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5723 return SQLITE_LOCKED;
5724 }
danielk1977299b1872004-11-22 10:02:10 +00005725 }
5726 }
drhf74b8d92002-09-01 23:20:45 +00005727 return SQLITE_OK;
5728}
5729
5730/*
drh3b7511c2001-05-26 13:15:44 +00005731** Insert a new record into the BTree. The key is given by (pKey,nKey)
5732** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00005733** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00005734** is left pointing at a random location.
5735**
5736** For an INTKEY table, only the nKey value of the key is used. pKey is
5737** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00005738*/
drh3aac2dd2004-04-26 14:10:20 +00005739int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00005740 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00005741 const void *pKey, i64 nKey, /* The key of the new record */
drhe4d90812007-03-29 05:51:49 +00005742 const void *pData, int nData, /* The data of the new record */
drhb026e052007-05-02 01:34:31 +00005743 int nZero, /* Number of extra 0 bytes to append to data */
drhe4d90812007-03-29 05:51:49 +00005744 int appendBias /* True if this is likely an append */
drh3b7511c2001-05-26 13:15:44 +00005745){
drh3b7511c2001-05-26 13:15:44 +00005746 int rc;
5747 int loc;
drh14acc042001-06-10 19:56:58 +00005748 int szNew;
danielk197771d5d2c2008-09-29 11:49:47 +00005749 int idx;
drh3b7511c2001-05-26 13:15:44 +00005750 MemPage *pPage;
drhd677b3d2007-08-20 22:48:41 +00005751 Btree *p = pCur->pBtree;
5752 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00005753 unsigned char *oldCell;
drh2e38c322004-09-03 18:38:44 +00005754 unsigned char *newCell = 0;
drh3b7511c2001-05-26 13:15:44 +00005755
drh1fee73e2007-08-29 04:00:57 +00005756 assert( cursorHoldsMutex(pCur) );
danielk1977aef0bf62005-12-30 16:28:01 +00005757 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005758 /* Must start a transaction before doing an insert */
drhd677b3d2007-08-20 22:48:41 +00005759 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005760 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005761 }
drhf74b8d92002-09-01 23:20:45 +00005762 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00005763 if( !pCur->wrFlag ){
5764 return SQLITE_PERM; /* Cursor not open for writing */
5765 }
danielk19773588ceb2008-06-10 17:30:26 +00005766 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
drhf74b8d92002-09-01 23:20:45 +00005767 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5768 }
drhfb982642007-08-30 01:19:59 +00005769 if( pCur->eState==CURSOR_FAULT ){
5770 return pCur->skip;
5771 }
danielk1977da184232006-01-05 11:34:32 +00005772
5773 /* Save the positions of any other cursors open on this table */
danielk1977be51a652008-10-08 17:58:48 +00005774 sqlite3BtreeClearCursor(pCur);
danielk19772e94d4d2006-01-09 05:36:27 +00005775 if(
danielk19772e94d4d2006-01-09 05:36:27 +00005776 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
drhe63d9992008-08-13 19:11:48 +00005777 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
danielk19772e94d4d2006-01-09 05:36:27 +00005778 ){
danielk1977da184232006-01-05 11:34:32 +00005779 return rc;
5780 }
5781
danielk197771d5d2c2008-09-29 11:49:47 +00005782 pPage = pCur->apPage[pCur->iPage];
drh4a1c3802004-05-12 15:15:47 +00005783 assert( pPage->intKey || nKey>=0 );
drh44845222008-07-17 18:39:57 +00005784 assert( pPage->leaf || !pPage->intKey );
drh3a4c1412004-05-09 20:40:11 +00005785 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5786 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5787 loc==0 ? "overwrite" : "new entry"));
danielk197771d5d2c2008-09-29 11:49:47 +00005788 assert( pPage->isInit );
danielk197752ae7242008-03-25 14:24:56 +00005789 allocateTempSpace(pBt);
5790 newCell = pBt->pTmpSpace;
drh2e38c322004-09-03 18:38:44 +00005791 if( newCell==0 ) return SQLITE_NOMEM;
drhb026e052007-05-02 01:34:31 +00005792 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
drh2e38c322004-09-03 18:38:44 +00005793 if( rc ) goto end_insert;
drh43605152004-05-29 21:46:49 +00005794 assert( szNew==cellSizePtr(pPage, newCell) );
drh2e38c322004-09-03 18:38:44 +00005795 assert( szNew<=MX_CELL_SIZE(pBt) );
danielk197771d5d2c2008-09-29 11:49:47 +00005796 idx = pCur->aiIdx[pCur->iPage];
danielk1977da184232006-01-05 11:34:32 +00005797 if( loc==0 && CURSOR_VALID==pCur->eState ){
drha9121e42008-02-19 14:59:35 +00005798 u16 szOld;
danielk197771d5d2c2008-09-29 11:49:47 +00005799 assert( idx<pPage->nCell );
danielk19776e465eb2007-08-21 13:11:00 +00005800 rc = sqlite3PagerWrite(pPage->pDbPage);
5801 if( rc ){
5802 goto end_insert;
5803 }
danielk197771d5d2c2008-09-29 11:49:47 +00005804 oldCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00005805 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005806 memcpy(newCell, oldCell, 4);
drh4b70f112004-05-02 21:12:19 +00005807 }
drh43605152004-05-29 21:46:49 +00005808 szOld = cellSizePtr(pPage, oldCell);
drh4b70f112004-05-02 21:12:19 +00005809 rc = clearCell(pPage, oldCell);
drh2e38c322004-09-03 18:38:44 +00005810 if( rc ) goto end_insert;
danielk197771d5d2c2008-09-29 11:49:47 +00005811 dropCell(pPage, idx, szOld);
drh7c717f72001-06-24 20:39:41 +00005812 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00005813 assert( pPage->leaf );
danielk197771d5d2c2008-09-29 11:49:47 +00005814 idx = ++pCur->aiIdx[pCur->iPage];
drh271efa52004-05-30 19:19:05 +00005815 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00005816 pCur->validNKey = 0;
drh14acc042001-06-10 19:56:58 +00005817 }else{
drh4b70f112004-05-02 21:12:19 +00005818 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00005819 }
danielk197771d5d2c2008-09-29 11:49:47 +00005820 rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
danielk1977e80463b2004-11-03 03:01:16 +00005821 if( rc!=SQLITE_OK ) goto end_insert;
danielk197771d5d2c2008-09-29 11:49:47 +00005822 rc = balance(pCur, 1);
danielk1977299b1872004-11-22 10:02:10 +00005823 if( rc==SQLITE_OK ){
5824 moveToRoot(pCur);
5825 }
drh2e38c322004-09-03 18:38:44 +00005826end_insert:
drh5e2f8b92001-05-28 00:41:15 +00005827 return rc;
5828}
5829
5830/*
drh4b70f112004-05-02 21:12:19 +00005831** Delete the entry that the cursor is pointing to. The cursor
drhf94a1732008-09-30 17:18:17 +00005832** is left pointing at a arbitrary location.
drh3b7511c2001-05-26 13:15:44 +00005833*/
drh3aac2dd2004-04-26 14:10:20 +00005834int sqlite3BtreeDelete(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00005835 MemPage *pPage = pCur->apPage[pCur->iPage];
5836 int idx;
drh4b70f112004-05-02 21:12:19 +00005837 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00005838 int rc;
danielk1977cfe9a692004-06-16 12:00:29 +00005839 Pgno pgnoChild = 0;
drhd677b3d2007-08-20 22:48:41 +00005840 Btree *p = pCur->pBtree;
5841 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00005842
drh1fee73e2007-08-29 04:00:57 +00005843 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00005844 assert( pPage->isInit );
danielk1977aef0bf62005-12-30 16:28:01 +00005845 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005846 /* Must start a transaction before doing a delete */
drhd677b3d2007-08-20 22:48:41 +00005847 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005848 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005849 }
drhf74b8d92002-09-01 23:20:45 +00005850 assert( !pBt->readOnly );
drhfb982642007-08-30 01:19:59 +00005851 if( pCur->eState==CURSOR_FAULT ){
5852 return pCur->skip;
5853 }
danielk197771d5d2c2008-09-29 11:49:47 +00005854 if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
drhbd03cae2001-06-02 02:40:57 +00005855 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5856 }
drhecdc7532001-09-23 02:35:53 +00005857 if( !pCur->wrFlag ){
5858 return SQLITE_PERM; /* Did not open this cursor for writing */
5859 }
danielk19773588ceb2008-06-10 17:30:26 +00005860 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
drhf74b8d92002-09-01 23:20:45 +00005861 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5862 }
danielk1977da184232006-01-05 11:34:32 +00005863
5864 /* Restore the current cursor position (a no-op if the cursor is not in
5865 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
danielk19773b8a05f2007-03-19 17:44:26 +00005866 ** open on the same table. Then call sqlite3PagerWrite() on the page
danielk1977da184232006-01-05 11:34:32 +00005867 ** that the entry will be deleted from.
5868 */
5869 if(
drha3460582008-07-11 21:02:53 +00005870 (rc = restoreCursorPosition(pCur))!=0 ||
drhd1167392006-01-23 13:00:35 +00005871 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
danielk19773b8a05f2007-03-19 17:44:26 +00005872 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
danielk1977da184232006-01-05 11:34:32 +00005873 ){
5874 return rc;
5875 }
danielk1977e6efa742004-11-10 11:55:10 +00005876
drh85b623f2007-12-13 21:54:09 +00005877 /* Locate the cell within its page and leave pCell pointing to the
danielk1977e6efa742004-11-10 11:55:10 +00005878 ** data. The clearCell() call frees any overflow pages associated with the
5879 ** cell. The cell itself is still intact.
5880 */
danielk197771d5d2c2008-09-29 11:49:47 +00005881 idx = pCur->aiIdx[pCur->iPage];
5882 pCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00005883 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005884 pgnoChild = get4byte(pCell);
drh4b70f112004-05-02 21:12:19 +00005885 }
danielk197728129562005-01-11 10:25:06 +00005886 rc = clearCell(pPage, pCell);
drhd677b3d2007-08-20 22:48:41 +00005887 if( rc ){
drhd677b3d2007-08-20 22:48:41 +00005888 return rc;
5889 }
danielk1977e6efa742004-11-10 11:55:10 +00005890
drh4b70f112004-05-02 21:12:19 +00005891 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00005892 /*
drh5e00f6c2001-09-13 13:46:56 +00005893 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00005894 ** do something we will leave a hole on an internal page.
5895 ** We have to fill the hole by moving in a cell from a leaf. The
5896 ** next Cell after the one to be deleted is guaranteed to exist and
danielk1977299b1872004-11-22 10:02:10 +00005897 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00005898 */
drh14acc042001-06-10 19:56:58 +00005899 BtCursor leafCur;
danielk197771d5d2c2008-09-29 11:49:47 +00005900 MemPage *pLeafPage;
danielk197771d5d2c2008-09-29 11:49:47 +00005901
drh4b70f112004-05-02 21:12:19 +00005902 unsigned char *pNext;
danielk1977299b1872004-11-22 10:02:10 +00005903 int notUsed;
danielk19776b456a22005-03-21 04:04:02 +00005904 unsigned char *tempCell = 0;
drh44845222008-07-17 18:39:57 +00005905 assert( !pPage->intKey );
drh16a9b832007-05-05 18:39:25 +00005906 sqlite3BtreeGetTempCursor(pCur, &leafCur);
danielk1977299b1872004-11-22 10:02:10 +00005907 rc = sqlite3BtreeNext(&leafCur, &notUsed);
danielk19776b456a22005-03-21 04:04:02 +00005908 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00005909 assert( leafCur.aiIdx[leafCur.iPage]==0 );
danielk197771d5d2c2008-09-29 11:49:47 +00005910 pLeafPage = leafCur.apPage[leafCur.iPage];
danielk197771d5d2c2008-09-29 11:49:47 +00005911 rc = sqlite3PagerWrite(pLeafPage->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00005912 }
5913 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00005914 int leafCursorInvalid = 0;
drha9121e42008-02-19 14:59:35 +00005915 u16 szNext;
danielk19776b456a22005-03-21 04:04:02 +00005916 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
danielk197771d5d2c2008-09-29 11:49:47 +00005917 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
5918 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
danielk19772f78fc62008-09-30 09:31:45 +00005919 pNext = findCell(pLeafPage, 0);
danielk197771d5d2c2008-09-29 11:49:47 +00005920 szNext = cellSizePtr(pLeafPage, pNext);
danielk19776b456a22005-03-21 04:04:02 +00005921 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
danielk197752ae7242008-03-25 14:24:56 +00005922 allocateTempSpace(pBt);
5923 tempCell = pBt->pTmpSpace;
danielk19776b456a22005-03-21 04:04:02 +00005924 if( tempCell==0 ){
5925 rc = SQLITE_NOMEM;
5926 }
danielk19778ea1cfa2008-01-01 06:19:02 +00005927 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00005928 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00005929 }
danielk19772f78fc62008-09-30 09:31:45 +00005930
drhf94a1732008-09-30 17:18:17 +00005931
5932 /* The "if" statement in the next code block is critical. The
5933 ** slightest error in that statement would allow SQLite to operate
5934 ** correctly most of the time but produce very rare failures. To
5935 ** guard against this, the following macros help to verify that
5936 ** the "if" statement is well tested.
5937 */
5938 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
5939 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5940 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
5941 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5942 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
5943 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5944 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
5945 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5946 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
5947 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
5948
5949
danielk19772f78fc62008-09-30 09:31:45 +00005950 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
5951 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
5952 ){
drhf94a1732008-09-30 17:18:17 +00005953 /* This branch is taken if the internal node is now either overflowing
5954 ** or underfull and the leaf node will be underfull after the just cell
danielk19772f78fc62008-09-30 09:31:45 +00005955 ** copied to the internal node is deleted from it. This is a special
5956 ** case because the call to balance() to correct the internal node
5957 ** may change the tree structure and invalidate the contents of
5958 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
5959 ** used by the balance() required to correct the underfull leaf
5960 ** node.
5961 **
5962 ** The formula used in the expression above are based on facets of
5963 ** the SQLite file-format that do not change over time.
5964 */
drhf94a1732008-09-30 17:18:17 +00005965 testcase( pPage->nFree==pBt->usableSize*2/3+1 );
5966 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
danielk19772f78fc62008-09-30 09:31:45 +00005967 leafCursorInvalid = 1;
5968 }
5969
danielk19778ea1cfa2008-01-01 06:19:02 +00005970 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00005971 put4byte(findOverflowCell(pPage, idx), pgnoChild);
drhf94a1732008-09-30 17:18:17 +00005972 VVA_ONLY( pCur->pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00005973 rc = balance(pCur, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00005974 }
danielk19772f78fc62008-09-30 09:31:45 +00005975
5976 if( rc==SQLITE_OK && leafCursorInvalid ){
5977 /* The leaf-node is now underfull and so the tree needs to be
5978 ** rebalanced. However, the balance() operation on the internal
5979 ** node above may have modified the structure of the B-Tree and
5980 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
5981 ** may not be trusted.
5982 **
5983 ** It is not possible to copy the ancestry from pCur, as the same
5984 ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
5985 ** arrays.
drh7b682802008-09-30 14:06:28 +00005986 **
5987 ** The call to saveCursorPosition() below internally saves the
5988 ** key that leafCur is currently pointing to. Currently, there
5989 ** are two copies of that key in the tree - one here on the leaf
5990 ** page and one on some internal node in the tree. The copy on
5991 ** the leaf node is always the next key in tree-order after the
5992 ** copy on the internal node. So, the call to sqlite3BtreeNext()
5993 ** calls restoreCursorPosition() to point the cursor to the copy
5994 ** stored on the internal node, then advances to the next entry,
5995 ** which happens to be the copy of the key on the internal node.
danielk1977a69fda22008-09-30 16:48:10 +00005996 ** Net effect: leafCur is pointing back to the duplicate cell
5997 ** that needs to be removed, and the leafCur.apPage[] and
5998 ** leafCur.aiIdx[] arrays are correct.
danielk19772f78fc62008-09-30 09:31:45 +00005999 */
drhf94a1732008-09-30 17:18:17 +00006000 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
danielk19772f78fc62008-09-30 09:31:45 +00006001 rc = saveCursorPosition(&leafCur);
6002 if( rc==SQLITE_OK ){
6003 rc = sqlite3BtreeNext(&leafCur, &notUsed);
6004 }
6005 pLeafPage = leafCur.apPage[leafCur.iPage];
6006 assert( pLeafPage->pgno==leafPgno );
6007 assert( leafCur.aiIdx[leafCur.iPage]==0 );
6008 }
6009
danielk19778ea1cfa2008-01-01 06:19:02 +00006010 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00006011 dropCell(pLeafPage, 0, szNext);
drhf94a1732008-09-30 17:18:17 +00006012 VVA_ONLY( leafCur.pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00006013 rc = balance(&leafCur, 0);
drhf94a1732008-09-30 17:18:17 +00006014 assert( leafCursorInvalid || !leafCur.pagesShuffled
6015 || !pCur->pagesShuffled );
danielk19778ea1cfa2008-01-01 06:19:02 +00006016 }
danielk19776b456a22005-03-21 04:04:02 +00006017 }
drh16a9b832007-05-05 18:39:25 +00006018 sqlite3BtreeReleaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00006019 }else{
danielk1977299b1872004-11-22 10:02:10 +00006020 TRACE(("DELETE: table=%d delete from leaf %d\n",
6021 pCur->pgnoRoot, pPage->pgno));
danielk197771d5d2c2008-09-29 11:49:47 +00006022 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6023 rc = balance(pCur, 0);
drh5e2f8b92001-05-28 00:41:15 +00006024 }
danielk19776b456a22005-03-21 04:04:02 +00006025 if( rc==SQLITE_OK ){
6026 moveToRoot(pCur);
6027 }
drh5e2f8b92001-05-28 00:41:15 +00006028 return rc;
drh3b7511c2001-05-26 13:15:44 +00006029}
drh8b2f49b2001-06-08 00:21:52 +00006030
6031/*
drhc6b52df2002-01-04 03:09:29 +00006032** Create a new BTree table. Write into *piTable the page
6033** number for the root page of the new table.
6034**
drhab01f612004-05-22 02:55:23 +00006035** The type of type is determined by the flags parameter. Only the
6036** following values of flags are currently in use. Other values for
6037** flags might not work:
6038**
6039** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6040** BTREE_ZERODATA Used for SQL indices
drh8b2f49b2001-06-08 00:21:52 +00006041*/
drhd677b3d2007-08-20 22:48:41 +00006042static int btreeCreateTable(Btree *p, int *piTable, int flags){
danielk1977aef0bf62005-12-30 16:28:01 +00006043 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006044 MemPage *pRoot;
6045 Pgno pgnoRoot;
6046 int rc;
drhd677b3d2007-08-20 22:48:41 +00006047
drh1fee73e2007-08-29 04:00:57 +00006048 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006049 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006050 /* Must start a transaction first */
drhd677b3d2007-08-20 22:48:41 +00006051 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6052 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006053 }
danielk197728129562005-01-11 10:25:06 +00006054 assert( !pBt->readOnly );
danielk1977e6efa742004-11-10 11:55:10 +00006055
danielk1977003ba062004-11-04 02:57:33 +00006056#ifdef SQLITE_OMIT_AUTOVACUUM
drh4f0c5872007-03-26 22:05:01 +00006057 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
drhd677b3d2007-08-20 22:48:41 +00006058 if( rc ){
6059 return rc;
6060 }
danielk1977003ba062004-11-04 02:57:33 +00006061#else
danielk1977687566d2004-11-02 12:56:41 +00006062 if( pBt->autoVacuum ){
danielk1977003ba062004-11-04 02:57:33 +00006063 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6064 MemPage *pPageMove; /* The page to move to. */
6065
danielk197720713f32007-05-03 11:43:33 +00006066 /* Creating a new table may probably require moving an existing database
6067 ** to make room for the new tables root page. In case this page turns
6068 ** out to be an overflow page, delete all overflow page-map caches
6069 ** held by open cursors.
6070 */
danielk197792d4d7a2007-05-04 12:05:56 +00006071 invalidateAllOverflowCache(pBt);
danielk197720713f32007-05-03 11:43:33 +00006072
danielk1977003ba062004-11-04 02:57:33 +00006073 /* Read the value of meta[3] from the database to determine where the
6074 ** root page of the new table should go. meta[3] is the largest root-page
6075 ** created so far, so the new root-page is (meta[3]+1).
6076 */
danielk1977aef0bf62005-12-30 16:28:01 +00006077 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
drhd677b3d2007-08-20 22:48:41 +00006078 if( rc!=SQLITE_OK ){
6079 return rc;
6080 }
danielk1977003ba062004-11-04 02:57:33 +00006081 pgnoRoot++;
6082
danielk1977599fcba2004-11-08 07:13:13 +00006083 /* The new root-page may not be allocated on a pointer-map page, or the
6084 ** PENDING_BYTE page.
6085 */
drh72190432008-01-31 14:54:43 +00006086 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
danielk1977599fcba2004-11-08 07:13:13 +00006087 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
danielk1977003ba062004-11-04 02:57:33 +00006088 pgnoRoot++;
6089 }
6090 assert( pgnoRoot>=3 );
6091
6092 /* Allocate a page. The page that currently resides at pgnoRoot will
6093 ** be moved to the allocated page (unless the allocated page happens
6094 ** to reside at pgnoRoot).
6095 */
drh4f0c5872007-03-26 22:05:01 +00006096 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
danielk1977003ba062004-11-04 02:57:33 +00006097 if( rc!=SQLITE_OK ){
danielk1977687566d2004-11-02 12:56:41 +00006098 return rc;
6099 }
danielk1977003ba062004-11-04 02:57:33 +00006100
6101 if( pgnoMove!=pgnoRoot ){
danielk1977f35843b2007-04-07 15:03:17 +00006102 /* pgnoRoot is the page that will be used for the root-page of
6103 ** the new table (assuming an error did not occur). But we were
6104 ** allocated pgnoMove. If required (i.e. if it was not allocated
6105 ** by extending the file), the current page at position pgnoMove
6106 ** is already journaled.
6107 */
danielk1977003ba062004-11-04 02:57:33 +00006108 u8 eType;
6109 Pgno iPtrPage;
6110
6111 releasePage(pPageMove);
danielk1977f35843b2007-04-07 15:03:17 +00006112
6113 /* Move the page currently at pgnoRoot to pgnoMove. */
drh16a9b832007-05-05 18:39:25 +00006114 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006115 if( rc!=SQLITE_OK ){
6116 return rc;
6117 }
6118 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
drhccae6022005-02-26 17:31:26 +00006119 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00006120 releasePage(pRoot);
6121 return rc;
6122 }
drhccae6022005-02-26 17:31:26 +00006123 assert( eType!=PTRMAP_ROOTPAGE );
6124 assert( eType!=PTRMAP_FREEPAGE );
danielk19773b8a05f2007-03-19 17:44:26 +00006125 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk19775fd057a2005-03-09 13:09:43 +00006126 if( rc!=SQLITE_OK ){
6127 releasePage(pRoot);
6128 return rc;
6129 }
danielk19774c999992008-07-16 18:17:55 +00006130 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
danielk1977003ba062004-11-04 02:57:33 +00006131 releasePage(pRoot);
danielk1977f35843b2007-04-07 15:03:17 +00006132
6133 /* Obtain the page at pgnoRoot */
danielk1977003ba062004-11-04 02:57:33 +00006134 if( rc!=SQLITE_OK ){
6135 return rc;
6136 }
drh16a9b832007-05-05 18:39:25 +00006137 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006138 if( rc!=SQLITE_OK ){
6139 return rc;
6140 }
danielk19773b8a05f2007-03-19 17:44:26 +00006141 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk1977003ba062004-11-04 02:57:33 +00006142 if( rc!=SQLITE_OK ){
6143 releasePage(pRoot);
6144 return rc;
6145 }
6146 }else{
6147 pRoot = pPageMove;
6148 }
6149
danielk197742741be2005-01-08 12:42:39 +00006150 /* Update the pointer-map and meta-data with the new root-page number. */
danielk1977003ba062004-11-04 02:57:33 +00006151 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6152 if( rc ){
6153 releasePage(pRoot);
6154 return rc;
6155 }
danielk1977aef0bf62005-12-30 16:28:01 +00006156 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00006157 if( rc ){
6158 releasePage(pRoot);
6159 return rc;
6160 }
danielk197742741be2005-01-08 12:42:39 +00006161
danielk1977003ba062004-11-04 02:57:33 +00006162 }else{
drh4f0c5872007-03-26 22:05:01 +00006163 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
danielk1977003ba062004-11-04 02:57:33 +00006164 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00006165 }
6166#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006167 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
drhde647132004-05-07 17:57:49 +00006168 zeroPage(pRoot, flags | PTF_LEAF);
danielk19773b8a05f2007-03-19 17:44:26 +00006169 sqlite3PagerUnref(pRoot->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00006170 *piTable = (int)pgnoRoot;
6171 return SQLITE_OK;
6172}
drhd677b3d2007-08-20 22:48:41 +00006173int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6174 int rc;
6175 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006176 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006177 rc = btreeCreateTable(p, piTable, flags);
6178 sqlite3BtreeLeave(p);
6179 return rc;
6180}
drh8b2f49b2001-06-08 00:21:52 +00006181
6182/*
6183** Erase the given database page and all its children. Return
6184** the page to the freelist.
6185*/
drh4b70f112004-05-02 21:12:19 +00006186static int clearDatabasePage(
danielk1977aef0bf62005-12-30 16:28:01 +00006187 BtShared *pBt, /* The BTree that contains the table */
drh4b70f112004-05-02 21:12:19 +00006188 Pgno pgno, /* Page number to clear */
6189 MemPage *pParent, /* Parent page. NULL for the root */
danielk1977c7af4842008-10-27 13:59:33 +00006190 int freePageFlag, /* Deallocate page if true */
6191 int *pnChange
drh4b70f112004-05-02 21:12:19 +00006192){
danielk19776b456a22005-03-21 04:04:02 +00006193 MemPage *pPage = 0;
drh8b2f49b2001-06-08 00:21:52 +00006194 int rc;
drh4b70f112004-05-02 21:12:19 +00006195 unsigned char *pCell;
6196 int i;
drh8b2f49b2001-06-08 00:21:52 +00006197
drh1fee73e2007-08-29 04:00:57 +00006198 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977ad0132d2008-06-07 08:58:22 +00006199 if( pgno>pagerPagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00006200 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00006201 }
6202
danielk197771d5d2c2008-09-29 11:49:47 +00006203 rc = getAndInitPage(pBt, pgno, &pPage);
danielk19776b456a22005-03-21 04:04:02 +00006204 if( rc ) goto cleardatabasepage_out;
drh4b70f112004-05-02 21:12:19 +00006205 for(i=0; i<pPage->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00006206 pCell = findCell(pPage, i);
drh4b70f112004-05-02 21:12:19 +00006207 if( !pPage->leaf ){
danielk1977c7af4842008-10-27 13:59:33 +00006208 rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006209 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006210 }
drh4b70f112004-05-02 21:12:19 +00006211 rc = clearCell(pPage, pCell);
danielk19776b456a22005-03-21 04:04:02 +00006212 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006213 }
drha34b6762004-05-07 13:30:42 +00006214 if( !pPage->leaf ){
danielk1977c7af4842008-10-27 13:59:33 +00006215 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006216 if( rc ) goto cleardatabasepage_out;
danielk1977c7af4842008-10-27 13:59:33 +00006217 }else if( pnChange ){
6218 assert( pPage->intKey );
6219 *pnChange += pPage->nCell;
drh2aa679f2001-06-25 02:11:07 +00006220 }
6221 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00006222 rc = freePage(pPage);
danielk19773b8a05f2007-03-19 17:44:26 +00006223 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
drh3a4c1412004-05-09 20:40:11 +00006224 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00006225 }
danielk19776b456a22005-03-21 04:04:02 +00006226
6227cleardatabasepage_out:
drh4b70f112004-05-02 21:12:19 +00006228 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00006229 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006230}
6231
6232/*
drhab01f612004-05-22 02:55:23 +00006233** Delete all information from a single table in the database. iTable is
6234** the page number of the root of the table. After this routine returns,
6235** the root page is empty, but still exists.
6236**
6237** This routine will fail with SQLITE_LOCKED if there are any open
6238** read cursors on the table. Open write cursors are moved to the
6239** root of the table.
danielk1977c7af4842008-10-27 13:59:33 +00006240**
6241** If pnChange is not NULL, then table iTable must be an intkey table. The
6242** integer value pointed to by pnChange is incremented by the number of
6243** entries in the table.
drh8b2f49b2001-06-08 00:21:52 +00006244*/
danielk1977c7af4842008-10-27 13:59:33 +00006245int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
drh8b2f49b2001-06-08 00:21:52 +00006246 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00006247 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00006248 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006249 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006250 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006251 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
danielk19773588ceb2008-06-10 17:30:26 +00006252 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006253 /* nothing to do */
6254 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6255 /* nothing to do */
6256 }else{
danielk1977c7af4842008-10-27 13:59:33 +00006257 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0, pnChange);
drh8b2f49b2001-06-08 00:21:52 +00006258 }
drhd677b3d2007-08-20 22:48:41 +00006259 sqlite3BtreeLeave(p);
6260 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006261}
6262
6263/*
6264** Erase all information in a table and add the root of the table to
** the freelist.  Except, the root of the principal table (the one on
drhab01f612004-05-22 02:55:23 +00006266** page 1) is never added to the freelist.
6267**
6268** This routine will fail with SQLITE_LOCKED if there are any open
6269** cursors on the table.
drh205f48e2004-11-05 00:43:11 +00006270**
6271** If AUTOVACUUM is enabled and the page at iTable is not the last
6272** root page in the database file, then the last root page
6273** in the database file is moved into the slot formerly occupied by
6274** iTable and that last slot formerly occupied by the last root page
** is added to the freelist instead of iTable.  In this way, all
6276** root pages are kept at the beginning of the database file, which
6277** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6278** page number that used to be the last root page in the file before
6279** the move. If no page gets moved, *piMoved is set to 0.
6280** The last root page is recorded in meta[3] and the value of
6281** meta[3] is updated by this procedure.
drh8b2f49b2001-06-08 00:21:52 +00006282*/
drhd677b3d2007-08-20 22:48:41 +00006283static int btreeDropTable(Btree *p, int iTable, int *piMoved){
drh8b2f49b2001-06-08 00:21:52 +00006284 int rc;
danielk1977a0bf2652004-11-04 14:30:04 +00006285 MemPage *pPage = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00006286 BtShared *pBt = p->pBt;
danielk1977a0bf2652004-11-04 14:30:04 +00006287
drh1fee73e2007-08-29 04:00:57 +00006288 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006289 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006290 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00006291 }
danielk1977a0bf2652004-11-04 14:30:04 +00006292
danielk1977e6efa742004-11-10 11:55:10 +00006293 /* It is illegal to drop a table if any cursors are open on the
6294 ** database. This is because in auto-vacuum mode the backend may
6295 ** need to move another root-page to fill a gap left by the deleted
6296 ** root page. If an open cursor was using this page a problem would
6297 ** occur.
6298 */
6299 if( pBt->pCursor ){
6300 return SQLITE_LOCKED;
drh5df72a52002-06-06 23:16:05 +00006301 }
danielk1977a0bf2652004-11-04 14:30:04 +00006302
drh16a9b832007-05-05 18:39:25 +00006303 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
drh2aa679f2001-06-25 02:11:07 +00006304 if( rc ) return rc;
danielk1977c7af4842008-10-27 13:59:33 +00006305 rc = sqlite3BtreeClearTable(p, iTable, 0);
danielk19776b456a22005-03-21 04:04:02 +00006306 if( rc ){
6307 releasePage(pPage);
6308 return rc;
6309 }
danielk1977a0bf2652004-11-04 14:30:04 +00006310
drh205f48e2004-11-05 00:43:11 +00006311 *piMoved = 0;
danielk1977a0bf2652004-11-04 14:30:04 +00006312
drh4b70f112004-05-02 21:12:19 +00006313 if( iTable>1 ){
danielk1977a0bf2652004-11-04 14:30:04 +00006314#ifdef SQLITE_OMIT_AUTOVACUUM
drha34b6762004-05-07 13:30:42 +00006315 rc = freePage(pPage);
danielk1977a0bf2652004-11-04 14:30:04 +00006316 releasePage(pPage);
6317#else
6318 if( pBt->autoVacuum ){
6319 Pgno maxRootPgno;
danielk1977aef0bf62005-12-30 16:28:01 +00006320 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006321 if( rc!=SQLITE_OK ){
6322 releasePage(pPage);
6323 return rc;
6324 }
6325
6326 if( iTable==maxRootPgno ){
6327 /* If the table being dropped is the table with the largest root-page
6328 ** number in the database, put the root page on the free list.
6329 */
6330 rc = freePage(pPage);
6331 releasePage(pPage);
6332 if( rc!=SQLITE_OK ){
6333 return rc;
6334 }
6335 }else{
6336 /* The table being dropped does not have the largest root-page
6337 ** number in the database. So move the page that does into the
6338 ** gap left by the deleted root-page.
6339 */
6340 MemPage *pMove;
6341 releasePage(pPage);
drh16a9b832007-05-05 18:39:25 +00006342 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006343 if( rc!=SQLITE_OK ){
6344 return rc;
6345 }
danielk19774c999992008-07-16 18:17:55 +00006346 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006347 releasePage(pMove);
6348 if( rc!=SQLITE_OK ){
6349 return rc;
6350 }
drh16a9b832007-05-05 18:39:25 +00006351 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006352 if( rc!=SQLITE_OK ){
6353 return rc;
6354 }
6355 rc = freePage(pMove);
6356 releasePage(pMove);
6357 if( rc!=SQLITE_OK ){
6358 return rc;
6359 }
6360 *piMoved = maxRootPgno;
6361 }
6362
danielk1977599fcba2004-11-08 07:13:13 +00006363 /* Set the new 'max-root-page' value in the database header. This
6364 ** is the old value less one, less one more if that happens to
6365 ** be a root-page number, less one again if that is the
6366 ** PENDING_BYTE_PAGE.
6367 */
danielk197787a6e732004-11-05 12:58:25 +00006368 maxRootPgno--;
danielk1977599fcba2004-11-08 07:13:13 +00006369 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6370 maxRootPgno--;
6371 }
danielk1977266664d2006-02-10 08:24:21 +00006372 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
danielk197787a6e732004-11-05 12:58:25 +00006373 maxRootPgno--;
6374 }
danielk1977599fcba2004-11-08 07:13:13 +00006375 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6376
danielk1977aef0bf62005-12-30 16:28:01 +00006377 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006378 }else{
6379 rc = freePage(pPage);
6380 releasePage(pPage);
6381 }
6382#endif
drh2aa679f2001-06-25 02:11:07 +00006383 }else{
danielk1977a0bf2652004-11-04 14:30:04 +00006384 /* If sqlite3BtreeDropTable was called on page 1. */
drha34b6762004-05-07 13:30:42 +00006385 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
danielk1977a0bf2652004-11-04 14:30:04 +00006386 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00006387 }
drh8b2f49b2001-06-08 00:21:52 +00006388 return rc;
6389}
drhd677b3d2007-08-20 22:48:41 +00006390int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6391 int rc;
6392 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006393 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006394 rc = btreeDropTable(p, iTable, piMoved);
6395 sqlite3BtreeLeave(p);
6396 return rc;
6397}
drh8b2f49b2001-06-08 00:21:52 +00006398
drh001bbcb2003-03-19 03:14:00 +00006399
drh8b2f49b2001-06-08 00:21:52 +00006400/*
drh23e11ca2004-05-04 17:27:28 +00006401** Read the meta-information out of a database file. Meta[0]
6402** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00006403** through meta[15] are available for use by higher layers. Meta[0]
6404** is read-only, the others are read/write.
6405**
6406** The schema layer numbers meta values differently. At the schema
6407** layer (and the SetCookie and ReadCookie opcodes) the number of
6408** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00006409*/
danielk1977aef0bf62005-12-30 16:28:01 +00006410int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
danielk19773b8a05f2007-03-19 17:44:26 +00006411 DbPage *pDbPage;
drh8b2f49b2001-06-08 00:21:52 +00006412 int rc;
drh4b70f112004-05-02 21:12:19 +00006413 unsigned char *pP1;
danielk1977aef0bf62005-12-30 16:28:01 +00006414 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006415
drhd677b3d2007-08-20 22:48:41 +00006416 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006417 pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006418
danielk1977da184232006-01-05 11:34:32 +00006419 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6420 ** the sqlite_master table. We grab this lock regardless of whether or
6421 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6422 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6423 */
6424 rc = queryTableLock(p, 1, READ_LOCK);
6425 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006426 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006427 return rc;
6428 }
6429
drh23e11ca2004-05-04 17:27:28 +00006430 assert( idx>=0 && idx<=15 );
danielk1977d9f6c532008-09-19 16:39:38 +00006431 if( pBt->pPage1 ){
6432 /* The b-tree is already holding a reference to page 1 of the database
6433 ** file. In this case the required meta-data value can be read directly
6434 ** from the page data of this reference. This is slightly faster than
6435 ** requesting a new reference from the pager layer.
6436 */
6437 pP1 = (unsigned char *)pBt->pPage1->aData;
6438 }else{
6439 /* The b-tree does not have a reference to page 1 of the database file.
6440 ** Obtain one from the pager layer.
6441 */
danielk1977ea897302008-09-19 15:10:58 +00006442 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6443 if( rc ){
6444 sqlite3BtreeLeave(p);
6445 return rc;
6446 }
6447 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
drhd677b3d2007-08-20 22:48:41 +00006448 }
drh23e11ca2004-05-04 17:27:28 +00006449 *pMeta = get4byte(&pP1[36 + idx*4]);
danielk1977ea897302008-09-19 15:10:58 +00006450
danielk1977d9f6c532008-09-19 16:39:38 +00006451 /* If the b-tree is not holding a reference to page 1, then one was
6452 ** requested from the pager layer in the above block. Release it now.
6453 */
danielk1977ea897302008-09-19 15:10:58 +00006454 if( !pBt->pPage1 ){
6455 sqlite3PagerUnref(pDbPage);
6456 }
drhae157872004-08-14 19:20:09 +00006457
danielk1977599fcba2004-11-08 07:13:13 +00006458 /* If autovacuumed is disabled in this build but we are trying to
6459 ** access an autovacuumed database, then make the database readonly.
6460 */
danielk1977003ba062004-11-04 02:57:33 +00006461#ifdef SQLITE_OMIT_AUTOVACUUM
drhae157872004-08-14 19:20:09 +00006462 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
danielk1977003ba062004-11-04 02:57:33 +00006463#endif
drhae157872004-08-14 19:20:09 +00006464
danielk1977da184232006-01-05 11:34:32 +00006465 /* Grab the read-lock on page 1. */
6466 rc = lockTable(p, 1, READ_LOCK);
drhd677b3d2007-08-20 22:48:41 +00006467 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006468 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006469}
6470
6471/*
drh23e11ca2004-05-04 17:27:28 +00006472** Write meta-information back into the database. Meta[0] is
6473** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00006474*/
danielk1977aef0bf62005-12-30 16:28:01 +00006475int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6476 BtShared *pBt = p->pBt;
drh4b70f112004-05-02 21:12:19 +00006477 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00006478 int rc;
drh23e11ca2004-05-04 17:27:28 +00006479 assert( idx>=1 && idx<=15 );
drhd677b3d2007-08-20 22:48:41 +00006480 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006481 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006482 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006483 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6484 }else{
6485 assert( pBt->pPage1!=0 );
6486 pP1 = pBt->pPage1->aData;
6487 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6488 if( rc==SQLITE_OK ){
6489 put4byte(&pP1[36 + idx*4], iMeta);
danielk19774152e672007-09-12 17:01:45 +00006490#ifndef SQLITE_OMIT_AUTOVACUUM
drhd677b3d2007-08-20 22:48:41 +00006491 if( idx==7 ){
6492 assert( pBt->autoVacuum || iMeta==0 );
6493 assert( iMeta==0 || iMeta==1 );
6494 pBt->incrVacuum = iMeta;
6495 }
danielk19774152e672007-09-12 17:01:45 +00006496#endif
drhd677b3d2007-08-20 22:48:41 +00006497 }
drh5df72a52002-06-06 23:16:05 +00006498 }
drhd677b3d2007-08-20 22:48:41 +00006499 sqlite3BtreeLeave(p);
6500 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006501}
drh8c42ca92001-06-22 19:15:00 +00006502
drhf328bc82004-05-10 23:29:49 +00006503/*
6504** Return the flag byte at the beginning of the page that the cursor
6505** is currently pointing to.
6506*/
6507int sqlite3BtreeFlags(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00006508 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
drha3460582008-07-11 21:02:53 +00006509 ** restoreCursorPosition() here.
danielk1977da184232006-01-05 11:34:32 +00006510 */
danielk1977e448dc42008-01-02 11:50:51 +00006511 MemPage *pPage;
drha3460582008-07-11 21:02:53 +00006512 restoreCursorPosition(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00006513 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00006514 assert( cursorHoldsMutex(pCur) );
drhd0679ed2007-08-28 22:24:34 +00006515 assert( pPage->pBt==pCur->pBt );
drhf328bc82004-05-10 23:29:49 +00006516 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6517}
6518
drhdd793422001-06-28 01:54:48 +00006519
drhdd793422001-06-28 01:54:48 +00006520/*
drh5eddca62001-06-30 21:53:53 +00006521** Return the pager associated with a BTree. This routine is used for
6522** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00006523*/
danielk1977aef0bf62005-12-30 16:28:01 +00006524Pager *sqlite3BtreePager(Btree *p){
6525 return p->pBt->pPager;
drhdd793422001-06-28 01:54:48 +00006526}
drh5eddca62001-06-30 21:53:53 +00006527
drhb7f91642004-10-31 02:22:47 +00006528#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006529/*
6530** Append a message to the error message string.
6531*/
drh2e38c322004-09-03 18:38:44 +00006532static void checkAppendMsg(
6533 IntegrityCk *pCheck,
6534 char *zMsg1,
6535 const char *zFormat,
6536 ...
6537){
6538 va_list ap;
drh1dcdbc02007-01-27 02:24:54 +00006539 if( !pCheck->mxErr ) return;
6540 pCheck->mxErr--;
6541 pCheck->nErr++;
drh2e38c322004-09-03 18:38:44 +00006542 va_start(ap, zFormat);
drhf089aa42008-07-08 19:34:06 +00006543 if( pCheck->errMsg.nChar ){
6544 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
drh5eddca62001-06-30 21:53:53 +00006545 }
drhf089aa42008-07-08 19:34:06 +00006546 if( zMsg1 ){
6547 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6548 }
6549 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6550 va_end(ap);
drhc890fec2008-08-01 20:10:08 +00006551 if( pCheck->errMsg.mallocFailed ){
6552 pCheck->mallocFailed = 1;
6553 }
drh5eddca62001-06-30 21:53:53 +00006554}
drhb7f91642004-10-31 02:22:47 +00006555#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006556
drhb7f91642004-10-31 02:22:47 +00006557#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006558/*
6559** Add 1 to the reference count for page iPage. If this is the second
6560** reference to the page, add an error message to pCheck->zErrMsg.
6561** Return 1 if there are 2 ore more references to the page and 0 if
6562** if this is the first reference to the page.
6563**
6564** Also check that the page number is in bounds.
6565*/
drhaaab5722002-02-19 13:39:21 +00006566static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00006567 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00006568 if( iPage>pCheck->nPage || iPage<0 ){
drh2e38c322004-09-03 18:38:44 +00006569 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006570 return 1;
6571 }
6572 if( pCheck->anRef[iPage]==1 ){
drh2e38c322004-09-03 18:38:44 +00006573 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006574 return 1;
6575 }
6576 return (pCheck->anRef[iPage]++)>1;
6577}
6578
danielk1977afcdd022004-10-31 16:25:42 +00006579#ifndef SQLITE_OMIT_AUTOVACUUM
6580/*
6581** Check that the entry in the pointer-map for page iChild maps to
6582** page iParent, pointer type ptrType. If not, append an error message
6583** to pCheck.
6584*/
6585static void checkPtrmap(
6586 IntegrityCk *pCheck, /* Integrity check context */
6587 Pgno iChild, /* Child page number */
6588 u8 eType, /* Expected pointer map type */
6589 Pgno iParent, /* Expected pointer map parent page number */
6590 char *zContext /* Context description (used for error msg) */
6591){
6592 int rc;
6593 u8 ePtrmapType;
6594 Pgno iPtrmapParent;
6595
6596 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6597 if( rc!=SQLITE_OK ){
6598 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6599 return;
6600 }
6601
6602 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6603 checkAppendMsg(pCheck, zContext,
6604 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6605 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6606 }
6607}
6608#endif
6609
drh5eddca62001-06-30 21:53:53 +00006610/*
6611** Check the integrity of the freelist or of an overflow page list.
6612** Verify that the number of pages on the list is N.
6613*/
drh30e58752002-03-02 20:41:57 +00006614static void checkList(
6615 IntegrityCk *pCheck, /* Integrity checking context */
6616 int isFreeList, /* True for a freelist. False for overflow page list */
6617 int iPage, /* Page number for first page in the list */
6618 int N, /* Expected number of pages in the list */
6619 char *zContext /* Context for error messages */
6620){
6621 int i;
drh3a4c1412004-05-09 20:40:11 +00006622 int expected = N;
6623 int iFirst = iPage;
drh1dcdbc02007-01-27 02:24:54 +00006624 while( N-- > 0 && pCheck->mxErr ){
danielk19773b8a05f2007-03-19 17:44:26 +00006625 DbPage *pOvflPage;
6626 unsigned char *pOvflData;
drh5eddca62001-06-30 21:53:53 +00006627 if( iPage<1 ){
drh2e38c322004-09-03 18:38:44 +00006628 checkAppendMsg(pCheck, zContext,
6629 "%d of %d pages missing from overflow list starting at %d",
drh3a4c1412004-05-09 20:40:11 +00006630 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00006631 break;
6632 }
6633 if( checkRef(pCheck, iPage, zContext) ) break;
danielk19773b8a05f2007-03-19 17:44:26 +00006634 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
drh2e38c322004-09-03 18:38:44 +00006635 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006636 break;
6637 }
danielk19773b8a05f2007-03-19 17:44:26 +00006638 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
drh30e58752002-03-02 20:41:57 +00006639 if( isFreeList ){
danielk19773b8a05f2007-03-19 17:44:26 +00006640 int n = get4byte(&pOvflData[4]);
danielk1977687566d2004-11-02 12:56:41 +00006641#ifndef SQLITE_OMIT_AUTOVACUUM
6642 if( pCheck->pBt->autoVacuum ){
6643 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6644 }
6645#endif
drh45b1fac2008-07-04 17:52:42 +00006646 if( n>pCheck->pBt->usableSize/4-2 ){
drh2e38c322004-09-03 18:38:44 +00006647 checkAppendMsg(pCheck, zContext,
6648 "freelist leaf count too big on page %d", iPage);
drhee696e22004-08-30 16:52:17 +00006649 N--;
6650 }else{
6651 for(i=0; i<n; i++){
danielk19773b8a05f2007-03-19 17:44:26 +00006652 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
danielk1977687566d2004-11-02 12:56:41 +00006653#ifndef SQLITE_OMIT_AUTOVACUUM
6654 if( pCheck->pBt->autoVacuum ){
6655 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6656 }
6657#endif
6658 checkRef(pCheck, iFreePage, zContext);
drhee696e22004-08-30 16:52:17 +00006659 }
6660 N -= n;
drh30e58752002-03-02 20:41:57 +00006661 }
drh30e58752002-03-02 20:41:57 +00006662 }
danielk1977afcdd022004-10-31 16:25:42 +00006663#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006664 else{
6665 /* If this database supports auto-vacuum and iPage is not the last
6666 ** page in this overflow list, check that the pointer-map entry for
6667 ** the following page matches iPage.
6668 */
6669 if( pCheck->pBt->autoVacuum && N>0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00006670 i = get4byte(pOvflData);
danielk1977687566d2004-11-02 12:56:41 +00006671 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6672 }
danielk1977afcdd022004-10-31 16:25:42 +00006673 }
6674#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006675 iPage = get4byte(pOvflData);
6676 sqlite3PagerUnref(pOvflPage);
drh5eddca62001-06-30 21:53:53 +00006677 }
6678}
drhb7f91642004-10-31 02:22:47 +00006679#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006680
drhb7f91642004-10-31 02:22:47 +00006681#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006682/*
6683** Do various sanity checks on a single page of a tree. Return
6684** the tree depth. Root pages return 0. Parents of root pages
6685** return 1, and so forth.
6686**
6687** These checks are done:
6688**
6689** 1. Make sure that cells and freeblocks do not overlap
6690** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00006691** NO 2. Make sure cell keys are in order.
6692** NO 3. Make sure no key is less than or equal to zLowerBound.
6693** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00006694** 5. Check the integrity of overflow pages.
6695** 6. Recursively call checkTreePage on all children.
6696** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00006697** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00006698** the root of the tree.
6699*/
6700static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00006701 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00006702 int iPage, /* Page number of the page to check */
6703 MemPage *pParent, /* Parent page */
drh74161702006-02-24 02:53:49 +00006704 char *zParentContext /* Parent context */
drh5eddca62001-06-30 21:53:53 +00006705){
6706 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00006707 int i, rc, depth, d2, pgno, cnt;
drh43605152004-05-29 21:46:49 +00006708 int hdr, cellStart;
6709 int nCell;
drhda200cc2004-05-09 11:51:38 +00006710 u8 *data;
danielk1977aef0bf62005-12-30 16:28:01 +00006711 BtShared *pBt;
drh4f26bb62005-09-08 14:17:20 +00006712 int usableSize;
drh5eddca62001-06-30 21:53:53 +00006713 char zContext[100];
drh2e38c322004-09-03 18:38:44 +00006714 char *hit;
drh5eddca62001-06-30 21:53:53 +00006715
drh5bb3eb92007-05-04 13:15:55 +00006716 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
danielk1977ef73ee92004-11-06 12:26:07 +00006717
drh5eddca62001-06-30 21:53:53 +00006718 /* Check that the page exists
6719 */
drhd9cb6ac2005-10-20 07:28:17 +00006720 pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00006721 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00006722 if( iPage==0 ) return 0;
6723 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh16a9b832007-05-05 18:39:25 +00006724 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
drh2e38c322004-09-03 18:38:44 +00006725 checkAppendMsg(pCheck, zContext,
6726 "unable to get the page. error code=%d", rc);
drh5eddca62001-06-30 21:53:53 +00006727 return 0;
6728 }
danielk197771d5d2c2008-09-29 11:49:47 +00006729 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
drh16a9b832007-05-05 18:39:25 +00006730 checkAppendMsg(pCheck, zContext,
6731 "sqlite3BtreeInitPage() returns error code %d", rc);
drh91025292004-05-03 19:49:32 +00006732 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00006733 return 0;
6734 }
6735
6736 /* Check out all the cells.
6737 */
6738 depth = 0;
drh1dcdbc02007-01-27 02:24:54 +00006739 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
drh6f11bef2004-05-13 01:12:56 +00006740 u8 *pCell;
6741 int sz;
6742 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00006743
6744 /* Check payload overflow pages
6745 */
drh5bb3eb92007-05-04 13:15:55 +00006746 sqlite3_snprintf(sizeof(zContext), zContext,
6747 "On tree page %d cell %d: ", iPage, i);
danielk19771cc5ed82007-05-16 17:28:43 +00006748 pCell = findCell(pPage,i);
drh16a9b832007-05-05 18:39:25 +00006749 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00006750 sz = info.nData;
6751 if( !pPage->intKey ) sz += info.nKey;
drh72365832007-03-06 15:53:44 +00006752 assert( sz==info.nPayload );
drh6f11bef2004-05-13 01:12:56 +00006753 if( sz>info.nLocal ){
drhb6f41482004-05-14 01:58:11 +00006754 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
danielk1977afcdd022004-10-31 16:25:42 +00006755 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6756#ifndef SQLITE_OMIT_AUTOVACUUM
6757 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006758 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
danielk1977afcdd022004-10-31 16:25:42 +00006759 }
6760#endif
6761 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00006762 }
6763
6764 /* Check sanity of left child page.
6765 */
drhda200cc2004-05-09 11:51:38 +00006766 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006767 pgno = get4byte(pCell);
danielk1977afcdd022004-10-31 16:25:42 +00006768#ifndef SQLITE_OMIT_AUTOVACUUM
6769 if( pBt->autoVacuum ){
6770 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6771 }
6772#endif
drh74161702006-02-24 02:53:49 +00006773 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
drhda200cc2004-05-09 11:51:38 +00006774 if( i>0 && d2!=depth ){
6775 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6776 }
6777 depth = d2;
drh5eddca62001-06-30 21:53:53 +00006778 }
drh5eddca62001-06-30 21:53:53 +00006779 }
drhda200cc2004-05-09 11:51:38 +00006780 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006781 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh5bb3eb92007-05-04 13:15:55 +00006782 sqlite3_snprintf(sizeof(zContext), zContext,
6783 "On page %d at right child: ", iPage);
danielk1977afcdd022004-10-31 16:25:42 +00006784#ifndef SQLITE_OMIT_AUTOVACUUM
6785 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006786 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
danielk1977afcdd022004-10-31 16:25:42 +00006787 }
6788#endif
drh74161702006-02-24 02:53:49 +00006789 checkTreePage(pCheck, pgno, pPage, zContext);
drhda200cc2004-05-09 11:51:38 +00006790 }
drh5eddca62001-06-30 21:53:53 +00006791
6792 /* Check for complete coverage of the page
6793 */
drhda200cc2004-05-09 11:51:38 +00006794 data = pPage->aData;
6795 hdr = pPage->hdrOffset;
drhf7141992008-06-19 00:16:08 +00006796 hit = sqlite3PageMalloc( pBt->pageSize );
drhc890fec2008-08-01 20:10:08 +00006797 if( hit==0 ){
6798 pCheck->mallocFailed = 1;
6799 }else{
drhf7141992008-06-19 00:16:08 +00006800 memset(hit, 0, usableSize );
drh2e38c322004-09-03 18:38:44 +00006801 memset(hit, 1, get2byte(&data[hdr+5]));
6802 nCell = get2byte(&data[hdr+3]);
6803 cellStart = hdr + 12 - 4*pPage->leaf;
6804 for(i=0; i<nCell; i++){
6805 int pc = get2byte(&data[cellStart+i*2]);
danielk1977daca5432008-08-25 11:57:16 +00006806 u16 size = 1024;
drh2e38c322004-09-03 18:38:44 +00006807 int j;
danielk1977daca5432008-08-25 11:57:16 +00006808 if( pc<=usableSize ){
6809 size = cellSizePtr(pPage, &data[pc]);
6810 }
danielk19777701e812005-01-10 12:59:51 +00006811 if( (pc+size-1)>=usableSize || pc<0 ){
6812 checkAppendMsg(pCheck, 0,
6813 "Corruption detected in cell %d on page %d",i,iPage,0);
6814 }else{
6815 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6816 }
drh2e38c322004-09-03 18:38:44 +00006817 }
6818 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6819 cnt++){
6820 int size = get2byte(&data[i+2]);
6821 int j;
danielk19777701e812005-01-10 12:59:51 +00006822 if( (i+size-1)>=usableSize || i<0 ){
6823 checkAppendMsg(pCheck, 0,
6824 "Corruption detected in cell %d on page %d",i,iPage,0);
6825 }else{
6826 for(j=i+size-1; j>=i; j--) hit[j]++;
6827 }
drh2e38c322004-09-03 18:38:44 +00006828 i = get2byte(&data[i]);
6829 }
6830 for(i=cnt=0; i<usableSize; i++){
6831 if( hit[i]==0 ){
6832 cnt++;
6833 }else if( hit[i]>1 ){
6834 checkAppendMsg(pCheck, 0,
6835 "Multiple uses for byte %d of page %d", i, iPage);
6836 break;
6837 }
6838 }
6839 if( cnt!=data[hdr+7] ){
6840 checkAppendMsg(pCheck, 0,
6841 "Fragmented space is %d byte reported as %d on page %d",
6842 cnt, data[hdr+7], iPage);
drh5eddca62001-06-30 21:53:53 +00006843 }
6844 }
drhf7141992008-06-19 00:16:08 +00006845 sqlite3PageFree(hit);
drh6019e162001-07-02 17:51:45 +00006846
drh4b70f112004-05-02 21:12:19 +00006847 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00006848 return depth+1;
drh5eddca62001-06-30 21:53:53 +00006849}
drhb7f91642004-10-31 02:22:47 +00006850#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006851
drhb7f91642004-10-31 02:22:47 +00006852#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006853/*
6854** This routine does a complete check of the given BTree file. aRoot[] is
6855** an array of pages numbers were each page number is the root page of
6856** a table. nRoot is the number of entries in aRoot.
6857**
drhc890fec2008-08-01 20:10:08 +00006858** Write the number of error seen in *pnErr. Except for some memory
6859** allocation errors, nn error message is held in memory obtained from
6860** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
6861** returned.
drh5eddca62001-06-30 21:53:53 +00006862*/
drh1dcdbc02007-01-27 02:24:54 +00006863char *sqlite3BtreeIntegrityCheck(
6864 Btree *p, /* The btree to be checked */
6865 int *aRoot, /* An array of root pages numbers for individual trees */
6866 int nRoot, /* Number of entries in aRoot[] */
6867 int mxErr, /* Stop reporting errors after this many */
6868 int *pnErr /* Write number of errors seen to this variable */
6869){
drh5eddca62001-06-30 21:53:53 +00006870 int i;
6871 int nRef;
drhaaab5722002-02-19 13:39:21 +00006872 IntegrityCk sCheck;
danielk1977aef0bf62005-12-30 16:28:01 +00006873 BtShared *pBt = p->pBt;
drhf089aa42008-07-08 19:34:06 +00006874 char zErr[100];
drh5eddca62001-06-30 21:53:53 +00006875
drhd677b3d2007-08-20 22:48:41 +00006876 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006877 pBt->db = p->db;
danielk19773b8a05f2007-03-19 17:44:26 +00006878 nRef = sqlite3PagerRefcount(pBt->pPager);
danielk1977aef0bf62005-12-30 16:28:01 +00006879 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
drhc890fec2008-08-01 20:10:08 +00006880 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00006881 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00006882 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
drhefc251d2001-07-01 22:12:01 +00006883 }
drh5eddca62001-06-30 21:53:53 +00006884 sCheck.pBt = pBt;
6885 sCheck.pPager = pBt->pPager;
danielk1977ad0132d2008-06-07 08:58:22 +00006886 sCheck.nPage = pagerPagecount(sCheck.pPager);
drh1dcdbc02007-01-27 02:24:54 +00006887 sCheck.mxErr = mxErr;
6888 sCheck.nErr = 0;
drhc890fec2008-08-01 20:10:08 +00006889 sCheck.mallocFailed = 0;
drh1dcdbc02007-01-27 02:24:54 +00006890 *pnErr = 0;
danielk1977e5321f02007-04-27 07:05:44 +00006891#ifndef SQLITE_OMIT_AUTOVACUUM
6892 if( pBt->nTrunc!=0 ){
6893 sCheck.nPage = pBt->nTrunc;
6894 }
6895#endif
drh0de8c112002-07-06 16:32:14 +00006896 if( sCheck.nPage==0 ){
6897 unlockBtreeIfUnused(pBt);
drhd677b3d2007-08-20 22:48:41 +00006898 sqlite3BtreeLeave(p);
drh0de8c112002-07-06 16:32:14 +00006899 return 0;
6900 }
drhe5ae5732008-06-15 02:51:47 +00006901 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
danielk1977ac245ec2005-01-14 13:50:11 +00006902 if( !sCheck.anRef ){
6903 unlockBtreeIfUnused(pBt);
drh1dcdbc02007-01-27 02:24:54 +00006904 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00006905 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00006906 return 0;
danielk1977ac245ec2005-01-14 13:50:11 +00006907 }
drhda200cc2004-05-09 11:51:38 +00006908 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh42cac6d2004-11-20 20:31:11 +00006909 i = PENDING_BYTE_PAGE(pBt);
drh1f595712004-06-15 01:40:29 +00006910 if( i<=sCheck.nPage ){
6911 sCheck.anRef[i] = 1;
6912 }
drhf089aa42008-07-08 19:34:06 +00006913 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
drh5eddca62001-06-30 21:53:53 +00006914
6915 /* Check the integrity of the freelist
6916 */
drha34b6762004-05-07 13:30:42 +00006917 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6918 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00006919
6920 /* Check all the tables.
6921 */
drh1dcdbc02007-01-27 02:24:54 +00006922 for(i=0; i<nRoot && sCheck.mxErr; i++){
drh4ff6dfa2002-03-03 23:06:00 +00006923 if( aRoot[i]==0 ) continue;
danielk1977687566d2004-11-02 12:56:41 +00006924#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006925 if( pBt->autoVacuum && aRoot[i]>1 ){
6926 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6927 }
6928#endif
drh74161702006-02-24 02:53:49 +00006929 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
drh5eddca62001-06-30 21:53:53 +00006930 }
6931
6932 /* Make sure every page in the file is referenced
6933 */
drh1dcdbc02007-01-27 02:24:54 +00006934 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
danielk1977afcdd022004-10-31 16:25:42 +00006935#ifdef SQLITE_OMIT_AUTOVACUUM
drh5eddca62001-06-30 21:53:53 +00006936 if( sCheck.anRef[i]==0 ){
drh2e38c322004-09-03 18:38:44 +00006937 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
drh5eddca62001-06-30 21:53:53 +00006938 }
danielk1977afcdd022004-10-31 16:25:42 +00006939#else
6940 /* If the database supports auto-vacuum, make sure no tables contain
6941 ** references to pointer-map pages.
6942 */
6943 if( sCheck.anRef[i]==0 &&
danielk1977266664d2006-02-10 08:24:21 +00006944 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006945 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6946 }
6947 if( sCheck.anRef[i]!=0 &&
danielk1977266664d2006-02-10 08:24:21 +00006948 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006949 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6950 }
6951#endif
drh5eddca62001-06-30 21:53:53 +00006952 }
6953
6954 /* Make sure this analysis did not leave any unref() pages
6955 */
drh5e00f6c2001-09-13 13:46:56 +00006956 unlockBtreeIfUnused(pBt);
danielk19773b8a05f2007-03-19 17:44:26 +00006957 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
drh2e38c322004-09-03 18:38:44 +00006958 checkAppendMsg(&sCheck, 0,
drh5eddca62001-06-30 21:53:53 +00006959 "Outstanding page count goes from %d to %d during this analysis",
danielk19773b8a05f2007-03-19 17:44:26 +00006960 nRef, sqlite3PagerRefcount(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00006961 );
drh5eddca62001-06-30 21:53:53 +00006962 }
6963
6964 /* Clean up and report errors.
6965 */
drhd677b3d2007-08-20 22:48:41 +00006966 sqlite3BtreeLeave(p);
drh17435752007-08-16 04:30:38 +00006967 sqlite3_free(sCheck.anRef);
drhc890fec2008-08-01 20:10:08 +00006968 if( sCheck.mallocFailed ){
6969 sqlite3StrAccumReset(&sCheck.errMsg);
6970 *pnErr = sCheck.nErr+1;
6971 return 0;
6972 }
drh1dcdbc02007-01-27 02:24:54 +00006973 *pnErr = sCheck.nErr;
drhf089aa42008-07-08 19:34:06 +00006974 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
6975 return sqlite3StrAccumFinish(&sCheck.errMsg);
drh5eddca62001-06-30 21:53:53 +00006976}
drhb7f91642004-10-31 02:22:47 +00006977#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
paulb95a8862003-04-01 21:16:41 +00006978
drh73509ee2003-04-06 20:44:45 +00006979/*
6980** Return the full pathname of the underlying database file.
drhd0679ed2007-08-28 22:24:34 +00006981**
6982** The pager filename is invariant as long as the pager is
6983** open so it is safe to access without the BtShared mutex.
drh73509ee2003-04-06 20:44:45 +00006984*/
danielk1977aef0bf62005-12-30 16:28:01 +00006985const char *sqlite3BtreeGetFilename(Btree *p){
6986 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00006987 return sqlite3PagerFilename(p->pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00006988}
6989
6990/*
danielk19775865e3d2004-06-14 06:03:57 +00006991** Return the pathname of the directory that contains the database file.
drhd0679ed2007-08-28 22:24:34 +00006992**
6993** The pager directory name is invariant as long as the pager is
6994** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00006995*/
danielk1977aef0bf62005-12-30 16:28:01 +00006996const char *sqlite3BtreeGetDirname(Btree *p){
6997 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00006998 return sqlite3PagerDirname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00006999}
7000
7001/*
7002** Return the pathname of the journal file for this database. The return
7003** value of this routine is the same regardless of whether the journal file
7004** has been created or not.
drhd0679ed2007-08-28 22:24:34 +00007005**
7006** The pager journal filename is invariant as long as the pager is
7007** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007008*/
danielk1977aef0bf62005-12-30 16:28:01 +00007009const char *sqlite3BtreeGetJournalname(Btree *p){
7010 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007011 return sqlite3PagerJournalname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007012}
7013
drhb7f91642004-10-31 02:22:47 +00007014#ifndef SQLITE_OMIT_VACUUM
danielk19775865e3d2004-06-14 06:03:57 +00007015/*
drhf7c57532003-04-25 13:22:51 +00007016** Copy the complete content of pBtFrom into pBtTo. A transaction
7017** must be active for both files.
7018**
danielk1977f653d782008-03-20 11:04:21 +00007019** The size of file pTo may be reduced by this operation.
7020** If anything goes wrong, the transaction on pTo is rolled back.
7021**
7022** If successful, CommitPhaseOne() may be called on pTo before returning.
7023** The caller should finish committing the transaction on pTo by calling
7024** sqlite3BtreeCommit().
drh73509ee2003-04-06 20:44:45 +00007025*/
drhd677b3d2007-08-20 22:48:41 +00007026static int btreeCopyFile(Btree *pTo, Btree *pFrom){
drhf7c57532003-04-25 13:22:51 +00007027 int rc = SQLITE_OK;
danielk1977f653d782008-03-20 11:04:21 +00007028 Pgno i;
7029
7030 Pgno nFromPage; /* Number of pages in pFrom */
7031 Pgno nToPage; /* Number of pages in pTo */
7032 Pgno nNewPage; /* Number of pages in pTo after the copy */
7033
7034 Pgno iSkip; /* Pending byte page in pTo */
7035 int nToPageSize; /* Page size of pTo in bytes */
7036 int nFromPageSize; /* Page size of pFrom in bytes */
drhf7c57532003-04-25 13:22:51 +00007037
danielk1977aef0bf62005-12-30 16:28:01 +00007038 BtShared *pBtTo = pTo->pBt;
7039 BtShared *pBtFrom = pFrom->pBt;
drhe5fe6902007-12-07 18:55:28 +00007040 pBtTo->db = pTo->db;
7041 pBtFrom->db = pFrom->db;
danielk1977f653d782008-03-20 11:04:21 +00007042
7043 nToPageSize = pBtTo->pageSize;
7044 nFromPageSize = pBtFrom->pageSize;
danielk1977aef0bf62005-12-30 16:28:01 +00007045
7046 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
danielk1977ee5741e2004-05-31 10:01:34 +00007047 return SQLITE_ERROR;
7048 }
danielk1977f653d782008-03-20 11:04:21 +00007049 if( pBtTo->pCursor ){
7050 return SQLITE_BUSY;
drhf7c57532003-04-25 13:22:51 +00007051 }
drh538f5702007-04-13 02:14:30 +00007052
danielk1977ad0132d2008-06-07 08:58:22 +00007053 nToPage = pagerPagecount(pBtTo->pPager);
7054 nFromPage = pagerPagecount(pBtFrom->pPager);
danielk1977f653d782008-03-20 11:04:21 +00007055 iSkip = PENDING_BYTE_PAGE(pBtTo);
7056
7057 /* Variable nNewPage is the number of pages required to store the
7058 ** contents of pFrom using the current page-size of pTo.
drh538f5702007-04-13 02:14:30 +00007059 */
danielk1977f653d782008-03-20 11:04:21 +00007060 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
7061 (i64)nToPageSize;
7062
7063 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7064
7065 /* Journal the original page.
7066 **
7067 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7068 ** in database *pTo (before the copy). This page is never written
7069 ** into the journal file. Unless i==iSkip or the page was not
7070 ** present in pTo before the copy operation, journal page i from pTo.
7071 */
7072 if( i!=iSkip && i<=nToPage ){
danielk19774abd5442008-05-05 15:26:50 +00007073 DbPage *pDbPage = 0;
danielk1977f653d782008-03-20 11:04:21 +00007074 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
danielk19774abd5442008-05-05 15:26:50 +00007075 if( rc==SQLITE_OK ){
7076 rc = sqlite3PagerWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007077 if( rc==SQLITE_OK && i>nFromPage ){
7078 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7079 ** that is because the names of those procedures do not exactly
7080 ** represent what they do. Write() really means "put this page in the
7081 ** rollback journal and mark it as dirty so that it will be written
7082 ** to the database file later." DontWrite() undoes the second part of
7083 ** that and prevents the page from being written to the database. The
7084 ** page is still on the rollback journal, though. And that is the
7085 ** whole point of this block: to put pages on the rollback journal.
7086 */
danielk1977a1fa00d2008-08-27 15:16:33 +00007087 rc = sqlite3PagerDontWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007088 }
7089 sqlite3PagerUnref(pDbPage);
danielk1977f653d782008-03-20 11:04:21 +00007090 }
danielk1977f653d782008-03-20 11:04:21 +00007091 }
7092
7093 /* Overwrite the data in page i of the target database */
7094 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7095
7096 DbPage *pToPage = 0;
7097 sqlite3_int64 iOff;
7098
7099 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7100 if( rc==SQLITE_OK ){
7101 rc = sqlite3PagerWrite(pToPage);
7102 }
7103
7104 for(
7105 iOff=(i-1)*nToPageSize;
7106 rc==SQLITE_OK && iOff<i*nToPageSize;
7107 iOff += nFromPageSize
7108 ){
7109 DbPage *pFromPage = 0;
7110 Pgno iFrom = (iOff/nFromPageSize)+1;
7111
7112 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7113 continue;
7114 }
7115
7116 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7117 if( rc==SQLITE_OK ){
7118 char *zTo = sqlite3PagerGetData(pToPage);
7119 char *zFrom = sqlite3PagerGetData(pFromPage);
7120 int nCopy;
7121
7122 if( nFromPageSize>=nToPageSize ){
7123 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7124 nCopy = nToPageSize;
7125 }else{
7126 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7127 nCopy = nFromPageSize;
7128 }
7129
7130 memcpy(zTo, zFrom, nCopy);
danielk19772f78fc62008-09-30 09:31:45 +00007131 sqlite3PagerUnref(pFromPage);
danielk1977f653d782008-03-20 11:04:21 +00007132 }
7133 }
7134
danielk1977eaa06f62008-09-18 17:34:44 +00007135 if( pToPage ){
7136 MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
7137 p->isInit = 0;
7138 sqlite3PagerUnref(pToPage);
7139 }
danielk1977f653d782008-03-20 11:04:21 +00007140 }
drh2e6d11b2003-04-25 15:37:57 +00007141 }
danielk1977f653d782008-03-20 11:04:21 +00007142
7143 /* If things have worked so far, the database file may need to be
7144 ** truncated. The complex part is that it may need to be truncated to
7145 ** a size that is not an integer multiple of nToPageSize - the current
7146 ** page size used by the pager associated with B-Tree pTo.
7147 **
7148 ** For example, say the page-size of pTo is 2048 bytes and the original
7149 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7150 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7151 */
7152 if( rc==SQLITE_OK ){
7153 if( nFromPageSize!=nToPageSize ){
7154 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7155 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7156 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7157 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7158
7159 assert( iSize<=iNow );
7160
7161 /* Commit phase one syncs the journal file associated with pTo
7162 ** containing the original data. It does not sync the database file
7163 ** itself. After doing this it is safe to use OsTruncate() and other
7164 ** file APIs on the database file directly.
7165 */
7166 pBtTo->db = pTo->db;
7167 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7168 if( iSize<iNow && rc==SQLITE_OK ){
7169 rc = sqlite3OsTruncate(pFile, iSize);
7170 }
7171
7172 /* The loop that copied data from database pFrom to pTo did not
7173 ** populate the locking page of database pTo. If the page-size of
7174 ** pFrom is smaller than that of pTo, this means some data will
7175 ** not have been copied.
7176 **
7177 ** This block copies the missing data from database pFrom to pTo
7178 ** using file APIs. This is safe because at this point we know that
7179 ** all of the original data from pTo has been synced into the
7180 ** journal file. At this point it would be safe to do anything at
7181 ** all to the database file except truncate it to zero bytes.
7182 */
7183 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7184 i64 iOff;
7185 for(
7186 iOff=iPending;
7187 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7188 iOff += nFromPageSize
7189 ){
7190 DbPage *pFromPage = 0;
7191 Pgno iFrom = (iOff/nFromPageSize)+1;
7192
7193 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7194 continue;
7195 }
7196
7197 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7198 if( rc==SQLITE_OK ){
7199 char *zFrom = sqlite3PagerGetData(pFromPage);
danielk197706249db2008-08-23 16:17:55 +00007200 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
danielk1977f653d782008-03-20 11:04:21 +00007201 sqlite3PagerUnref(pFromPage);
7202 }
7203 }
7204 }
7205
7206 /* Sync the database file */
7207 if( rc==SQLITE_OK ){
7208 rc = sqlite3PagerSync(pBtTo->pPager);
7209 }
7210 }else{
7211 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7212 }
7213 if( rc==SQLITE_OK ){
7214 pBtTo->pageSizeFixed = 0;
7215 }
drh2e6d11b2003-04-25 15:37:57 +00007216 }
drh538f5702007-04-13 02:14:30 +00007217
drhf7c57532003-04-25 13:22:51 +00007218 if( rc ){
danielk1977aef0bf62005-12-30 16:28:01 +00007219 sqlite3BtreeRollback(pTo);
drhf7c57532003-04-25 13:22:51 +00007220 }
danielk1977f653d782008-03-20 11:04:21 +00007221
drhf7c57532003-04-25 13:22:51 +00007222 return rc;
drh73509ee2003-04-06 20:44:45 +00007223}
drhd677b3d2007-08-20 22:48:41 +00007224int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7225 int rc;
7226 sqlite3BtreeEnter(pTo);
7227 sqlite3BtreeEnter(pFrom);
7228 rc = btreeCopyFile(pTo, pFrom);
7229 sqlite3BtreeLeave(pFrom);
7230 sqlite3BtreeLeave(pTo);
7231 return rc;
7232}
7233
drhb7f91642004-10-31 02:22:47 +00007234#endif /* SQLITE_OMIT_VACUUM */
danielk19771d850a72004-05-31 08:26:49 +00007235
7236/*
7237** Return non-zero if a transaction is active.
7238*/
danielk1977aef0bf62005-12-30 16:28:01 +00007239int sqlite3BtreeIsInTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007240 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00007241 return (p && (p->inTrans==TRANS_WRITE));
danielk19771d850a72004-05-31 08:26:49 +00007242}
7243
7244/*
7245** Return non-zero if a statement transaction is active.
7246*/
danielk1977aef0bf62005-12-30 16:28:01 +00007247int sqlite3BtreeIsInStmt(Btree *p){
drh1fee73e2007-08-29 04:00:57 +00007248 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00007249 return (p->pBt && p->pBt->inStmt);
danielk19771d850a72004-05-31 08:26:49 +00007250}
danielk197713adf8a2004-06-03 16:08:41 +00007251
7252/*
danielk19772372c2b2006-06-27 16:34:56 +00007253** Return non-zero if a read (or write) transaction is active.
7254*/
7255int sqlite3BtreeIsInReadTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007256 assert( sqlite3_mutex_held(p->db->mutex) );
danielk19772372c2b2006-06-27 16:34:56 +00007257 return (p && (p->inTrans!=TRANS_NONE));
7258}
7259
7260/*
danielk1977da184232006-01-05 11:34:32 +00007261** This function returns a pointer to a blob of memory associated with
drh85b623f2007-12-13 21:54:09 +00007262** a single shared-btree. The memory is used by client code for its own
danielk1977da184232006-01-05 11:34:32 +00007263** purposes (for example, to store a high-level schema associated with
7264** the shared-btree). The btree layer manages reference counting issues.
7265**
7266** The first time this is called on a shared-btree, nBytes bytes of memory
7267** are allocated, zeroed, and returned to the caller. For each subsequent
7268** call the nBytes parameter is ignored and a pointer to the same blob
7269** of memory returned.
7270**
danielk1977171bfed2008-06-23 09:50:50 +00007271** If the nBytes parameter is 0 and the blob of memory has not yet been
7272** allocated, a null pointer is returned. If the blob has already been
7273** allocated, it is returned as normal.
7274**
danielk1977da184232006-01-05 11:34:32 +00007275** Just before the shared-btree is closed, the function passed as the
7276** xFree argument when the memory allocation was made is invoked on the
drh17435752007-08-16 04:30:38 +00007277** blob of allocated memory. This function should not call sqlite3_free()
danielk1977da184232006-01-05 11:34:32 +00007278** on the memory, the btree layer does that.
7279*/
7280void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7281 BtShared *pBt = p->pBt;
drh27641702007-08-22 02:56:42 +00007282 sqlite3BtreeEnter(p);
danielk1977171bfed2008-06-23 09:50:50 +00007283 if( !pBt->pSchema && nBytes ){
drh17435752007-08-16 04:30:38 +00007284 pBt->pSchema = sqlite3MallocZero(nBytes);
danielk1977da184232006-01-05 11:34:32 +00007285 pBt->xFreeSchema = xFree;
7286 }
drh27641702007-08-22 02:56:42 +00007287 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00007288 return pBt->pSchema;
7289}
7290
danielk1977c87d34d2006-01-06 13:00:28 +00007291/*
7292** Return true if another user of the same shared btree as the argument
7293** handle holds an exclusive lock on the sqlite_master table.
7294*/
7295int sqlite3BtreeSchemaLocked(Btree *p){
drh27641702007-08-22 02:56:42 +00007296 int rc;
drhe5fe6902007-12-07 18:55:28 +00007297 assert( sqlite3_mutex_held(p->db->mutex) );
drh27641702007-08-22 02:56:42 +00007298 sqlite3BtreeEnter(p);
7299 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7300 sqlite3BtreeLeave(p);
7301 return rc;
danielk1977c87d34d2006-01-06 13:00:28 +00007302}
7303
drha154dcd2006-03-22 22:10:07 +00007304
7305#ifndef SQLITE_OMIT_SHARED_CACHE
7306/*
7307** Obtain a lock on the table whose root page is iTab. The
7308** lock is a write lock if isWritelock is true or a read lock
7309** if it is false.
7310*/
danielk1977c00da102006-01-07 13:21:04 +00007311int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
danielk19772e94d4d2006-01-09 05:36:27 +00007312 int rc = SQLITE_OK;
drh6a9ad3d2008-04-02 16:29:30 +00007313 if( p->sharable ){
7314 u8 lockType = READ_LOCK + isWriteLock;
7315 assert( READ_LOCK+1==WRITE_LOCK );
7316 assert( isWriteLock==0 || isWriteLock==1 );
7317 sqlite3BtreeEnter(p);
7318 rc = queryTableLock(p, iTab, lockType);
7319 if( rc==SQLITE_OK ){
7320 rc = lockTable(p, iTab, lockType);
7321 }
7322 sqlite3BtreeLeave(p);
danielk1977c00da102006-01-07 13:21:04 +00007323 }
7324 return rc;
7325}
drha154dcd2006-03-22 22:10:07 +00007326#endif
danielk1977b82e7ed2006-01-11 14:09:31 +00007327
danielk1977b4e9af92007-05-01 17:49:49 +00007328#ifndef SQLITE_OMIT_INCRBLOB
7329/*
7330** Argument pCsr must be a cursor opened for writing on an
7331** INTKEY table currently pointing at a valid table entry.
7332** This function modifies the data stored as part of that entry.
7333** Only the data content may only be modified, it is not possible
7334** to change the length of the data stored.
7335*/
danielk1977dcbb5d32007-05-04 18:36:44 +00007336int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
drh1fee73e2007-08-29 04:00:57 +00007337 assert( cursorHoldsMutex(pCsr) );
drhe5fe6902007-12-07 18:55:28 +00007338 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007339 assert(pCsr->isIncrblobHandle);
danielk19773588ceb2008-06-10 17:30:26 +00007340
drha3460582008-07-11 21:02:53 +00007341 restoreCursorPosition(pCsr);
danielk19773588ceb2008-06-10 17:30:26 +00007342 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7343 if( pCsr->eState!=CURSOR_VALID ){
7344 return SQLITE_ABORT;
danielk1977dcbb5d32007-05-04 18:36:44 +00007345 }
7346
danielk1977d04417962007-05-02 13:16:30 +00007347 /* Check some preconditions:
danielk1977dcbb5d32007-05-04 18:36:44 +00007348 ** (a) the cursor is open for writing,
7349 ** (b) there is no read-lock on the table being modified and
7350 ** (c) the cursor points at a valid row of an intKey table.
danielk1977d04417962007-05-02 13:16:30 +00007351 */
danielk1977d04417962007-05-02 13:16:30 +00007352 if( !pCsr->wrFlag ){
danielk1977dcbb5d32007-05-04 18:36:44 +00007353 return SQLITE_READONLY;
danielk1977d04417962007-05-02 13:16:30 +00007354 }
drhd0679ed2007-08-28 22:24:34 +00007355 assert( !pCsr->pBt->readOnly
7356 && pCsr->pBt->inTransaction==TRANS_WRITE );
danielk19773588ceb2008-06-10 17:30:26 +00007357 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
danielk1977d04417962007-05-02 13:16:30 +00007358 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7359 }
danielk197771d5d2c2008-09-29 11:49:47 +00007360 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
danielk1977d04417962007-05-02 13:16:30 +00007361 return SQLITE_ERROR;
danielk1977b4e9af92007-05-01 17:49:49 +00007362 }
7363
danielk19779f8d6402007-05-02 17:48:45 +00007364 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
danielk1977b4e9af92007-05-01 17:49:49 +00007365}
danielk19772dec9702007-05-02 16:48:37 +00007366
7367/*
7368** Set a flag on this cursor to cache the locations of pages from the
danielk1977da107192007-05-04 08:32:13 +00007369** overflow list for the current row. This is used by cursors opened
7370** for incremental blob IO only.
7371**
7372** This function sets a flag only. The actual page location cache
7373** (stored in BtCursor.aOverflow[]) is allocated and used by function
7374** accessPayload() (the worker function for sqlite3BtreeData() and
7375** sqlite3BtreePutData()).
danielk19772dec9702007-05-02 16:48:37 +00007376*/
7377void sqlite3BtreeCacheOverflow(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00007378 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00007379 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007380 assert(!pCur->isIncrblobHandle);
danielk19772dec9702007-05-02 16:48:37 +00007381 assert(!pCur->aOverflow);
danielk1977dcbb5d32007-05-04 18:36:44 +00007382 pCur->isIncrblobHandle = 1;
danielk19772dec9702007-05-02 16:48:37 +00007383}
danielk1977b4e9af92007-05-01 17:49:49 +00007384#endif