blob: 1dc3f168d18a1cd73b123729fadacb30a7354de5 [file] [log] [blame]
drha059ad02001-04-17 20:09:11 +00001/*
drh9e572e62004-04-23 23:43:10 +00002** 2004 April 6
drha059ad02001-04-17 20:09:11 +00003**
drhb19a2bc2001-09-16 00:13:26 +00004** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
drha059ad02001-04-17 20:09:11 +00006**
drhb19a2bc2001-09-16 00:13:26 +00007** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
drha059ad02001-04-17 20:09:11 +000010**
11*************************************************************************
danielk1977360e6342008-11-12 08:49:51 +000012** $Id: btree.c,v 1.533 2008/11/12 08:49:52 danielk1977 Exp $
drh8b2f49b2001-06-08 00:21:52 +000013**
** This file implements an external (disk-based) database using BTrees.
drha3152892007-05-05 11:48:52 +000015** See the header comment on "btreeInt.h" for additional information.
16** Including a description of file format and an overview of operation.
drha059ad02001-04-17 20:09:11 +000017*/
drha3152892007-05-05 11:48:52 +000018#include "btreeInt.h"
paulb95a8862003-04-01 21:16:41 +000019
drh8c42ca92001-06-22 19:15:00 +000020/*
drha3152892007-05-05 11:48:52 +000021** The header string that appears at the beginning of every
22** SQLite database.
drh556b2a22005-06-14 16:04:05 +000023*/
drh556b2a22005-06-14 16:04:05 +000024static const char zMagicHeader[] = SQLITE_FILE_HEADER;
drh08ed44e2001-04-29 23:32:55 +000025
/*
** Debug tracing.  The tracing form of TRACE() is currently compiled out
** by the "#if 0" below.  When it is compiled in, setting the global
** variable sqlite3BtreeTrace to 1 makes TRACE() hand its (parenthesized)
** argument list to printf() and then flush stdout.  In the default build
** TRACE() expands to nothing.
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
drh615ae552005-01-16 23:21:00 +000036
/*
** VVA_ONLY(X) brackets small pieces of setup code (a variable
** initialization, for example) that exist only to feed a later assert().
** Such code must vanish whenever assert() is disabled, so the macro
** expands to X in builds without NDEBUG and to nothing when NDEBUG is
** defined.  "VVA" stands for "Verification, Validation, and
** Accreditation": the wrapped code runs only during verification.
*/
#ifdef NDEBUG
# define VVA_ONLY(X)
#else
# define VVA_ONLY(X)  X
#endif
50
drh86f8c192007-08-22 00:39:19 +000051
52
drhe53831d2007-08-17 01:14:38 +000053#ifndef SQLITE_OMIT_SHARED_CACHE
54/*
danielk1977502b4e02008-09-02 14:07:24 +000055** A list of BtShared objects that are eligible for participation
56** in shared cache. This variable has file scope during normal builds,
57** but the test harness needs to access it so we make it global for
58** test builds.
drhe53831d2007-08-17 01:14:38 +000059*/
60#ifdef SQLITE_TEST
drh78f82d12008-09-02 00:52:52 +000061BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000062#else
drh78f82d12008-09-02 00:52:52 +000063static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000064#endif
drhe53831d2007-08-17 01:14:38 +000065#endif /* SQLITE_OMIT_SHARED_CACHE */
66
67#ifndef SQLITE_OMIT_SHARED_CACHE
68/*
69** Enable or disable the shared pager and schema features.
70**
71** This routine has no effect on existing database connections.
72** The shared cache setting effects only future calls to
73** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
74*/
75int sqlite3_enable_shared_cache(int enable){
danielk1977502b4e02008-09-02 14:07:24 +000076 sqlite3GlobalConfig.sharedCacheEnabled = enable;
drhe53831d2007-08-17 01:14:38 +000077 return SQLITE_OK;
78}
79#endif
80
drhd677b3d2007-08-20 22:48:41 +000081
drh615ae552005-01-16 23:21:00 +000082/*
drh66cbd152004-09-01 16:12:25 +000083** Forward declaration
84*/
danielk19773588ceb2008-06-10 17:30:26 +000085static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
drh66cbd152004-09-01 16:12:25 +000086
danielk1977aef0bf62005-12-30 16:28:01 +000087
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** queryTableLock(), lockTable() and unlockAllTables() maintain the
  ** BtShared.pLock linked list that holds shared-cache table level
  ** locks.  When the library is built with the shared-cache feature
  ** disabled, each BtShared structure has exactly one user, so this
  ** locking is unnecessary and the three routines are defined as
  ** no-op macros instead.
  */
  #define queryTableLock(a,b,c) SQLITE_OK
  #define lockTable(a,b,c) SQLITE_OK
  #define unlockAllTables(a)
#endif
danielk1977aef0bf62005-12-30 16:28:01 +0000101
drhe53831d2007-08-17 01:14:38 +0000102#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977da184232006-01-05 11:34:32 +0000103/*
danielk1977aef0bf62005-12-30 16:28:01 +0000104** Query to see if btree handle p may obtain a lock of type eLock
105** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
106** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
danielk1977c87d34d2006-01-06 13:00:28 +0000107** SQLITE_LOCKED if not.
danielk1977aef0bf62005-12-30 16:28:01 +0000108*/
109static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
110 BtShared *pBt = p->pBt;
111 BtLock *pIter;
112
drh1fee73e2007-08-29 04:00:57 +0000113 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000114 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
115 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000116
danielk1977da184232006-01-05 11:34:32 +0000117 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000118 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000119 return SQLITE_OK;
120 }
121
danielk1977641b0f42007-12-21 04:47:25 +0000122 /* If some other connection is holding an exclusive lock, the
123 ** requested lock may not be obtained.
124 */
125 if( pBt->pExclusive && pBt->pExclusive!=p ){
126 return SQLITE_LOCKED;
127 }
128
danielk1977da184232006-01-05 11:34:32 +0000129 /* This (along with lockTable()) is where the ReadUncommitted flag is
130 ** dealt with. If the caller is querying for a read-lock and the flag is
131 ** set, it is unconditionally granted - even if there are write-locks
132 ** on the table. If a write-lock is requested, the ReadUncommitted flag
133 ** is not considered.
134 **
135 ** In function lockTable(), if a read-lock is demanded and the
136 ** ReadUncommitted flag is set, no entry is added to the locks list
137 ** (BtShared.pLock).
138 **
139 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
140 ** not create or respect table locks. The locking procedure for a
141 ** write-cursor does not change.
142 */
143 if(
drhe5fe6902007-12-07 18:55:28 +0000144 0==(p->db->flags&SQLITE_ReadUncommitted) ||
danielk1977da184232006-01-05 11:34:32 +0000145 eLock==WRITE_LOCK ||
drh47ded162006-01-06 01:42:58 +0000146 iTab==MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000147 ){
148 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
149 if( pIter->pBtree!=p && pIter->iTable==iTab &&
150 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
danielk1977c87d34d2006-01-06 13:00:28 +0000151 return SQLITE_LOCKED;
danielk1977da184232006-01-05 11:34:32 +0000152 }
danielk1977aef0bf62005-12-30 16:28:01 +0000153 }
154 }
155 return SQLITE_OK;
156}
drhe53831d2007-08-17 01:14:38 +0000157#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000158
drhe53831d2007-08-17 01:14:38 +0000159#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000160/*
161** Add a lock on the table with root-page iTable to the shared-btree used
162** by Btree handle p. Parameter eLock must be either READ_LOCK or
163** WRITE_LOCK.
164**
165** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
166** SQLITE_NOMEM may also be returned.
167*/
168static int lockTable(Btree *p, Pgno iTable, u8 eLock){
169 BtShared *pBt = p->pBt;
170 BtLock *pLock = 0;
171 BtLock *pIter;
172
drh1fee73e2007-08-29 04:00:57 +0000173 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000174 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
175 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000176
danielk1977da184232006-01-05 11:34:32 +0000177 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000178 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000179 return SQLITE_OK;
180 }
181
danielk1977aef0bf62005-12-30 16:28:01 +0000182 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
183
danielk1977da184232006-01-05 11:34:32 +0000184 /* If the read-uncommitted flag is set and a read-lock is requested,
185 ** return early without adding an entry to the BtShared.pLock list. See
186 ** comment in function queryTableLock() for more info on handling
187 ** the ReadUncommitted flag.
188 */
189 if(
drhe5fe6902007-12-07 18:55:28 +0000190 (p->db->flags&SQLITE_ReadUncommitted) &&
danielk1977da184232006-01-05 11:34:32 +0000191 (eLock==READ_LOCK) &&
drh47ded162006-01-06 01:42:58 +0000192 iTable!=MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000193 ){
194 return SQLITE_OK;
195 }
196
danielk1977aef0bf62005-12-30 16:28:01 +0000197 /* First search the list for an existing lock on this table. */
198 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
199 if( pIter->iTable==iTable && pIter->pBtree==p ){
200 pLock = pIter;
201 break;
202 }
203 }
204
205 /* If the above search did not find a BtLock struct associating Btree p
206 ** with table iTable, allocate one and link it into the list.
207 */
208 if( !pLock ){
drh17435752007-08-16 04:30:38 +0000209 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
danielk1977aef0bf62005-12-30 16:28:01 +0000210 if( !pLock ){
211 return SQLITE_NOMEM;
212 }
213 pLock->iTable = iTable;
214 pLock->pBtree = p;
215 pLock->pNext = pBt->pLock;
216 pBt->pLock = pLock;
217 }
218
219 /* Set the BtLock.eLock variable to the maximum of the current lock
220 ** and the requested lock. This means if a write-lock was already held
221 ** and a read-lock requested, we don't incorrectly downgrade the lock.
222 */
223 assert( WRITE_LOCK>READ_LOCK );
danielk19775118b912005-12-30 16:31:53 +0000224 if( eLock>pLock->eLock ){
225 pLock->eLock = eLock;
226 }
danielk1977aef0bf62005-12-30 16:28:01 +0000227
228 return SQLITE_OK;
229}
drhe53831d2007-08-17 01:14:38 +0000230#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000231
drhe53831d2007-08-17 01:14:38 +0000232#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000233/*
234** Release all the table locks (locks obtained via calls to the lockTable()
235** procedure) held by Btree handle p.
236*/
237static void unlockAllTables(Btree *p){
danielk1977641b0f42007-12-21 04:47:25 +0000238 BtShared *pBt = p->pBt;
239 BtLock **ppIter = &pBt->pLock;
danielk1977da184232006-01-05 11:34:32 +0000240
drh1fee73e2007-08-29 04:00:57 +0000241 assert( sqlite3BtreeHoldsMutex(p) );
drhe53831d2007-08-17 01:14:38 +0000242 assert( p->sharable || 0==*ppIter );
danielk1977da184232006-01-05 11:34:32 +0000243
danielk1977aef0bf62005-12-30 16:28:01 +0000244 while( *ppIter ){
245 BtLock *pLock = *ppIter;
danielk1977641b0f42007-12-21 04:47:25 +0000246 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
danielk1977aef0bf62005-12-30 16:28:01 +0000247 if( pLock->pBtree==p ){
248 *ppIter = pLock->pNext;
drh17435752007-08-16 04:30:38 +0000249 sqlite3_free(pLock);
danielk1977aef0bf62005-12-30 16:28:01 +0000250 }else{
251 ppIter = &pLock->pNext;
252 }
253 }
danielk1977641b0f42007-12-21 04:47:25 +0000254
255 if( pBt->pExclusive==p ){
256 pBt->pExclusive = 0;
257 }
danielk1977aef0bf62005-12-30 16:28:01 +0000258}
259#endif /* SQLITE_OMIT_SHARED_CACHE */
260
drh980b1a72006-08-16 16:42:48 +0000261static void releasePage(MemPage *pPage); /* Forward reference */
262
drh1fee73e2007-08-29 04:00:57 +0000263/*
264** Verify that the cursor holds a mutex on the BtShared
265*/
266#ifndef NDEBUG
267static int cursorHoldsMutex(BtCursor *p){
drhff0587c2007-08-29 17:43:19 +0000268 return sqlite3_mutex_held(p->pBt->mutex);
drh1fee73e2007-08-29 04:00:57 +0000269}
270#endif
271
272
danielk197792d4d7a2007-05-04 12:05:56 +0000273#ifndef SQLITE_OMIT_INCRBLOB
274/*
275** Invalidate the overflow page-list cache for cursor pCur, if any.
276*/
277static void invalidateOverflowCache(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000278 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000279 sqlite3_free(pCur->aOverflow);
danielk197792d4d7a2007-05-04 12:05:56 +0000280 pCur->aOverflow = 0;
281}
282
283/*
284** Invalidate the overflow page-list cache for all cursors opened
285** on the shared btree structure pBt.
286*/
287static void invalidateAllOverflowCache(BtShared *pBt){
288 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000289 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +0000290 for(p=pBt->pCursor; p; p=p->pNext){
291 invalidateOverflowCache(p);
292 }
293}
294#else
295 #define invalidateOverflowCache(x)
296 #define invalidateAllOverflowCache(x)
297#endif
298
drh980b1a72006-08-16 16:42:48 +0000299/*
300** Save the current cursor position in the variables BtCursor.nKey
301** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
302*/
303static int saveCursorPosition(BtCursor *pCur){
304 int rc;
305
306 assert( CURSOR_VALID==pCur->eState );
307 assert( 0==pCur->pKey );
drh1fee73e2007-08-29 04:00:57 +0000308 assert( cursorHoldsMutex(pCur) );
drh980b1a72006-08-16 16:42:48 +0000309
310 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
311
312 /* If this is an intKey table, then the above call to BtreeKeySize()
313 ** stores the integer key in pCur->nKey. In this case this value is
314 ** all that is required. Otherwise, if pCur is not open on an intKey
315 ** table, then malloc space for and store the pCur->nKey bytes of key
316 ** data.
317 */
danielk197771d5d2c2008-09-29 11:49:47 +0000318 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
drhe5ae5732008-06-15 02:51:47 +0000319 void *pKey = sqlite3Malloc(pCur->nKey);
drh980b1a72006-08-16 16:42:48 +0000320 if( pKey ){
321 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
322 if( rc==SQLITE_OK ){
323 pCur->pKey = pKey;
324 }else{
drh17435752007-08-16 04:30:38 +0000325 sqlite3_free(pKey);
drh980b1a72006-08-16 16:42:48 +0000326 }
327 }else{
328 rc = SQLITE_NOMEM;
329 }
330 }
danielk197771d5d2c2008-09-29 11:49:47 +0000331 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
drh980b1a72006-08-16 16:42:48 +0000332
333 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +0000334 int i;
335 for(i=0; i<=pCur->iPage; i++){
336 releasePage(pCur->apPage[i]);
337 pCur->apPage[i] = 0;
338 }
339 pCur->iPage = -1;
drh980b1a72006-08-16 16:42:48 +0000340 pCur->eState = CURSOR_REQUIRESEEK;
341 }
342
danielk197792d4d7a2007-05-04 12:05:56 +0000343 invalidateOverflowCache(pCur);
drh980b1a72006-08-16 16:42:48 +0000344 return rc;
345}
346
347/*
348** Save the positions of all cursors except pExcept open on the table
349** with root-page iRoot. Usually, this is called just before cursor
350** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
351*/
352static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
353 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000354 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +0000355 assert( pExcept==0 || pExcept->pBt==pBt );
drh980b1a72006-08-16 16:42:48 +0000356 for(p=pBt->pCursor; p; p=p->pNext){
357 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
358 p->eState==CURSOR_VALID ){
359 int rc = saveCursorPosition(p);
360 if( SQLITE_OK!=rc ){
361 return rc;
362 }
363 }
364 }
365 return SQLITE_OK;
366}
367
368/*
drhbf700f32007-03-31 02:36:44 +0000369** Clear the current cursor position.
370*/
danielk1977be51a652008-10-08 17:58:48 +0000371void sqlite3BtreeClearCursor(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000372 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000373 sqlite3_free(pCur->pKey);
drhbf700f32007-03-31 02:36:44 +0000374 pCur->pKey = 0;
375 pCur->eState = CURSOR_INVALID;
376}
377
378/*
drh980b1a72006-08-16 16:42:48 +0000379** Restore the cursor to the position it was in (or as close to as possible)
380** when saveCursorPosition() was called. Note that this call deletes the
381** saved position info stored by saveCursorPosition(), so there can be
drha3460582008-07-11 21:02:53 +0000382** at most one effective restoreCursorPosition() call after each
drh980b1a72006-08-16 16:42:48 +0000383** saveCursorPosition().
drh980b1a72006-08-16 16:42:48 +0000384*/
drha3460582008-07-11 21:02:53 +0000385int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
drhbf700f32007-03-31 02:36:44 +0000386 int rc;
drh1fee73e2007-08-29 04:00:57 +0000387 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +0000388 assert( pCur->eState>=CURSOR_REQUIRESEEK );
389 if( pCur->eState==CURSOR_FAULT ){
390 return pCur->skip;
391 }
drh980b1a72006-08-16 16:42:48 +0000392 pCur->eState = CURSOR_INVALID;
drhe63d9992008-08-13 19:11:48 +0000393 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
drh980b1a72006-08-16 16:42:48 +0000394 if( rc==SQLITE_OK ){
drh17435752007-08-16 04:30:38 +0000395 sqlite3_free(pCur->pKey);
drh980b1a72006-08-16 16:42:48 +0000396 pCur->pKey = 0;
drhbf700f32007-03-31 02:36:44 +0000397 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
drh980b1a72006-08-16 16:42:48 +0000398 }
399 return rc;
400}
401
/*
** Restore the position of cursor p only if it actually needs it: when
** the cursor state is CURSOR_REQUIRESEEK or greater the real work is
** done by sqlite3BtreeRestoreCursorPosition(), otherwise the macro
** evaluates to SQLITE_OK.
**
** The argument is parenthesized so that any pointer-valued expression
** may be passed.  Note that p is evaluated twice, so it must not have
** side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         sqlite3BtreeRestoreCursorPosition(p) : \
         SQLITE_OK)
drh980b1a72006-08-16 16:42:48 +0000406
drha3460582008-07-11 21:02:53 +0000407/*
408** Determine whether or not a cursor has moved from the position it
drhdfe88ec2008-11-03 20:55:06 +0000409** was last placed at. Cursors can move when the row they are pointing
drha3460582008-07-11 21:02:53 +0000410** at is deleted out from under them.
411**
412** This routine returns an error code if something goes wrong. The
413** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
414*/
415int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
416 int rc;
417
418 rc = restoreCursorPosition(pCur);
419 if( rc ){
420 *pHasMoved = 1;
421 return rc;
422 }
423 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
424 *pHasMoved = 1;
425 }else{
426 *pHasMoved = 0;
427 }
428 return SQLITE_OK;
429}
430
danielk1977599fcba2004-11-08 07:13:13 +0000431#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977afcdd022004-10-31 16:25:42 +0000432/*
drha3152892007-05-05 11:48:52 +0000433** Given a page number of a regular database page, return the page
434** number for the pointer-map page that contains the entry for the
435** input page number.
danielk1977afcdd022004-10-31 16:25:42 +0000436*/
danielk1977266664d2006-02-10 08:24:21 +0000437static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
drhd677b3d2007-08-20 22:48:41 +0000438 int nPagesPerMapPage, iPtrMap, ret;
drh1fee73e2007-08-29 04:00:57 +0000439 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000440 nPagesPerMapPage = (pBt->usableSize/5)+1;
441 iPtrMap = (pgno-2)/nPagesPerMapPage;
442 ret = (iPtrMap*nPagesPerMapPage) + 2;
danielk1977266664d2006-02-10 08:24:21 +0000443 if( ret==PENDING_BYTE_PAGE(pBt) ){
444 ret++;
445 }
446 return ret;
447}
danielk1977a19df672004-11-03 11:37:07 +0000448
danielk1977afcdd022004-10-31 16:25:42 +0000449/*
danielk1977afcdd022004-10-31 16:25:42 +0000450** Write an entry into the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000451**
452** This routine updates the pointer map entry for page number 'key'
453** so that it maps to type 'eType' and parent page number 'pgno'.
454** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000455*/
danielk1977aef0bf62005-12-30 16:28:01 +0000456static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
danielk19773b8a05f2007-03-19 17:44:26 +0000457 DbPage *pDbPage; /* The pointer map page */
458 u8 *pPtrmap; /* The pointer map data */
459 Pgno iPtrmap; /* The pointer map page number */
460 int offset; /* Offset in pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000461 int rc;
462
drh1fee73e2007-08-29 04:00:57 +0000463 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977266664d2006-02-10 08:24:21 +0000464 /* The master-journal page number must never be used as a pointer map page */
465 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
466
danielk1977ac11ee62005-01-15 12:45:51 +0000467 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +0000468 if( key==0 ){
drh49285702005-09-17 15:20:26 +0000469 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +0000470 }
danielk1977266664d2006-02-10 08:24:21 +0000471 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000472 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977687566d2004-11-02 12:56:41 +0000473 if( rc!=SQLITE_OK ){
danielk1977afcdd022004-10-31 16:25:42 +0000474 return rc;
475 }
danielk19778c666b12008-07-18 09:34:57 +0000476 offset = PTRMAP_PTROFFSET(iPtrmap, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000477 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000478
drh615ae552005-01-16 23:21:00 +0000479 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
480 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
danielk19773b8a05f2007-03-19 17:44:26 +0000481 rc = sqlite3PagerWrite(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000482 if( rc==SQLITE_OK ){
483 pPtrmap[offset] = eType;
484 put4byte(&pPtrmap[offset+1], parent);
danielk1977afcdd022004-10-31 16:25:42 +0000485 }
danielk1977afcdd022004-10-31 16:25:42 +0000486 }
487
danielk19773b8a05f2007-03-19 17:44:26 +0000488 sqlite3PagerUnref(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000489 return rc;
danielk1977afcdd022004-10-31 16:25:42 +0000490}
491
492/*
493** Read an entry from the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000494**
495** This routine retrieves the pointer map entry for page 'key', writing
496** the type and parent page number to *pEType and *pPgno respectively.
497** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000498*/
danielk1977aef0bf62005-12-30 16:28:01 +0000499static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
danielk19773b8a05f2007-03-19 17:44:26 +0000500 DbPage *pDbPage; /* The pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000501 int iPtrmap; /* Pointer map page index */
502 u8 *pPtrmap; /* Pointer map page data */
503 int offset; /* Offset of entry in pointer map */
504 int rc;
505
drh1fee73e2007-08-29 04:00:57 +0000506 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000507
danielk1977266664d2006-02-10 08:24:21 +0000508 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000509 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000510 if( rc!=0 ){
511 return rc;
512 }
danielk19773b8a05f2007-03-19 17:44:26 +0000513 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000514
danielk19778c666b12008-07-18 09:34:57 +0000515 offset = PTRMAP_PTROFFSET(iPtrmap, key);
drh43617e92006-03-06 20:55:46 +0000516 assert( pEType!=0 );
517 *pEType = pPtrmap[offset];
danielk1977687566d2004-11-02 12:56:41 +0000518 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
danielk1977afcdd022004-10-31 16:25:42 +0000519
danielk19773b8a05f2007-03-19 17:44:26 +0000520 sqlite3PagerUnref(pDbPage);
drh49285702005-09-17 15:20:26 +0000521 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
danielk1977afcdd022004-10-31 16:25:42 +0000522 return SQLITE_OK;
523}
524
danielk197785d90ca2008-07-19 14:25:15 +0000525#else /* if defined SQLITE_OMIT_AUTOVACUUM */
526 #define ptrmapPut(w,x,y,z) SQLITE_OK
527 #define ptrmapGet(w,x,y,z) SQLITE_OK
528 #define ptrmapPutOvfl(y,z) SQLITE_OK
529#endif
danielk1977afcdd022004-10-31 16:25:42 +0000530
/*
** findCell(P,I) evaluates to a pointer to the content of cell number I
** on btree page P, where cell 0 is the first cell on the page, cell 1
** the second, and so on.  The 2-byte cell pointer is read from the
** page's cell pointer array (starting at P->cellOffset), masked with
** P->maskPage, and added to the start of the page data.
**
** This works only for pages that do not contain overflow cells; use
** findOverflowCell() for pages that do.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
drh43605152004-05-29 21:46:49 +0000540
541/*
drh93a960a2008-07-10 00:32:42 +0000542** This a more complex version of findCell() that works for
drh43605152004-05-29 21:46:49 +0000543** pages that do contain overflow cells. See insert
544*/
545static u8 *findOverflowCell(MemPage *pPage, int iCell){
546 int i;
drh1fee73e2007-08-29 04:00:57 +0000547 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +0000548 for(i=pPage->nOverflow-1; i>=0; i--){
drh6d08b4d2004-07-20 12:45:22 +0000549 int k;
550 struct _OvflCell *pOvfl;
551 pOvfl = &pPage->aOvfl[i];
552 k = pOvfl->idx;
553 if( k<=iCell ){
554 if( k==iCell ){
555 return pOvfl->pCell;
drh43605152004-05-29 21:46:49 +0000556 }
557 iCell--;
558 }
559 }
danielk19771cc5ed82007-05-16 17:28:43 +0000560 return findCell(pPage, iCell);
drh43605152004-05-29 21:46:49 +0000561}
562
563/*
564** Parse a cell content block and fill in the CellInfo structure. There
drh16a9b832007-05-05 18:39:25 +0000565** are two versions of this function. sqlite3BtreeParseCell() takes a
566** cell index as the second argument and sqlite3BtreeParseCellPtr()
567** takes a pointer to the body of the cell as its second argument.
danielk19771cc5ed82007-05-16 17:28:43 +0000568**
569** Within this file, the parseCell() macro can be called instead of
570** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
drh43605152004-05-29 21:46:49 +0000571*/
drh16a9b832007-05-05 18:39:25 +0000572void sqlite3BtreeParseCellPtr(
drh3aac2dd2004-04-26 14:10:20 +0000573 MemPage *pPage, /* Page containing the cell */
drh43605152004-05-29 21:46:49 +0000574 u8 *pCell, /* Pointer to the cell text. */
drh6f11bef2004-05-13 01:12:56 +0000575 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000576){
drh271efa52004-05-30 19:19:05 +0000577 int n; /* Number bytes in cell content header */
578 u32 nPayload; /* Number of bytes of cell payload */
drh43605152004-05-29 21:46:49 +0000579
drh1fee73e2007-08-29 04:00:57 +0000580 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000581
drh43605152004-05-29 21:46:49 +0000582 pInfo->pCell = pCell;
drhab01f612004-05-22 02:55:23 +0000583 assert( pPage->leaf==0 || pPage->leaf==1 );
drh271efa52004-05-30 19:19:05 +0000584 n = pPage->childPtrSize;
585 assert( n==4-4*pPage->leaf );
drh504b6982006-01-22 21:52:56 +0000586 if( pPage->intKey ){
drh79df1f42008-07-18 00:57:33 +0000587 if( pPage->hasData ){
588 n += getVarint32(&pCell[n], nPayload);
589 }else{
590 nPayload = 0;
591 }
592 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
593 pInfo->nData = nPayload;
drh504b6982006-01-22 21:52:56 +0000594 }else{
drh79df1f42008-07-18 00:57:33 +0000595 pInfo->nData = 0;
596 n += getVarint32(&pCell[n], nPayload);
597 pInfo->nKey = nPayload;
drh6f11bef2004-05-13 01:12:56 +0000598 }
drh72365832007-03-06 15:53:44 +0000599 pInfo->nPayload = nPayload;
drh504b6982006-01-22 21:52:56 +0000600 pInfo->nHeader = n;
drh79df1f42008-07-18 00:57:33 +0000601 if( likely(nPayload<=pPage->maxLocal) ){
drh271efa52004-05-30 19:19:05 +0000602 /* This is the (easy) common case where the entire payload fits
603 ** on the local page. No overflow is required.
604 */
605 int nSize; /* Total size of cell content in bytes */
drh79df1f42008-07-18 00:57:33 +0000606 nSize = nPayload + n;
drh6f11bef2004-05-13 01:12:56 +0000607 pInfo->nLocal = nPayload;
608 pInfo->iOverflow = 0;
drh79df1f42008-07-18 00:57:33 +0000609 if( (nSize & ~3)==0 ){
drh271efa52004-05-30 19:19:05 +0000610 nSize = 4; /* Minimum cell size is 4 */
drh43605152004-05-29 21:46:49 +0000611 }
drh271efa52004-05-30 19:19:05 +0000612 pInfo->nSize = nSize;
drh6f11bef2004-05-13 01:12:56 +0000613 }else{
drh271efa52004-05-30 19:19:05 +0000614 /* If the payload will not fit completely on the local page, we have
615 ** to decide how much to store locally and how much to spill onto
616 ** overflow pages. The strategy is to minimize the amount of unused
617 ** space on overflow pages while keeping the amount of local storage
618 ** in between minLocal and maxLocal.
619 **
620 ** Warning: changing the way overflow payload is distributed in any
621 ** way will result in an incompatible file format.
622 */
623 int minLocal; /* Minimum amount of payload held locally */
624 int maxLocal; /* Maximum amount of payload held locally */
625 int surplus; /* Overflow payload available for local storage */
626
627 minLocal = pPage->minLocal;
628 maxLocal = pPage->maxLocal;
629 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +0000630 if( surplus <= maxLocal ){
631 pInfo->nLocal = surplus;
632 }else{
633 pInfo->nLocal = minLocal;
634 }
635 pInfo->iOverflow = pInfo->nLocal + n;
636 pInfo->nSize = pInfo->iOverflow + 4;
637 }
drh3aac2dd2004-04-26 14:10:20 +0000638}
/*
** parseCell() locates cell iCell on pPage with findCell() and passes
** the resulting pointer to sqlite3BtreeParseCellPtr().  Because it uses
** findCell(), it is only suitable for pages without overflow cells.
*/
#define parseCell(pPage, iCell, pInfo) \
  sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
drh16a9b832007-05-05 18:39:25 +0000641void sqlite3BtreeParseCell(
drh43605152004-05-29 21:46:49 +0000642 MemPage *pPage, /* Page containing the cell */
643 int iCell, /* The cell index. First cell is 0 */
644 CellInfo *pInfo /* Fill in this structure */
645){
danielk19771cc5ed82007-05-16 17:28:43 +0000646 parseCell(pPage, iCell, pInfo);
drh43605152004-05-29 21:46:49 +0000647}
drh3aac2dd2004-04-26 14:10:20 +0000648
649/*
drh43605152004-05-29 21:46:49 +0000650** Compute the total number of bytes that a Cell needs in the cell
651** data area of the btree-page. The return number includes the cell
652** data header and the local payload, but not any overflow page or
653** the space used by the cell pointer.
drh3b7511c2001-05-26 13:15:44 +0000654*/
danielk1977bc6ada42004-06-30 08:20:16 +0000655#ifndef NDEBUG
drha9121e42008-02-19 14:59:35 +0000656static u16 cellSize(MemPage *pPage, int iCell){
drh6f11bef2004-05-13 01:12:56 +0000657 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000658 sqlite3BtreeParseCell(pPage, iCell, &info);
drh43605152004-05-29 21:46:49 +0000659 return info.nSize;
660}
danielk1977bc6ada42004-06-30 08:20:16 +0000661#endif
drha9121e42008-02-19 14:59:35 +0000662static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
drh43605152004-05-29 21:46:49 +0000663 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000664 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +0000665 return info.nSize;
drh3b7511c2001-05-26 13:15:44 +0000666}
667
danielk197779a40da2005-01-16 08:00:01 +0000668#ifndef SQLITE_OMIT_AUTOVACUUM
drh3b7511c2001-05-26 13:15:44 +0000669/*
danielk197726836652005-01-17 01:33:13 +0000670** If the cell pCell, part of page pPage contains a pointer
danielk197779a40da2005-01-16 08:00:01 +0000671** to an overflow page, insert an entry into the pointer-map
672** for the overflow page.
danielk1977ac11ee62005-01-15 12:45:51 +0000673*/
danielk197726836652005-01-17 01:33:13 +0000674static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
drhfa67c3c2008-07-11 02:21:40 +0000675 CellInfo info;
676 assert( pCell!=0 );
677 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
678 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
679 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
680 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
681 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977ac11ee62005-01-15 12:45:51 +0000682 }
danielk197779a40da2005-01-16 08:00:01 +0000683 return SQLITE_OK;
danielk1977ac11ee62005-01-15 12:45:51 +0000684}
danielk197726836652005-01-17 01:33:13 +0000685/*
686** If the cell with index iCell on page pPage contains a pointer
687** to an overflow page, insert an entry into the pointer-map
688** for the overflow page.
689*/
690static int ptrmapPutOvfl(MemPage *pPage, int iCell){
691 u8 *pCell;
drh1fee73e2007-08-29 04:00:57 +0000692 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197726836652005-01-17 01:33:13 +0000693 pCell = findOverflowCell(pPage, iCell);
694 return ptrmapPutOvflPtr(pPage, pCell);
695}
danielk197779a40da2005-01-16 08:00:01 +0000696#endif
697
danielk1977ac11ee62005-01-15 12:45:51 +0000698
drhda200cc2004-05-09 11:51:38 +0000699/*
drh72f82862001-05-24 21:06:34 +0000700** Defragment the page given. All Cells are moved to the
drh3a4a2d42005-11-24 14:24:28 +0000701** end of the page and all free space is collected into one
702** big FreeBlk that occurs in between the header and cell
drh31beae92005-11-24 14:34:36 +0000703** pointer array and the cell content area.
drh365d68f2001-05-11 11:02:46 +0000704*/
shane0af3f892008-11-12 04:55:34 +0000705static int defragmentPage(MemPage *pPage){
drh43605152004-05-29 21:46:49 +0000706 int i; /* Loop counter */
707 int pc; /* Address of a i-th cell */
708 int addr; /* Offset of first byte after cell pointer array */
709 int hdr; /* Offset to the page header */
710 int size; /* Size of a cell */
711 int usableSize; /* Number of usable bytes on a page */
712 int cellOffset; /* Offset to the cell pointer array */
drh281b21d2008-08-22 12:57:08 +0000713 int cbrk; /* Offset to the cell content area */
drh43605152004-05-29 21:46:49 +0000714 int nCell; /* Number of cells on the page */
drh2e38c322004-09-03 18:38:44 +0000715 unsigned char *data; /* The page data */
716 unsigned char *temp; /* Temp area for cell content */
drh2af926b2001-05-15 00:39:25 +0000717
danielk19773b8a05f2007-03-19 17:44:26 +0000718 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000719 assert( pPage->pBt!=0 );
drh90f5ecb2004-07-22 01:19:35 +0000720 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
drh43605152004-05-29 21:46:49 +0000721 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +0000722 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh26b79942007-11-28 16:19:56 +0000723 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
drh43605152004-05-29 21:46:49 +0000724 data = pPage->aData;
drh9e572e62004-04-23 23:43:10 +0000725 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000726 cellOffset = pPage->cellOffset;
727 nCell = pPage->nCell;
728 assert( nCell==get2byte(&data[hdr+3]) );
729 usableSize = pPage->pBt->usableSize;
drh281b21d2008-08-22 12:57:08 +0000730 cbrk = get2byte(&data[hdr+5]);
731 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
732 cbrk = usableSize;
drh43605152004-05-29 21:46:49 +0000733 for(i=0; i<nCell; i++){
734 u8 *pAddr; /* The i-th cell pointer */
735 pAddr = &data[cellOffset + i*2];
736 pc = get2byte(pAddr);
shane0af3f892008-11-12 04:55:34 +0000737 if (pc >= pPage->pBt->usableSize) {
738 return SQLITE_CORRUPT_BKPT;
739 }
drh43605152004-05-29 21:46:49 +0000740 size = cellSizePtr(pPage, &temp[pc]);
drh281b21d2008-08-22 12:57:08 +0000741 cbrk -= size;
shane0af3f892008-11-12 04:55:34 +0000742 if ((cbrk < cellOffset+2*nCell) || (cbrk+size>pPage->pBt->usableSize)) {
743 return SQLITE_CORRUPT_BKPT;
744 }
drh281b21d2008-08-22 12:57:08 +0000745 memcpy(&data[cbrk], &temp[pc], size);
746 put2byte(pAddr, cbrk);
drh2af926b2001-05-15 00:39:25 +0000747 }
drh281b21d2008-08-22 12:57:08 +0000748 assert( cbrk>=cellOffset+2*nCell );
749 put2byte(&data[hdr+5], cbrk);
drh43605152004-05-29 21:46:49 +0000750 data[hdr+1] = 0;
751 data[hdr+2] = 0;
752 data[hdr+7] = 0;
753 addr = cellOffset+2*nCell;
drh281b21d2008-08-22 12:57:08 +0000754 memset(&data[addr], 0, cbrk-addr);
danielk1977360e6342008-11-12 08:49:51 +0000755 if( cbrk-addr!=pPage->nFree ){
756 return SQLITE_CORRUPT_BKPT;
757 }
shane0af3f892008-11-12 04:55:34 +0000758 return SQLITE_OK;
drh365d68f2001-05-11 11:02:46 +0000759}
760
drha059ad02001-04-17 20:09:11 +0000761/*
drh43605152004-05-29 21:46:49 +0000762** Allocate nByte bytes of space on a page.
drhbd03cae2001-06-02 02:40:57 +0000763**
drh9e572e62004-04-23 23:43:10 +0000764** Return the index into pPage->aData[] of the first byte of
drhfa67c3c2008-07-11 02:21:40 +0000765** the new allocation. The caller guarantees that there is enough
766** space. This routine will never fail.
drh2af926b2001-05-15 00:39:25 +0000767**
drh72f82862001-05-24 21:06:34 +0000768** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000769** nBytes of contiguous free space, then this routine automatically
770** calls defragementPage() to consolidate all free space before
771** allocating the new chunk.
drh7e3b0a02001-04-28 16:52:40 +0000772*/
drh9e572e62004-04-23 23:43:10 +0000773static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000774 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000775 int size;
drh24cd67e2004-05-10 16:18:47 +0000776 int nFrag;
drh43605152004-05-29 21:46:49 +0000777 int top;
778 int nCell;
779 int cellOffset;
drh9e572e62004-04-23 23:43:10 +0000780 unsigned char *data;
drh43605152004-05-29 21:46:49 +0000781
drh9e572e62004-04-23 23:43:10 +0000782 data = pPage->aData;
danielk19773b8a05f2007-03-19 17:44:26 +0000783 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000784 assert( pPage->pBt );
drh1fee73e2007-08-29 04:00:57 +0000785 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa67c3c2008-07-11 02:21:40 +0000786 assert( nByte>=0 ); /* Minimum cell size is 4 */
787 assert( pPage->nFree>=nByte );
788 assert( pPage->nOverflow==0 );
drh43605152004-05-29 21:46:49 +0000789 pPage->nFree -= nByte;
drh9e572e62004-04-23 23:43:10 +0000790 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000791
792 nFrag = data[hdr+7];
793 if( nFrag<60 ){
794 /* Search the freelist looking for a slot big enough to satisfy the
795 ** space request. */
796 addr = hdr+1;
797 while( (pc = get2byte(&data[addr]))>0 ){
798 size = get2byte(&data[pc+2]);
799 if( size>=nByte ){
800 if( size<nByte+4 ){
801 memcpy(&data[addr], &data[pc], 2);
802 data[hdr+7] = nFrag + size - nByte;
803 return pc;
804 }else{
805 put2byte(&data[pc+2], size-nByte);
806 return pc + size - nByte;
807 }
808 }
809 addr = pc;
drh9e572e62004-04-23 23:43:10 +0000810 }
811 }
drh43605152004-05-29 21:46:49 +0000812
813 /* Allocate memory from the gap in between the cell pointer array
814 ** and the cell content area.
815 */
816 top = get2byte(&data[hdr+5]);
817 nCell = get2byte(&data[hdr+3]);
818 cellOffset = pPage->cellOffset;
819 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
danielk1977474b7cc2008-07-09 11:49:46 +0000820 defragmentPage(pPage);
drh43605152004-05-29 21:46:49 +0000821 top = get2byte(&data[hdr+5]);
drh2af926b2001-05-15 00:39:25 +0000822 }
drh43605152004-05-29 21:46:49 +0000823 top -= nByte;
824 assert( cellOffset + 2*nCell <= top );
825 put2byte(&data[hdr+5], top);
826 return top;
drh7e3b0a02001-04-28 16:52:40 +0000827}
828
829/*
drh9e572e62004-04-23 23:43:10 +0000830** Return a section of the pPage->aData to the freelist.
831** The first byte of the new free block is pPage->aDisk[start]
832** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000833**
834** Most of the effort here is involved in coalesing adjacent
835** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000836*/
drh9e572e62004-04-23 23:43:10 +0000837static void freeSpace(MemPage *pPage, int start, int size){
drh43605152004-05-29 21:46:49 +0000838 int addr, pbegin, hdr;
drh9e572e62004-04-23 23:43:10 +0000839 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000840
drh9e572e62004-04-23 23:43:10 +0000841 assert( pPage->pBt!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +0000842 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000843 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
danielk1977bc6ada42004-06-30 08:20:16 +0000844 assert( (start + size)<=pPage->pBt->usableSize );
drh1fee73e2007-08-29 04:00:57 +0000845 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh34004ce2008-07-11 16:15:17 +0000846 assert( size>=0 ); /* Minimum cell size is 4 */
drh9e572e62004-04-23 23:43:10 +0000847
drhfcce93f2006-02-22 03:08:32 +0000848#ifdef SQLITE_SECURE_DELETE
849 /* Overwrite deleted information with zeros when the SECURE_DELETE
850 ** option is enabled at compile-time */
851 memset(&data[start], 0, size);
852#endif
853
drh9e572e62004-04-23 23:43:10 +0000854 /* Add the space back into the linked list of freeblocks */
drh43605152004-05-29 21:46:49 +0000855 hdr = pPage->hdrOffset;
856 addr = hdr + 1;
drh3aac2dd2004-04-26 14:10:20 +0000857 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drhb6f41482004-05-14 01:58:11 +0000858 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000859 assert( pbegin>addr );
860 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000861 }
drhb6f41482004-05-14 01:58:11 +0000862 assert( pbegin<=pPage->pBt->usableSize-4 );
drh3aac2dd2004-04-26 14:10:20 +0000863 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000864 put2byte(&data[addr], start);
865 put2byte(&data[start], pbegin);
866 put2byte(&data[start+2], size);
drh2af926b2001-05-15 00:39:25 +0000867 pPage->nFree += size;
drh9e572e62004-04-23 23:43:10 +0000868
869 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000870 addr = pPage->hdrOffset + 1;
871 while( (pbegin = get2byte(&data[addr]))>0 ){
drh9e572e62004-04-23 23:43:10 +0000872 int pnext, psize;
drh3aac2dd2004-04-26 14:10:20 +0000873 assert( pbegin>addr );
drh43605152004-05-29 21:46:49 +0000874 assert( pbegin<=pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +0000875 pnext = get2byte(&data[pbegin]);
876 psize = get2byte(&data[pbegin+2]);
877 if( pbegin + psize + 3 >= pnext && pnext>0 ){
878 int frag = pnext - (pbegin+psize);
drh43605152004-05-29 21:46:49 +0000879 assert( frag<=data[pPage->hdrOffset+7] );
880 data[pPage->hdrOffset+7] -= frag;
drh9e572e62004-04-23 23:43:10 +0000881 put2byte(&data[pbegin], get2byte(&data[pnext]));
882 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
883 }else{
drh3aac2dd2004-04-26 14:10:20 +0000884 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000885 }
886 }
drh7e3b0a02001-04-28 16:52:40 +0000887
drh43605152004-05-29 21:46:49 +0000888 /* If the cell content area begins with a freeblock, remove it. */
889 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
890 int top;
891 pbegin = get2byte(&data[hdr+1]);
892 memcpy(&data[hdr+1], &data[pbegin], 2);
893 top = get2byte(&data[hdr+5]);
894 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
drh4b70f112004-05-02 21:12:19 +0000895 }
drh4b70f112004-05-02 21:12:19 +0000896}
897
898/*
drh271efa52004-05-30 19:19:05 +0000899** Decode the flags byte (the first byte of the header) for a page
900** and initialize fields of the MemPage structure accordingly.
drh44845222008-07-17 18:39:57 +0000901**
902** Only the following combinations are supported. Anything different
903** indicates a corrupt database files:
904**
905** PTF_ZERODATA
906** PTF_ZERODATA | PTF_LEAF
907** PTF_LEAFDATA | PTF_INTKEY
908** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
drh271efa52004-05-30 19:19:05 +0000909*/
drh44845222008-07-17 18:39:57 +0000910static int decodeFlags(MemPage *pPage, int flagByte){
danielk1977aef0bf62005-12-30 16:28:01 +0000911 BtShared *pBt; /* A copy of pPage->pBt */
drh271efa52004-05-30 19:19:05 +0000912
913 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
drh1fee73e2007-08-29 04:00:57 +0000914 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh44845222008-07-17 18:39:57 +0000915 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
916 flagByte &= ~PTF_LEAF;
917 pPage->childPtrSize = 4-4*pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000918 pBt = pPage->pBt;
drh44845222008-07-17 18:39:57 +0000919 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
920 pPage->intKey = 1;
921 pPage->hasData = pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000922 pPage->maxLocal = pBt->maxLeaf;
923 pPage->minLocal = pBt->minLeaf;
drh44845222008-07-17 18:39:57 +0000924 }else if( flagByte==PTF_ZERODATA ){
925 pPage->intKey = 0;
926 pPage->hasData = 0;
drh271efa52004-05-30 19:19:05 +0000927 pPage->maxLocal = pBt->maxLocal;
928 pPage->minLocal = pBt->minLocal;
drh44845222008-07-17 18:39:57 +0000929 }else{
930 return SQLITE_CORRUPT_BKPT;
drh271efa52004-05-30 19:19:05 +0000931 }
drh44845222008-07-17 18:39:57 +0000932 return SQLITE_OK;
drh271efa52004-05-30 19:19:05 +0000933}
934
935/*
drh7e3b0a02001-04-28 16:52:40 +0000936** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000937**
938** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000939** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000940** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
941** guarantee that the page is well-formed. It only shows that
942** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000943*/
danielk197771d5d2c2008-09-29 11:49:47 +0000944int sqlite3BtreeInitPage(MemPage *pPage){
drh2af926b2001-05-15 00:39:25 +0000945
danielk197771d5d2c2008-09-29 11:49:47 +0000946 assert( pPage->pBt!=0 );
947 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +0000948 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
drhbf4bca52007-09-06 22:19:14 +0000949 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
950 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +0000951
952 if( !pPage->isInit ){
953 int pc; /* Address of a freeblock within pPage->aData[] */
954 int hdr; /* Offset to beginning of page header */
955 u8 *data; /* Equal to pPage->aData */
956 BtShared *pBt; /* The main btree structure */
957 int usableSize; /* Amount of usable space on each page */
958 int cellOffset; /* Offset from start of page to first cell pointer */
959 int nFree; /* Number of unused bytes on the page */
960 int top; /* First byte of the cell content area */
961
962 pBt = pPage->pBt;
963
danielk1977eaa06f62008-09-18 17:34:44 +0000964 hdr = pPage->hdrOffset;
965 data = pPage->aData;
966 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
967 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
968 pPage->maskPage = pBt->pageSize - 1;
969 pPage->nOverflow = 0;
danielk1977eaa06f62008-09-18 17:34:44 +0000970 usableSize = pBt->usableSize;
971 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
972 top = get2byte(&data[hdr+5]);
973 pPage->nCell = get2byte(&data[hdr+3]);
974 if( pPage->nCell>MX_CELL(pBt) ){
975 /* To many cells for a single page. The page must be corrupt */
976 return SQLITE_CORRUPT_BKPT;
977 }
danielk1977eaa06f62008-09-18 17:34:44 +0000978
979 /* Compute the total free space on the page */
980 pc = get2byte(&data[hdr+1]);
981 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
982 while( pc>0 ){
983 int next, size;
984 if( pc>usableSize-4 ){
985 /* Free block is off the page */
986 return SQLITE_CORRUPT_BKPT;
987 }
988 next = get2byte(&data[pc]);
989 size = get2byte(&data[pc+2]);
990 if( next>0 && next<=pc+size+3 ){
991 /* Free blocks must be in accending order */
992 return SQLITE_CORRUPT_BKPT;
993 }
994 nFree += size;
995 pc = next;
996 }
997 pPage->nFree = nFree;
998 if( nFree>=usableSize ){
999 /* Free space cannot exceed total page size */
drh49285702005-09-17 15:20:26 +00001000 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001001 }
drh9e572e62004-04-23 23:43:10 +00001002
drh1688c862008-07-18 02:44:17 +00001003#if 0
1004 /* Check that all the offsets in the cell offset array are within range.
1005 **
1006 ** Omitting this consistency check and using the pPage->maskPage mask
1007 ** to prevent overrunning the page buffer in findCell() results in a
1008 ** 2.5% performance gain.
1009 */
1010 {
1011 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1012 u8 *pEnd; /* Pointer to end of cell offset array */
1013 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1014 mask = ~(((u8)(pBt->pageSize>>8))-1);
1015 pEnd = &data[cellOffset + pPage->nCell*2];
1016 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1017 if( pOff!=pEnd ){
1018 return SQLITE_CORRUPT_BKPT;
1019 }
danielk1977e16535f2008-06-11 18:15:29 +00001020 }
drh1688c862008-07-18 02:44:17 +00001021#endif
danielk1977e16535f2008-06-11 18:15:29 +00001022
danielk197771d5d2c2008-09-29 11:49:47 +00001023 pPage->isInit = 1;
1024 }
drh9e572e62004-04-23 23:43:10 +00001025 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001026}
1027
1028/*
drh8b2f49b2001-06-08 00:21:52 +00001029** Set up a raw page so that it looks like a database page holding
1030** no entries.
drhbd03cae2001-06-02 02:40:57 +00001031*/
drh9e572e62004-04-23 23:43:10 +00001032static void zeroPage(MemPage *pPage, int flags){
1033 unsigned char *data = pPage->aData;
danielk1977aef0bf62005-12-30 16:28:01 +00001034 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00001035 int hdr = pPage->hdrOffset;
drh9e572e62004-04-23 23:43:10 +00001036 int first;
1037
danielk19773b8a05f2007-03-19 17:44:26 +00001038 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
drhbf4bca52007-09-06 22:19:14 +00001039 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1040 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
danielk19773b8a05f2007-03-19 17:44:26 +00001041 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00001042 assert( sqlite3_mutex_held(pBt->mutex) );
drh1af4a6e2008-07-18 03:32:51 +00001043 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
drh9e572e62004-04-23 23:43:10 +00001044 data[hdr] = flags;
drh43605152004-05-29 21:46:49 +00001045 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1046 memset(&data[hdr+1], 0, 4);
1047 data[hdr+7] = 0;
1048 put2byte(&data[hdr+5], pBt->usableSize);
drhb6f41482004-05-14 01:58:11 +00001049 pPage->nFree = pBt->usableSize - first;
drh271efa52004-05-30 19:19:05 +00001050 decodeFlags(pPage, flags);
drh9e572e62004-04-23 23:43:10 +00001051 pPage->hdrOffset = hdr;
drh43605152004-05-29 21:46:49 +00001052 pPage->cellOffset = first;
1053 pPage->nOverflow = 0;
drh1688c862008-07-18 02:44:17 +00001054 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1055 pPage->maskPage = pBt->pageSize - 1;
drh43605152004-05-29 21:46:49 +00001056 pPage->nCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001057 pPage->isInit = 1;
drhbd03cae2001-06-02 02:40:57 +00001058}
1059
drh897a8202008-09-18 01:08:15 +00001060
1061/*
1062** Convert a DbPage obtained from the pager into a MemPage used by
1063** the btree layer.
1064*/
1065static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1066 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1067 pPage->aData = sqlite3PagerGetData(pDbPage);
1068 pPage->pDbPage = pDbPage;
1069 pPage->pBt = pBt;
1070 pPage->pgno = pgno;
1071 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1072 return pPage;
1073}
1074
drhbd03cae2001-06-02 02:40:57 +00001075/*
drh3aac2dd2004-04-26 14:10:20 +00001076** Get a page from the pager. Initialize the MemPage.pBt and
1077** MemPage.aData elements if needed.
drh538f5702007-04-13 02:14:30 +00001078**
1079** If the noContent flag is set, it means that we do not care about
1080** the content of the page at this time. So do not go to the disk
1081** to fetch the content. Just fill in the content with zeros for now.
1082** If in the future we call sqlite3PagerWrite() on this page, that
1083** means we have started to be concerned about content and the disk
1084** read should occur at that point.
drh3aac2dd2004-04-26 14:10:20 +00001085*/
drh16a9b832007-05-05 18:39:25 +00001086int sqlite3BtreeGetPage(
1087 BtShared *pBt, /* The btree */
1088 Pgno pgno, /* Number of the page to fetch */
1089 MemPage **ppPage, /* Return the page in this parameter */
1090 int noContent /* Do not load page content if true */
1091){
drh3aac2dd2004-04-26 14:10:20 +00001092 int rc;
danielk19773b8a05f2007-03-19 17:44:26 +00001093 DbPage *pDbPage;
1094
drh1fee73e2007-08-29 04:00:57 +00001095 assert( sqlite3_mutex_held(pBt->mutex) );
drh538f5702007-04-13 02:14:30 +00001096 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
drh3aac2dd2004-04-26 14:10:20 +00001097 if( rc ) return rc;
drh897a8202008-09-18 01:08:15 +00001098 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
drh3aac2dd2004-04-26 14:10:20 +00001099 return SQLITE_OK;
1100}
1101
1102/*
danielk197767fd7a92008-09-10 17:53:35 +00001103** Return the size of the database file in pages. Or return -1 if
1104** there is any kind of error.
1105*/
1106static int pagerPagecount(Pager *pPager){
1107 int rc;
1108 int nPage;
1109 rc = sqlite3PagerPagecount(pPager, &nPage);
1110 return (rc==SQLITE_OK?nPage:-1);
1111}
1112
1113/*
drhde647132004-05-07 17:57:49 +00001114** Get a page from the pager and initialize it. This routine
1115** is just a convenience wrapper around separate calls to
drh16a9b832007-05-05 18:39:25 +00001116** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
drhde647132004-05-07 17:57:49 +00001117*/
1118static int getAndInitPage(
danielk1977aef0bf62005-12-30 16:28:01 +00001119 BtShared *pBt, /* The database file */
drhde647132004-05-07 17:57:49 +00001120 Pgno pgno, /* Number of the page to get */
danielk197771d5d2c2008-09-29 11:49:47 +00001121 MemPage **ppPage /* Write the page pointer here */
drhde647132004-05-07 17:57:49 +00001122){
1123 int rc;
drh897a8202008-09-18 01:08:15 +00001124 DbPage *pDbPage;
1125 MemPage *pPage;
1126
drh1fee73e2007-08-29 04:00:57 +00001127 assert( sqlite3_mutex_held(pBt->mutex) );
drh897a8202008-09-18 01:08:15 +00001128 if( pgno==0 ){
drh49285702005-09-17 15:20:26 +00001129 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001130 }
danielk19779f580ad2008-09-10 14:45:57 +00001131
drh897a8202008-09-18 01:08:15 +00001132 /* It is often the case that the page we want is already in cache.
1133 ** If so, get it directly. This saves us from having to call
1134 ** pagerPagecount() to make sure pgno is within limits, which results
1135 ** in a measureable performance improvements.
1136 */
1137 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1138 if( pDbPage ){
1139 /* Page is already in cache */
1140 *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1141 rc = SQLITE_OK;
1142 }else{
1143 /* Page not in cache. Acquire it. */
1144 if( pgno>pagerPagecount(pBt->pPager) ){
1145 return SQLITE_CORRUPT_BKPT;
1146 }
1147 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1148 if( rc ) return rc;
1149 pPage = *ppPage;
1150 }
danielk197771d5d2c2008-09-29 11:49:47 +00001151 if( !pPage->isInit ){
1152 rc = sqlite3BtreeInitPage(pPage);
drh897a8202008-09-18 01:08:15 +00001153 }
1154 if( rc!=SQLITE_OK ){
1155 releasePage(pPage);
1156 *ppPage = 0;
1157 }
drhde647132004-05-07 17:57:49 +00001158 return rc;
1159}
1160
1161/*
drh3aac2dd2004-04-26 14:10:20 +00001162** Release a MemPage. This should be called once for each prior
drh16a9b832007-05-05 18:39:25 +00001163** call to sqlite3BtreeGetPage.
drh3aac2dd2004-04-26 14:10:20 +00001164*/
drh4b70f112004-05-02 21:12:19 +00001165static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +00001166 if( pPage ){
1167 assert( pPage->aData );
1168 assert( pPage->pBt );
drhbf4bca52007-09-06 22:19:14 +00001169 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1170 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
drh1fee73e2007-08-29 04:00:57 +00001171 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +00001172 sqlite3PagerUnref(pPage->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00001173 }
1174}
1175
1176/*
drha6abd042004-06-09 17:37:22 +00001177** During a rollback, when the pager reloads information into the cache
1178** so that the cache is restored to its original state at the start of
1179** the transaction, for each page restored this routine is called.
1180**
1181** This routine needs to reset the extra data section at the end of the
1182** page to agree with the restored data.
1183*/
danielk1977eaa06f62008-09-18 17:34:44 +00001184static void pageReinit(DbPage *pData){
drh07d183d2005-05-01 22:52:42 +00001185 MemPage *pPage;
danielk19773b8a05f2007-03-19 17:44:26 +00001186 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
danielk197771d5d2c2008-09-29 11:49:47 +00001187 if( pPage->isInit ){
drh1fee73e2007-08-29 04:00:57 +00001188 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drha6abd042004-06-09 17:37:22 +00001189 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001190 if( sqlite3PagerPageRefcount(pData)>0 ){
1191 sqlite3BtreeInitPage(pPage);
1192 }
drha6abd042004-06-09 17:37:22 +00001193 }
1194}
1195
1196/*
drhe5fe6902007-12-07 18:55:28 +00001197** Invoke the busy handler for a btree.
1198*/
1199static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1200 BtShared *pBt = (BtShared*)pArg;
1201 assert( pBt->db );
1202 assert( sqlite3_mutex_held(pBt->db->mutex) );
1203 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1204}
1205
1206/*
drhad3e0102004-09-03 23:32:18 +00001207** Open a database file.
1208**
drh382c0242001-10-06 16:33:02 +00001209** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +00001210** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +00001211** database file will be deleted when sqlite3BtreeClose() is called.
drhe53831d2007-08-17 01:14:38 +00001212** If zFilename is ":memory:" then an in-memory database is created
1213** that is automatically destroyed when it is closed.
drha059ad02001-04-17 20:09:11 +00001214*/
drh23e11ca2004-05-04 17:27:28 +00001215int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +00001216 const char *zFilename, /* Name of the file containing the BTree database */
drhe5fe6902007-12-07 18:55:28 +00001217 sqlite3 *db, /* Associated database handle */
drh3aac2dd2004-04-26 14:10:20 +00001218 Btree **ppBtree, /* Pointer to new Btree object written here */
drh33f4e022007-09-03 15:19:34 +00001219 int flags, /* Options */
1220 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
drh6019e162001-07-02 17:51:45 +00001221){
drhd677b3d2007-08-20 22:48:41 +00001222 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
drhe53831d2007-08-17 01:14:38 +00001223 BtShared *pBt = 0; /* Shared part of btree structure */
danielk1977aef0bf62005-12-30 16:28:01 +00001224 Btree *p; /* Handle to return */
danielk1977dddbcdc2007-04-26 14:42:34 +00001225 int rc = SQLITE_OK;
drh90f5ecb2004-07-22 01:19:35 +00001226 int nReserve;
1227 unsigned char zDbHeader[100];
danielk1977aef0bf62005-12-30 16:28:01 +00001228
1229 /* Set the variable isMemdb to true for an in-memory database, or
1230 ** false for a file-based database. This symbol is only required if
1231 ** either of the shared-data or autovacuum features are compiled
1232 ** into the library.
1233 */
1234#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1235 #ifdef SQLITE_OMIT_MEMORYDB
drh980b1a72006-08-16 16:42:48 +00001236 const int isMemdb = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00001237 #else
drh980b1a72006-08-16 16:42:48 +00001238 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
danielk1977aef0bf62005-12-30 16:28:01 +00001239 #endif
1240#endif
1241
drhe5fe6902007-12-07 18:55:28 +00001242 assert( db!=0 );
1243 assert( sqlite3_mutex_held(db->mutex) );
drh153c62c2007-08-24 03:51:33 +00001244
drhe5fe6902007-12-07 18:55:28 +00001245 pVfs = db->pVfs;
drh17435752007-08-16 04:30:38 +00001246 p = sqlite3MallocZero(sizeof(Btree));
danielk1977aef0bf62005-12-30 16:28:01 +00001247 if( !p ){
1248 return SQLITE_NOMEM;
1249 }
1250 p->inTrans = TRANS_NONE;
drhe5fe6902007-12-07 18:55:28 +00001251 p->db = db;
danielk1977aef0bf62005-12-30 16:28:01 +00001252
drh198bf392006-01-06 21:52:49 +00001253#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001254 /*
1255 ** If this Btree is a candidate for shared cache, try to find an
1256 ** existing BtShared object that we can share with
1257 */
drh34004ce2008-07-11 16:15:17 +00001258 if( isMemdb==0
drhe5fe6902007-12-07 18:55:28 +00001259 && (db->flags & SQLITE_Vtab)==0
drhe53831d2007-08-17 01:14:38 +00001260 && zFilename && zFilename[0]
drhe53831d2007-08-17 01:14:38 +00001261 ){
danielk1977502b4e02008-09-02 14:07:24 +00001262 if( sqlite3GlobalConfig.sharedCacheEnabled ){
danielk1977adfb9b02007-09-17 07:02:56 +00001263 int nFullPathname = pVfs->mxPathname+1;
drhe5ae5732008-06-15 02:51:47 +00001264 char *zFullPathname = sqlite3Malloc(nFullPathname);
drhff0587c2007-08-29 17:43:19 +00001265 sqlite3_mutex *mutexShared;
1266 p->sharable = 1;
drh34004ce2008-07-11 16:15:17 +00001267 db->flags |= SQLITE_SharedCache;
drhff0587c2007-08-29 17:43:19 +00001268 if( !zFullPathname ){
1269 sqlite3_free(p);
1270 return SQLITE_NOMEM;
1271 }
danielk1977adfb9b02007-09-17 07:02:56 +00001272 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
danielk197759f8c082008-06-18 17:09:10 +00001273 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhff0587c2007-08-29 17:43:19 +00001274 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001275 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
drhff0587c2007-08-29 17:43:19 +00001276 assert( pBt->nRef>0 );
1277 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1278 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1279 p->pBt = pBt;
1280 pBt->nRef++;
1281 break;
1282 }
1283 }
1284 sqlite3_mutex_leave(mutexShared);
1285 sqlite3_free(zFullPathname);
danielk1977aef0bf62005-12-30 16:28:01 +00001286 }
drhff0587c2007-08-29 17:43:19 +00001287#ifdef SQLITE_DEBUG
1288 else{
1289 /* In debug mode, we mark all persistent databases as sharable
1290 ** even when they are not. This exercises the locking code and
1291 ** gives more opportunity for asserts(sqlite3_mutex_held())
1292 ** statements to find locking problems.
1293 */
1294 p->sharable = 1;
1295 }
1296#endif
danielk1977aef0bf62005-12-30 16:28:01 +00001297 }
1298#endif
drha059ad02001-04-17 20:09:11 +00001299 if( pBt==0 ){
drhe53831d2007-08-17 01:14:38 +00001300 /*
1301 ** The following asserts make sure that structures used by the btree are
1302 ** the right size. This is to guard against size changes that result
1303 ** when compiling on a different architecture.
danielk197703aded42004-11-22 05:26:27 +00001304 */
drhe53831d2007-08-17 01:14:38 +00001305 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1306 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1307 assert( sizeof(u32)==4 );
1308 assert( sizeof(u16)==2 );
1309 assert( sizeof(Pgno)==4 );
1310
1311 pBt = sqlite3MallocZero( sizeof(*pBt) );
1312 if( pBt==0 ){
1313 rc = SQLITE_NOMEM;
1314 goto btree_open_out;
1315 }
drhe5fe6902007-12-07 18:55:28 +00001316 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1317 pBt->busyHdr.pArg = pBt;
danielk197771d5d2c2008-09-29 11:49:47 +00001318 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
drh33f4e022007-09-03 15:19:34 +00001319 EXTRA_SIZE, flags, vfsFlags);
drhe53831d2007-08-17 01:14:38 +00001320 if( rc==SQLITE_OK ){
1321 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1322 }
1323 if( rc!=SQLITE_OK ){
1324 goto btree_open_out;
1325 }
drhe5fe6902007-12-07 18:55:28 +00001326 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
drhe53831d2007-08-17 01:14:38 +00001327 p->pBt = pBt;
1328
drhe53831d2007-08-17 01:14:38 +00001329 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1330 pBt->pCursor = 0;
1331 pBt->pPage1 = 0;
1332 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1333 pBt->pageSize = get2byte(&zDbHeader[16]);
1334 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1335 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
danielk1977a1644fd2007-08-29 12:31:25 +00001336 pBt->pageSize = 0;
1337 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001338#ifndef SQLITE_OMIT_AUTOVACUUM
1339 /* If the magic name ":memory:" will create an in-memory database, then
1340 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1341 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1342 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1343 ** regular file-name. In this case the auto-vacuum applies as per normal.
1344 */
1345 if( zFilename && !isMemdb ){
1346 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1347 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1348 }
1349#endif
1350 nReserve = 0;
1351 }else{
1352 nReserve = zDbHeader[20];
drhe53831d2007-08-17 01:14:38 +00001353 pBt->pageSizeFixed = 1;
1354#ifndef SQLITE_OMIT_AUTOVACUUM
1355 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1356 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1357#endif
1358 }
1359 pBt->usableSize = pBt->pageSize - nReserve;
1360 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
danielk1977a1644fd2007-08-29 12:31:25 +00001361 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001362
1363#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1364 /* Add the new BtShared object to the linked list sharable BtShareds.
1365 */
1366 if( p->sharable ){
1367 sqlite3_mutex *mutexShared;
1368 pBt->nRef = 1;
danielk197759f8c082008-06-18 17:09:10 +00001369 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
danielk1977075c23a2008-09-01 18:34:20 +00001370 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
danielk197759f8c082008-06-18 17:09:10 +00001371 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
drh3285db22007-09-03 22:00:39 +00001372 if( pBt->mutex==0 ){
1373 rc = SQLITE_NOMEM;
drhe5fe6902007-12-07 18:55:28 +00001374 db->mallocFailed = 0;
drh3285db22007-09-03 22:00:39 +00001375 goto btree_open_out;
1376 }
drhff0587c2007-08-29 17:43:19 +00001377 }
drhe53831d2007-08-17 01:14:38 +00001378 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001379 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1380 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
drhe53831d2007-08-17 01:14:38 +00001381 sqlite3_mutex_leave(mutexShared);
danielk1977951af802004-11-05 15:45:09 +00001382 }
drheee46cf2004-11-06 00:02:48 +00001383#endif
drh90f5ecb2004-07-22 01:19:35 +00001384 }
danielk1977aef0bf62005-12-30 16:28:01 +00001385
drhcfed7bc2006-03-13 14:28:05 +00001386#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001387 /* If the new Btree uses a sharable pBtShared, then link the new
1388 ** Btree into the list of all sharable Btrees for the same connection.
drhabddb0c2007-08-20 13:14:28 +00001389 ** The list is kept in ascending order by pBt address.
danielk197754f01982006-01-18 15:25:17 +00001390 */
drhe53831d2007-08-17 01:14:38 +00001391 if( p->sharable ){
1392 int i;
1393 Btree *pSib;
drhe5fe6902007-12-07 18:55:28 +00001394 for(i=0; i<db->nDb; i++){
1395 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
drhe53831d2007-08-17 01:14:38 +00001396 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1397 if( p->pBt<pSib->pBt ){
1398 p->pNext = pSib;
1399 p->pPrev = 0;
1400 pSib->pPrev = p;
1401 }else{
drhabddb0c2007-08-20 13:14:28 +00001402 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
drhe53831d2007-08-17 01:14:38 +00001403 pSib = pSib->pNext;
1404 }
1405 p->pNext = pSib->pNext;
1406 p->pPrev = pSib;
1407 if( p->pNext ){
1408 p->pNext->pPrev = p;
1409 }
1410 pSib->pNext = p;
1411 }
1412 break;
1413 }
1414 }
danielk1977aef0bf62005-12-30 16:28:01 +00001415 }
danielk1977aef0bf62005-12-30 16:28:01 +00001416#endif
1417 *ppBtree = p;
danielk1977dddbcdc2007-04-26 14:42:34 +00001418
1419btree_open_out:
1420 if( rc!=SQLITE_OK ){
1421 if( pBt && pBt->pPager ){
1422 sqlite3PagerClose(pBt->pPager);
1423 }
drh17435752007-08-16 04:30:38 +00001424 sqlite3_free(pBt);
1425 sqlite3_free(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001426 *ppBtree = 0;
1427 }
1428 return rc;
drha059ad02001-04-17 20:09:11 +00001429}
1430
1431/*
drhe53831d2007-08-17 01:14:38 +00001432** Decrement the BtShared.nRef counter. When it reaches zero,
1433** remove the BtShared structure from the sharing list. Return
1434** true if the BtShared.nRef counter reaches zero and return
1435** false if it is still positive.
1436*/
1437static int removeFromSharingList(BtShared *pBt){
1438#ifndef SQLITE_OMIT_SHARED_CACHE
1439 sqlite3_mutex *pMaster;
1440 BtShared *pList;
1441 int removed = 0;
1442
drhd677b3d2007-08-20 22:48:41 +00001443 assert( sqlite3_mutex_notheld(pBt->mutex) );
danielk197759f8c082008-06-18 17:09:10 +00001444 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhe53831d2007-08-17 01:14:38 +00001445 sqlite3_mutex_enter(pMaster);
1446 pBt->nRef--;
1447 if( pBt->nRef<=0 ){
drh78f82d12008-09-02 00:52:52 +00001448 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1449 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
drhe53831d2007-08-17 01:14:38 +00001450 }else{
drh78f82d12008-09-02 00:52:52 +00001451 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
drh34004ce2008-07-11 16:15:17 +00001452 while( ALWAYS(pList) && pList->pNext!=pBt ){
drhe53831d2007-08-17 01:14:38 +00001453 pList=pList->pNext;
1454 }
drh34004ce2008-07-11 16:15:17 +00001455 if( ALWAYS(pList) ){
drhe53831d2007-08-17 01:14:38 +00001456 pList->pNext = pBt->pNext;
1457 }
1458 }
drh3285db22007-09-03 22:00:39 +00001459 if( SQLITE_THREADSAFE ){
1460 sqlite3_mutex_free(pBt->mutex);
1461 }
drhe53831d2007-08-17 01:14:38 +00001462 removed = 1;
1463 }
1464 sqlite3_mutex_leave(pMaster);
1465 return removed;
1466#else
1467 return 1;
1468#endif
1469}
1470
1471/*
drhf7141992008-06-19 00:16:08 +00001472** Make sure pBt->pTmpSpace points to an allocation of
1473** MX_CELL_SIZE(pBt) bytes.
1474*/
1475static void allocateTempSpace(BtShared *pBt){
1476 if( !pBt->pTmpSpace ){
1477 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1478 }
1479}
1480
1481/*
1482** Free the pBt->pTmpSpace allocation
1483*/
1484static void freeTempSpace(BtShared *pBt){
1485 sqlite3PageFree( pBt->pTmpSpace);
1486 pBt->pTmpSpace = 0;
1487}
1488
1489/*
drha059ad02001-04-17 20:09:11 +00001490** Close an open database and invalidate all cursors.
1491*/
danielk1977aef0bf62005-12-30 16:28:01 +00001492int sqlite3BtreeClose(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00001493 BtShared *pBt = p->pBt;
1494 BtCursor *pCur;
1495
danielk1977aef0bf62005-12-30 16:28:01 +00001496 /* Close all cursors opened via this handle. */
drhe5fe6902007-12-07 18:55:28 +00001497 assert( sqlite3_mutex_held(p->db->mutex) );
drhe53831d2007-08-17 01:14:38 +00001498 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001499 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001500 pCur = pBt->pCursor;
1501 while( pCur ){
1502 BtCursor *pTmp = pCur;
1503 pCur = pCur->pNext;
1504 if( pTmp->pBtree==p ){
1505 sqlite3BtreeCloseCursor(pTmp);
1506 }
drha059ad02001-04-17 20:09:11 +00001507 }
danielk1977aef0bf62005-12-30 16:28:01 +00001508
danielk19778d34dfd2006-01-24 16:37:57 +00001509 /* Rollback any active transaction and free the handle structure.
1510 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1511 ** this handle.
1512 */
danielk1977b597f742006-01-15 11:39:18 +00001513 sqlite3BtreeRollback(p);
drhe53831d2007-08-17 01:14:38 +00001514 sqlite3BtreeLeave(p);
danielk1977aef0bf62005-12-30 16:28:01 +00001515
danielk1977aef0bf62005-12-30 16:28:01 +00001516 /* If there are still other outstanding references to the shared-btree
1517 ** structure, return now. The remainder of this procedure cleans
1518 ** up the shared-btree.
1519 */
drhe53831d2007-08-17 01:14:38 +00001520 assert( p->wantToLock==0 && p->locked==0 );
1521 if( !p->sharable || removeFromSharingList(pBt) ){
1522 /* The pBt is no longer on the sharing list, so we can access
1523 ** it without having to hold the mutex.
1524 **
1525 ** Clean out and delete the BtShared object.
1526 */
1527 assert( !pBt->pCursor );
drhe53831d2007-08-17 01:14:38 +00001528 sqlite3PagerClose(pBt->pPager);
1529 if( pBt->xFreeSchema && pBt->pSchema ){
1530 pBt->xFreeSchema(pBt->pSchema);
1531 }
1532 sqlite3_free(pBt->pSchema);
drhf7141992008-06-19 00:16:08 +00001533 freeTempSpace(pBt);
drh65bbf292008-06-19 01:03:17 +00001534 sqlite3_free(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00001535 }
1536
drhe53831d2007-08-17 01:14:38 +00001537#ifndef SQLITE_OMIT_SHARED_CACHE
drhcab5ed72007-08-22 11:41:18 +00001538 assert( p->wantToLock==0 );
1539 assert( p->locked==0 );
1540 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1541 if( p->pNext ) p->pNext->pPrev = p->pPrev;
danielk1977aef0bf62005-12-30 16:28:01 +00001542#endif
1543
drhe53831d2007-08-17 01:14:38 +00001544 sqlite3_free(p);
drha059ad02001-04-17 20:09:11 +00001545 return SQLITE_OK;
1546}
1547
1548/*
drhda47d772002-12-02 04:25:19 +00001549** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +00001550**
1551** The maximum number of cache pages is set to the absolute
1552** value of mxPage. If mxPage is negative, the pager will
1553** operate asynchronously - it will not stop to do fsync()s
1554** to insure data is written to the disk surface before
1555** continuing. Transactions still work if synchronous is off,
1556** and the database cannot be corrupted if this program
1557** crashes. But if the operating system crashes or there is
1558** an abrupt power failure when synchronous is off, the database
1559** could be left in an inconsistent and unrecoverable state.
1560** Synchronous is on by default so database corruption is not
1561** normally a worry.
drhf57b14a2001-09-14 18:54:08 +00001562*/
danielk1977aef0bf62005-12-30 16:28:01 +00001563int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1564 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001565 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001566 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001567 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
drhd677b3d2007-08-20 22:48:41 +00001568 sqlite3BtreeLeave(p);
drhf57b14a2001-09-14 18:54:08 +00001569 return SQLITE_OK;
1570}
1571
1572/*
drh973b6e32003-02-12 14:09:42 +00001573** Change the way data is synced to disk in order to increase or decrease
1574** how well the database resists damage due to OS crashes and power
1575** failures. Level 1 is the same as asynchronous (no syncs() occur and
1576** there is a high probability of damage) Level 2 is the default. There
1577** is a very low but non-zero probability of damage. Level 3 reduces the
1578** probability of damage to near zero but with a write performance reduction.
1579*/
danielk197793758c82005-01-21 08:13:14 +00001580#ifndef SQLITE_OMIT_PAGER_PRAGMAS
drhac530b12006-02-11 01:25:50 +00001581int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
danielk1977aef0bf62005-12-30 16:28:01 +00001582 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001583 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001584 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001585 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
drhd677b3d2007-08-20 22:48:41 +00001586 sqlite3BtreeLeave(p);
drh973b6e32003-02-12 14:09:42 +00001587 return SQLITE_OK;
1588}
danielk197793758c82005-01-21 08:13:14 +00001589#endif
drh973b6e32003-02-12 14:09:42 +00001590
drh2c8997b2005-08-27 16:36:48 +00001591/*
1592** Return TRUE if the given btree is set to safety level 1. In other
1593** words, return TRUE if no sync() occurs on the disk files.
1594*/
danielk1977aef0bf62005-12-30 16:28:01 +00001595int sqlite3BtreeSyncDisabled(Btree *p){
1596 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001597 int rc;
drhe5fe6902007-12-07 18:55:28 +00001598 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001599 sqlite3BtreeEnter(p);
drhd0679ed2007-08-28 22:24:34 +00001600 assert( pBt && pBt->pPager );
drhd677b3d2007-08-20 22:48:41 +00001601 rc = sqlite3PagerNosync(pBt->pPager);
1602 sqlite3BtreeLeave(p);
1603 return rc;
drh2c8997b2005-08-27 16:36:48 +00001604}
1605
danielk1977576ec6b2005-01-21 11:55:25 +00001606#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
drh973b6e32003-02-12 14:09:42 +00001607/*
drh90f5ecb2004-07-22 01:19:35 +00001608** Change the default pages size and the number of reserved bytes per page.
drh06f50212004-11-02 14:24:33 +00001609**
1610** The page size must be a power of 2 between 512 and 65536. If the page
1611** size supplied does not meet this constraint then the page size is not
1612** changed.
1613**
1614** Page sizes are constrained to be a power of two so that the region
1615** of the database file used for locking (beginning at PENDING_BYTE,
1616** the first byte past the 1GB boundary, 0x40000000) needs to occur
1617** at the beginning of a page.
danielk197728129562005-01-11 10:25:06 +00001618**
1619** If parameter nReserve is less than zero, then the number of reserved
1620** bytes per page is left unchanged.
drh90f5ecb2004-07-22 01:19:35 +00001621*/
danielk1977aef0bf62005-12-30 16:28:01 +00001622int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
danielk1977a1644fd2007-08-29 12:31:25 +00001623 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00001624 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001625 sqlite3BtreeEnter(p);
drh90f5ecb2004-07-22 01:19:35 +00001626 if( pBt->pageSizeFixed ){
drhd677b3d2007-08-20 22:48:41 +00001627 sqlite3BtreeLeave(p);
drh90f5ecb2004-07-22 01:19:35 +00001628 return SQLITE_READONLY;
1629 }
1630 if( nReserve<0 ){
1631 nReserve = pBt->pageSize - pBt->usableSize;
1632 }
drh06f50212004-11-02 14:24:33 +00001633 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1634 ((pageSize-1)&pageSize)==0 ){
drh07d183d2005-05-01 22:52:42 +00001635 assert( (pageSize & 7)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00001636 assert( !pBt->pPage1 && !pBt->pCursor );
danielk1977a1644fd2007-08-29 12:31:25 +00001637 pBt->pageSize = pageSize;
drhf7141992008-06-19 00:16:08 +00001638 freeTempSpace(pBt);
danielk1977a1644fd2007-08-29 12:31:25 +00001639 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drh90f5ecb2004-07-22 01:19:35 +00001640 }
1641 pBt->usableSize = pBt->pageSize - nReserve;
drhd677b3d2007-08-20 22:48:41 +00001642 sqlite3BtreeLeave(p);
danielk1977a1644fd2007-08-29 12:31:25 +00001643 return rc;
drh90f5ecb2004-07-22 01:19:35 +00001644}
1645
1646/*
1647** Return the currently defined page size
1648*/
danielk1977aef0bf62005-12-30 16:28:01 +00001649int sqlite3BtreeGetPageSize(Btree *p){
1650 return p->pBt->pageSize;
drh90f5ecb2004-07-22 01:19:35 +00001651}
danielk1977aef0bf62005-12-30 16:28:01 +00001652int sqlite3BtreeGetReserve(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00001653 int n;
1654 sqlite3BtreeEnter(p);
1655 n = p->pBt->pageSize - p->pBt->usableSize;
1656 sqlite3BtreeLeave(p);
1657 return n;
drh2011d5f2004-07-22 02:40:37 +00001658}
drhf8e632b2007-05-08 14:51:36 +00001659
1660/*
1661** Set the maximum page count for a database if mxPage is positive.
1662** No changes are made if mxPage is 0 or negative.
1663** Regardless of the value of mxPage, return the maximum page count.
1664*/
1665int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
drhd677b3d2007-08-20 22:48:41 +00001666 int n;
1667 sqlite3BtreeEnter(p);
1668 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1669 sqlite3BtreeLeave(p);
1670 return n;
drhf8e632b2007-05-08 14:51:36 +00001671}
danielk1977576ec6b2005-01-21 11:55:25 +00001672#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
drh90f5ecb2004-07-22 01:19:35 +00001673
1674/*
danielk1977951af802004-11-05 15:45:09 +00001675** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1676** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1677** is disabled. The default value for the auto-vacuum property is
1678** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1679*/
danielk1977aef0bf62005-12-30 16:28:01 +00001680int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
danielk1977951af802004-11-05 15:45:09 +00001681#ifdef SQLITE_OMIT_AUTOVACUUM
drheee46cf2004-11-06 00:02:48 +00001682 return SQLITE_READONLY;
danielk1977951af802004-11-05 15:45:09 +00001683#else
danielk1977dddbcdc2007-04-26 14:42:34 +00001684 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001685 int rc = SQLITE_OK;
danielk1977dddbcdc2007-04-26 14:42:34 +00001686 int av = (autoVacuum?1:0);
drhd677b3d2007-08-20 22:48:41 +00001687
1688 sqlite3BtreeEnter(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001689 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00001690 rc = SQLITE_READONLY;
1691 }else{
1692 pBt->autoVacuum = av;
danielk1977951af802004-11-05 15:45:09 +00001693 }
drhd677b3d2007-08-20 22:48:41 +00001694 sqlite3BtreeLeave(p);
1695 return rc;
danielk1977951af802004-11-05 15:45:09 +00001696#endif
1697}
1698
1699/*
1700** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1701** enabled 1 is returned. Otherwise 0.
1702*/
danielk1977aef0bf62005-12-30 16:28:01 +00001703int sqlite3BtreeGetAutoVacuum(Btree *p){
danielk1977951af802004-11-05 15:45:09 +00001704#ifdef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001705 return BTREE_AUTOVACUUM_NONE;
danielk1977951af802004-11-05 15:45:09 +00001706#else
drhd677b3d2007-08-20 22:48:41 +00001707 int rc;
1708 sqlite3BtreeEnter(p);
1709 rc = (
danielk1977dddbcdc2007-04-26 14:42:34 +00001710 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1711 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1712 BTREE_AUTOVACUUM_INCR
1713 );
drhd677b3d2007-08-20 22:48:41 +00001714 sqlite3BtreeLeave(p);
1715 return rc;
danielk1977951af802004-11-05 15:45:09 +00001716#endif
1717}
1718
1719
1720/*
drha34b6762004-05-07 13:30:42 +00001721** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +00001722** also acquire a readlock on that file.
1723**
1724** SQLITE_OK is returned on success. If the file is not a
1725** well-formed database file, then SQLITE_CORRUPT is returned.
1726** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
drh4f0ee682007-03-30 20:43:40 +00001727** is returned if we run out of memory.
drh306dc212001-05-21 13:45:10 +00001728*/
danielk1977aef0bf62005-12-30 16:28:01 +00001729static int lockBtree(BtShared *pBt){
danielk1977f653d782008-03-20 11:04:21 +00001730 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001731 MemPage *pPage1;
danielk197793f7af92008-05-09 16:57:50 +00001732 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001733
drh1fee73e2007-08-29 04:00:57 +00001734 assert( sqlite3_mutex_held(pBt->mutex) );
drha34b6762004-05-07 13:30:42 +00001735 if( pBt->pPage1 ) return SQLITE_OK;
drh16a9b832007-05-05 18:39:25 +00001736 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
drh306dc212001-05-21 13:45:10 +00001737 if( rc!=SQLITE_OK ) return rc;
drh306dc212001-05-21 13:45:10 +00001738
1739 /* Do some checking to help insure the file we opened really is
1740 ** a valid database file.
1741 */
danielk1977ad0132d2008-06-07 08:58:22 +00001742 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1743 if( rc!=SQLITE_OK ){
danielk197793f7af92008-05-09 16:57:50 +00001744 goto page1_init_failed;
1745 }else if( nPage>0 ){
danielk1977f653d782008-03-20 11:04:21 +00001746 int pageSize;
1747 int usableSize;
drhb6f41482004-05-14 01:58:11 +00001748 u8 *page1 = pPage1->aData;
danielk1977ad0132d2008-06-07 08:58:22 +00001749 rc = SQLITE_NOTADB;
drhb6f41482004-05-14 01:58:11 +00001750 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00001751 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00001752 }
drh309169a2007-04-24 17:27:51 +00001753 if( page1[18]>1 ){
1754 pBt->readOnly = 1;
1755 }
1756 if( page1[19]>1 ){
drhb6f41482004-05-14 01:58:11 +00001757 goto page1_init_failed;
1758 }
drhe5ae5732008-06-15 02:51:47 +00001759
1760 /* The maximum embedded fraction must be exactly 25%. And the minimum
1761 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1762 ** The original design allowed these amounts to vary, but as of
1763 ** version 3.6.0, we require them to be fixed.
1764 */
1765 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1766 goto page1_init_failed;
1767 }
drh07d183d2005-05-01 22:52:42 +00001768 pageSize = get2byte(&page1[16]);
drh7dc385e2007-09-06 23:39:36 +00001769 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1770 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1771 ){
drh07d183d2005-05-01 22:52:42 +00001772 goto page1_init_failed;
1773 }
1774 assert( (pageSize & 7)==0 );
danielk1977f653d782008-03-20 11:04:21 +00001775 usableSize = pageSize - page1[20];
1776 if( pageSize!=pBt->pageSize ){
1777 /* After reading the first page of the database assuming a page size
1778 ** of BtShared.pageSize, we have discovered that the page-size is
1779 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1780 ** zero and return SQLITE_OK. The caller will call this function
1781 ** again with the correct page-size.
1782 */
1783 releasePage(pPage1);
1784 pBt->usableSize = usableSize;
1785 pBt->pageSize = pageSize;
drhf7141992008-06-19 00:16:08 +00001786 freeTempSpace(pBt);
danielk1977f653d782008-03-20 11:04:21 +00001787 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1788 return SQLITE_OK;
1789 }
1790 if( usableSize<500 ){
drhb6f41482004-05-14 01:58:11 +00001791 goto page1_init_failed;
1792 }
danielk1977f653d782008-03-20 11:04:21 +00001793 pBt->pageSize = pageSize;
1794 pBt->usableSize = usableSize;
drh057cd3a2005-02-15 16:23:02 +00001795#ifndef SQLITE_OMIT_AUTOVACUUM
1796 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
danielk197727b1f952007-06-25 08:16:58 +00001797 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
drh057cd3a2005-02-15 16:23:02 +00001798#endif
drh306dc212001-05-21 13:45:10 +00001799 }
drhb6f41482004-05-14 01:58:11 +00001800
1801 /* maxLocal is the maximum amount of payload to store locally for
1802 ** a cell. Make sure it is small enough so that at least minFanout
1803 ** cells can will fit on one page. We assume a 10-byte page header.
1804 ** Besides the payload, the cell must store:
drh43605152004-05-29 21:46:49 +00001805 ** 2-byte pointer to the cell
drhb6f41482004-05-14 01:58:11 +00001806 ** 4-byte child pointer
1807 ** 9-byte nKey value
1808 ** 4-byte nData value
1809 ** 4-byte overflow page pointer
drh43605152004-05-29 21:46:49 +00001810 ** So a cell consists of a 2-byte poiner, a header which is as much as
1811 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1812 ** page pointer.
drhb6f41482004-05-14 01:58:11 +00001813 */
drhe5ae5732008-06-15 02:51:47 +00001814 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1815 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
drh43605152004-05-29 21:46:49 +00001816 pBt->maxLeaf = pBt->usableSize - 35;
drhe5ae5732008-06-15 02:51:47 +00001817 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
drh2e38c322004-09-03 18:38:44 +00001818 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00001819 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00001820 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00001821
drh72f82862001-05-24 21:06:34 +00001822page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00001823 releasePage(pPage1);
1824 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00001825 return rc;
drh306dc212001-05-21 13:45:10 +00001826}
1827
1828/*
drhb8ef32c2005-03-14 02:01:49 +00001829** This routine works like lockBtree() except that it also invokes the
1830** busy callback if there is lock contention.
1831*/
danielk1977aef0bf62005-12-30 16:28:01 +00001832static int lockBtreeWithRetry(Btree *pRef){
drhb8ef32c2005-03-14 02:01:49 +00001833 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00001834
drh1fee73e2007-08-29 04:00:57 +00001835 assert( sqlite3BtreeHoldsMutex(pRef) );
danielk1977aef0bf62005-12-30 16:28:01 +00001836 if( pRef->inTrans==TRANS_NONE ){
1837 u8 inTransaction = pRef->pBt->inTransaction;
1838 btreeIntegrity(pRef);
1839 rc = sqlite3BtreeBeginTrans(pRef, 0);
1840 pRef->pBt->inTransaction = inTransaction;
1841 pRef->inTrans = TRANS_NONE;
1842 if( rc==SQLITE_OK ){
1843 pRef->pBt->nTransaction--;
1844 }
1845 btreeIntegrity(pRef);
drhb8ef32c2005-03-14 02:01:49 +00001846 }
1847 return rc;
1848}
1849
1850
1851/*
drhb8ca3072001-12-05 00:21:20 +00001852** If there are no outstanding cursors and we are not in the middle
1853** of a transaction but there is a read lock on the database, then
1854** this routine unrefs the first page of the database file which
1855** has the effect of releasing the read lock.
1856**
1857** If there are any outstanding cursors, this routine is a no-op.
1858**
1859** If there is a transaction in progress, this routine is a no-op.
1860*/
danielk1977aef0bf62005-12-30 16:28:01 +00001861static void unlockBtreeIfUnused(BtShared *pBt){
drh1fee73e2007-08-29 04:00:57 +00001862 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00001863 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00001864 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
drhde4fcfd2008-01-19 23:50:26 +00001865 assert( pBt->pPage1->aData );
1866#if 0
drh24c9a2e2007-01-05 02:00:47 +00001867 if( pBt->pPage1->aData==0 ){
1868 MemPage *pPage = pBt->pPage1;
drhbf4bca52007-09-06 22:19:14 +00001869 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
drh24c9a2e2007-01-05 02:00:47 +00001870 pPage->pBt = pBt;
1871 pPage->pgno = 1;
1872 }
drhde4fcfd2008-01-19 23:50:26 +00001873#endif
drh24c9a2e2007-01-05 02:00:47 +00001874 releasePage(pBt->pPage1);
drh51c6d962004-06-06 00:42:25 +00001875 }
drh3aac2dd2004-04-26 14:10:20 +00001876 pBt->pPage1 = 0;
drh3aac2dd2004-04-26 14:10:20 +00001877 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001878 }
1879}
1880
1881/*
drh9e572e62004-04-23 23:43:10 +00001882** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001883** file.
drh8b2f49b2001-06-08 00:21:52 +00001884*/
danielk1977aef0bf62005-12-30 16:28:01 +00001885static int newDatabase(BtShared *pBt){
drh9e572e62004-04-23 23:43:10 +00001886 MemPage *pP1;
1887 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001888 int rc;
danielk1977ad0132d2008-06-07 08:58:22 +00001889 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001890
drh1fee73e2007-08-29 04:00:57 +00001891 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977ad0132d2008-06-07 08:58:22 +00001892 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1893 if( rc!=SQLITE_OK || nPage>0 ){
1894 return rc;
1895 }
drh3aac2dd2004-04-26 14:10:20 +00001896 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001897 assert( pP1!=0 );
1898 data = pP1->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00001899 rc = sqlite3PagerWrite(pP1->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00001900 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001901 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1902 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00001903 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00001904 data[18] = 1;
1905 data[19] = 1;
drhb6f41482004-05-14 01:58:11 +00001906 data[20] = pBt->pageSize - pBt->usableSize;
drhe5ae5732008-06-15 02:51:47 +00001907 data[21] = 64;
1908 data[22] = 32;
1909 data[23] = 32;
drhb6f41482004-05-14 01:58:11 +00001910 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00001911 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drhf2a611c2004-09-05 00:33:43 +00001912 pBt->pageSizeFixed = 1;
danielk1977003ba062004-11-04 02:57:33 +00001913#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001914 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
danielk1977418899a2007-06-24 10:14:00 +00001915 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
danielk1977dddbcdc2007-04-26 14:42:34 +00001916 put4byte(&data[36 + 4*4], pBt->autoVacuum);
danielk1977418899a2007-06-24 10:14:00 +00001917 put4byte(&data[36 + 7*4], pBt->incrVacuum);
danielk1977003ba062004-11-04 02:57:33 +00001918#endif
drh8b2f49b2001-06-08 00:21:52 +00001919 return SQLITE_OK;
1920}
1921
1922/*
danielk1977ee5741e2004-05-31 10:01:34 +00001923** Attempt to start a new transaction. A write-transaction
drh684917c2004-10-05 02:41:42 +00001924** is started if the second argument is nonzero, otherwise a read-
1925** transaction. If the second argument is 2 or more and exclusive
1926** transaction is started, meaning that no other process is allowed
1927** to access the database. A preexisting transaction may not be
drhb8ef32c2005-03-14 02:01:49 +00001928** upgraded to exclusive by calling this routine a second time - the
drh684917c2004-10-05 02:41:42 +00001929** exclusivity flag only works for a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001930**
danielk1977ee5741e2004-05-31 10:01:34 +00001931** A write-transaction must be started before attempting any
1932** changes to the database. None of the following routines
1933** will work unless a transaction is started first:
drh8b2f49b2001-06-08 00:21:52 +00001934**
drh23e11ca2004-05-04 17:27:28 +00001935** sqlite3BtreeCreateTable()
1936** sqlite3BtreeCreateIndex()
1937** sqlite3BtreeClearTable()
1938** sqlite3BtreeDropTable()
1939** sqlite3BtreeInsert()
1940** sqlite3BtreeDelete()
1941** sqlite3BtreeUpdateMeta()
danielk197713adf8a2004-06-03 16:08:41 +00001942**
drhb8ef32c2005-03-14 02:01:49 +00001943** If an initial attempt to acquire the lock fails because of lock contention
1944** and the database was previously unlocked, then invoke the busy handler
1945** if there is one. But if there was previously a read-lock, do not
1946** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1947** returned when there is already a read-lock in order to avoid a deadlock.
1948**
1949** Suppose there are two processes A and B. A has a read lock and B has
1950** a reserved lock. B tries to promote to exclusive but is blocked because
1951** of A's read lock. A tries to promote to reserved but is blocked by B.
1952** One or the other of the two processes must give way or there can be
1953** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1954** when A already has a read lock, we encourage A to give up and let B
1955** proceed.
drha059ad02001-04-17 20:09:11 +00001956*/
danielk1977aef0bf62005-12-30 16:28:01 +00001957int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1958 BtShared *pBt = p->pBt;
danielk1977ee5741e2004-05-31 10:01:34 +00001959 int rc = SQLITE_OK;
1960
drhd677b3d2007-08-20 22:48:41 +00001961 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001962 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001963 btreeIntegrity(p);
1964
danielk1977ee5741e2004-05-31 10:01:34 +00001965 /* If the btree is already in a write-transaction, or it
1966 ** is already in a read-transaction and a read-transaction
1967 ** is requested, this is a no-op.
1968 */
danielk1977aef0bf62005-12-30 16:28:01 +00001969 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
drhd677b3d2007-08-20 22:48:41 +00001970 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001971 }
drhb8ef32c2005-03-14 02:01:49 +00001972
1973 /* Write transactions are not possible on a read-only database */
danielk1977ee5741e2004-05-31 10:01:34 +00001974 if( pBt->readOnly && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00001975 rc = SQLITE_READONLY;
1976 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001977 }
1978
danielk1977aef0bf62005-12-30 16:28:01 +00001979 /* If another database handle has already opened a write transaction
1980 ** on this shared-btree structure and a second write transaction is
1981 ** requested, return SQLITE_BUSY.
1982 */
1983 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00001984 rc = SQLITE_BUSY;
1985 goto trans_begun;
danielk1977aef0bf62005-12-30 16:28:01 +00001986 }
1987
danielk1977641b0f42007-12-21 04:47:25 +00001988#ifndef SQLITE_OMIT_SHARED_CACHE
1989 if( wrflag>1 ){
1990 BtLock *pIter;
1991 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1992 if( pIter->pBtree!=p ){
1993 rc = SQLITE_BUSY;
1994 goto trans_begun;
1995 }
1996 }
1997 }
1998#endif
1999
drhb8ef32c2005-03-14 02:01:49 +00002000 do {
drh8a9c17f2008-05-02 14:23:54 +00002001 if( pBt->pPage1==0 ){
2002 do{
2003 rc = lockBtree(pBt);
2004 }while( pBt->pPage1==0 && rc==SQLITE_OK );
drh8c42ca92001-06-22 19:15:00 +00002005 }
drh309169a2007-04-24 17:27:51 +00002006
drhb8ef32c2005-03-14 02:01:49 +00002007 if( rc==SQLITE_OK && wrflag ){
drh309169a2007-04-24 17:27:51 +00002008 if( pBt->readOnly ){
2009 rc = SQLITE_READONLY;
2010 }else{
2011 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2012 if( rc==SQLITE_OK ){
2013 rc = newDatabase(pBt);
2014 }
drhb8ef32c2005-03-14 02:01:49 +00002015 }
2016 }
2017
2018 if( rc==SQLITE_OK ){
drhb8ef32c2005-03-14 02:01:49 +00002019 if( wrflag ) pBt->inStmt = 0;
2020 }else{
2021 unlockBtreeIfUnused(pBt);
2022 }
danielk1977aef0bf62005-12-30 16:28:01 +00002023 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
drhe5fe6902007-12-07 18:55:28 +00002024 sqlite3BtreeInvokeBusyHandler(pBt, 0) );
danielk1977aef0bf62005-12-30 16:28:01 +00002025
2026 if( rc==SQLITE_OK ){
2027 if( p->inTrans==TRANS_NONE ){
2028 pBt->nTransaction++;
2029 }
2030 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2031 if( p->inTrans>pBt->inTransaction ){
2032 pBt->inTransaction = p->inTrans;
2033 }
danielk1977641b0f42007-12-21 04:47:25 +00002034#ifndef SQLITE_OMIT_SHARED_CACHE
2035 if( wrflag>1 ){
2036 assert( !pBt->pExclusive );
2037 pBt->pExclusive = p;
2038 }
2039#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002040 }
2041
drhd677b3d2007-08-20 22:48:41 +00002042
2043trans_begun:
danielk1977aef0bf62005-12-30 16:28:01 +00002044 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002045 sqlite3BtreeLeave(p);
drhb8ca3072001-12-05 00:21:20 +00002046 return rc;
drha059ad02001-04-17 20:09:11 +00002047}
2048
danielk1977687566d2004-11-02 12:56:41 +00002049#ifndef SQLITE_OMIT_AUTOVACUUM
2050
2051/*
2052** Set the pointer-map entries for all children of page pPage. Also, if
2053** pPage contains cells that point to overflow pages, set the pointer
2054** map entries for the overflow pages as well.
2055*/
2056static int setChildPtrmaps(MemPage *pPage){
2057 int i; /* Counter variable */
2058 int nCell; /* Number of cells in page pPage */
danielk19772df71c72007-05-24 07:22:42 +00002059 int rc; /* Return code */
danielk1977aef0bf62005-12-30 16:28:01 +00002060 BtShared *pBt = pPage->pBt;
danielk1977687566d2004-11-02 12:56:41 +00002061 int isInitOrig = pPage->isInit;
2062 Pgno pgno = pPage->pgno;
2063
drh1fee73e2007-08-29 04:00:57 +00002064 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00002065 rc = sqlite3BtreeInitPage(pPage);
danielk19772df71c72007-05-24 07:22:42 +00002066 if( rc!=SQLITE_OK ){
2067 goto set_child_ptrmaps_out;
2068 }
danielk1977687566d2004-11-02 12:56:41 +00002069 nCell = pPage->nCell;
2070
2071 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002072 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002073
danielk197726836652005-01-17 01:33:13 +00002074 rc = ptrmapPutOvflPtr(pPage, pCell);
2075 if( rc!=SQLITE_OK ){
2076 goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002077 }
danielk197726836652005-01-17 01:33:13 +00002078
danielk1977687566d2004-11-02 12:56:41 +00002079 if( !pPage->leaf ){
2080 Pgno childPgno = get4byte(pCell);
2081 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
danielk197700a696d2008-09-29 16:41:31 +00002082 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002083 }
2084 }
2085
2086 if( !pPage->leaf ){
2087 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2088 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2089 }
2090
2091set_child_ptrmaps_out:
2092 pPage->isInit = isInitOrig;
2093 return rc;
2094}
2095
2096/*
2097** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2098** page, is a pointer to page iFrom. Modify this pointer so that it points to
2099** iTo. Parameter eType describes the type of pointer to be modified, as
2100** follows:
2101**
2102** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2103** page of pPage.
2104**
2105** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2106** page pointed to by one of the cells on pPage.
2107**
2108** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2109** overflow page in the list.
2110*/
danielk1977fdb7cdb2005-01-17 02:12:18 +00002111static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
drh1fee73e2007-08-29 04:00:57 +00002112 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk1977687566d2004-11-02 12:56:41 +00002113 if( eType==PTRMAP_OVERFLOW2 ){
danielk1977f78fc082004-11-02 14:40:32 +00002114 /* The pointer is always the first 4 bytes of the page in this case. */
danielk1977fdb7cdb2005-01-17 02:12:18 +00002115 if( get4byte(pPage->aData)!=iFrom ){
drh49285702005-09-17 15:20:26 +00002116 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002117 }
danielk1977f78fc082004-11-02 14:40:32 +00002118 put4byte(pPage->aData, iTo);
danielk1977687566d2004-11-02 12:56:41 +00002119 }else{
2120 int isInitOrig = pPage->isInit;
2121 int i;
2122 int nCell;
2123
danielk197771d5d2c2008-09-29 11:49:47 +00002124 sqlite3BtreeInitPage(pPage);
danielk1977687566d2004-11-02 12:56:41 +00002125 nCell = pPage->nCell;
2126
danielk1977687566d2004-11-02 12:56:41 +00002127 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002128 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002129 if( eType==PTRMAP_OVERFLOW1 ){
2130 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00002131 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
danielk1977687566d2004-11-02 12:56:41 +00002132 if( info.iOverflow ){
2133 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2134 put4byte(&pCell[info.iOverflow], iTo);
2135 break;
2136 }
2137 }
2138 }else{
2139 if( get4byte(pCell)==iFrom ){
2140 put4byte(pCell, iTo);
2141 break;
2142 }
2143 }
2144 }
2145
2146 if( i==nCell ){
danielk1977fdb7cdb2005-01-17 02:12:18 +00002147 if( eType!=PTRMAP_BTREE ||
2148 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
drh49285702005-09-17 15:20:26 +00002149 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002150 }
danielk1977687566d2004-11-02 12:56:41 +00002151 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2152 }
2153
2154 pPage->isInit = isInitOrig;
2155 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002156 return SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002157}
2158
danielk1977003ba062004-11-04 02:57:33 +00002159
danielk19777701e812005-01-10 12:59:51 +00002160/*
2161** Move the open database page pDbPage to location iFreePage in the
2162** database. The pDbPage reference remains valid.
2163*/
danielk1977003ba062004-11-04 02:57:33 +00002164static int relocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00002165 BtShared *pBt, /* Btree */
danielk19777701e812005-01-10 12:59:51 +00002166 MemPage *pDbPage, /* Open page to move */
2167 u8 eType, /* Pointer map 'type' entry for pDbPage */
2168 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
danielk19774c999992008-07-16 18:17:55 +00002169 Pgno iFreePage, /* The location to move pDbPage to */
2170 int isCommit
danielk1977003ba062004-11-04 02:57:33 +00002171){
2172 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2173 Pgno iDbPage = pDbPage->pgno;
2174 Pager *pPager = pBt->pPager;
2175 int rc;
2176
danielk1977a0bf2652004-11-04 14:30:04 +00002177 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2178 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
drh1fee73e2007-08-29 04:00:57 +00002179 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +00002180 assert( pDbPage->pBt==pBt );
danielk1977003ba062004-11-04 02:57:33 +00002181
drh85b623f2007-12-13 21:54:09 +00002182 /* Move page iDbPage from its current location to page number iFreePage */
danielk1977003ba062004-11-04 02:57:33 +00002183 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2184 iDbPage, iFreePage, iPtrPage, eType));
danielk19774c999992008-07-16 18:17:55 +00002185 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
danielk1977003ba062004-11-04 02:57:33 +00002186 if( rc!=SQLITE_OK ){
2187 return rc;
2188 }
2189 pDbPage->pgno = iFreePage;
2190
2191 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2192 ** that point to overflow pages. The pointer map entries for all these
2193 ** pages need to be changed.
2194 **
2195 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2196 ** pointer to a subsequent overflow page. If this is the case, then
2197 ** the pointer map needs to be updated for the subsequent overflow page.
2198 */
danielk1977a0bf2652004-11-04 14:30:04 +00002199 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00002200 rc = setChildPtrmaps(pDbPage);
2201 if( rc!=SQLITE_OK ){
2202 return rc;
2203 }
2204 }else{
2205 Pgno nextOvfl = get4byte(pDbPage->aData);
2206 if( nextOvfl!=0 ){
danielk1977003ba062004-11-04 02:57:33 +00002207 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2208 if( rc!=SQLITE_OK ){
2209 return rc;
2210 }
2211 }
2212 }
2213
2214 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2215 ** that it points at iFreePage. Also fix the pointer map entry for
2216 ** iPtrPage.
2217 */
danielk1977a0bf2652004-11-04 14:30:04 +00002218 if( eType!=PTRMAP_ROOTPAGE ){
drh16a9b832007-05-05 18:39:25 +00002219 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00002220 if( rc!=SQLITE_OK ){
2221 return rc;
2222 }
danielk19773b8a05f2007-03-19 17:44:26 +00002223 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
danielk1977a0bf2652004-11-04 14:30:04 +00002224 if( rc!=SQLITE_OK ){
2225 releasePage(pPtrPage);
2226 return rc;
2227 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002228 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
danielk1977003ba062004-11-04 02:57:33 +00002229 releasePage(pPtrPage);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002230 if( rc==SQLITE_OK ){
2231 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2232 }
danielk1977003ba062004-11-04 02:57:33 +00002233 }
danielk1977003ba062004-11-04 02:57:33 +00002234 return rc;
2235}
2236
danielk1977dddbcdc2007-04-26 14:42:34 +00002237/* Forward declaration required by incrVacuumStep(). */
drh4f0c5872007-03-26 22:05:01 +00002238static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
danielk1977687566d2004-11-02 12:56:41 +00002239
2240/*
danielk1977dddbcdc2007-04-26 14:42:34 +00002241** Perform a single step of an incremental-vacuum. If successful,
2242** return SQLITE_OK. If there is no work to do (and therefore no
2243** point in calling this function again), return SQLITE_DONE.
2244**
2245** More specificly, this function attempts to re-organize the
2246** database so that the last page of the file currently in use
2247** is no longer in use.
2248**
2249** If the nFin parameter is non-zero, the implementation assumes
2250** that the caller will keep calling incrVacuumStep() until
2251** it returns SQLITE_DONE or an error, and that nFin is the
2252** number of pages the database file will contain after this
2253** process is complete.
2254*/
2255static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2256 Pgno iLastPg; /* Last page in the database */
2257 Pgno nFreeList; /* Number of pages still on the free-list */
2258
drh1fee73e2007-08-29 04:00:57 +00002259 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977dddbcdc2007-04-26 14:42:34 +00002260 iLastPg = pBt->nTrunc;
2261 if( iLastPg==0 ){
danielk1977ad0132d2008-06-07 08:58:22 +00002262 iLastPg = pagerPagecount(pBt->pPager);
danielk1977dddbcdc2007-04-26 14:42:34 +00002263 }
2264
2265 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2266 int rc;
2267 u8 eType;
2268 Pgno iPtrPage;
2269
2270 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2271 if( nFreeList==0 || nFin==iLastPg ){
2272 return SQLITE_DONE;
2273 }
2274
2275 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2276 if( rc!=SQLITE_OK ){
2277 return rc;
2278 }
2279 if( eType==PTRMAP_ROOTPAGE ){
2280 return SQLITE_CORRUPT_BKPT;
2281 }
2282
2283 if( eType==PTRMAP_FREEPAGE ){
2284 if( nFin==0 ){
2285 /* Remove the page from the files free-list. This is not required
danielk19774ef24492007-05-23 09:52:41 +00002286 ** if nFin is non-zero. In that case, the free-list will be
danielk1977dddbcdc2007-04-26 14:42:34 +00002287 ** truncated to zero after this function returns, so it doesn't
2288 ** matter if it still contains some garbage entries.
2289 */
2290 Pgno iFreePg;
2291 MemPage *pFreePg;
2292 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2293 if( rc!=SQLITE_OK ){
2294 return rc;
2295 }
2296 assert( iFreePg==iLastPg );
2297 releasePage(pFreePg);
2298 }
2299 } else {
2300 Pgno iFreePg; /* Index of free page to move pLastPg to */
2301 MemPage *pLastPg;
2302
drh16a9b832007-05-05 18:39:25 +00002303 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002304 if( rc!=SQLITE_OK ){
2305 return rc;
2306 }
2307
danielk1977b4626a32007-04-28 15:47:43 +00002308 /* If nFin is zero, this loop runs exactly once and page pLastPg
2309 ** is swapped with the first free page pulled off the free list.
2310 **
2311 ** On the other hand, if nFin is greater than zero, then keep
2312 ** looping until a free-page located within the first nFin pages
2313 ** of the file is found.
2314 */
danielk1977dddbcdc2007-04-26 14:42:34 +00002315 do {
2316 MemPage *pFreePg;
2317 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2318 if( rc!=SQLITE_OK ){
2319 releasePage(pLastPg);
2320 return rc;
2321 }
2322 releasePage(pFreePg);
2323 }while( nFin!=0 && iFreePg>nFin );
2324 assert( iFreePg<iLastPg );
danielk1977b4626a32007-04-28 15:47:43 +00002325
2326 rc = sqlite3PagerWrite(pLastPg->pDbPage);
danielk1977662278e2007-11-05 15:30:12 +00002327 if( rc==SQLITE_OK ){
danielk19774c999992008-07-16 18:17:55 +00002328 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
danielk1977662278e2007-11-05 15:30:12 +00002329 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002330 releasePage(pLastPg);
2331 if( rc!=SQLITE_OK ){
2332 return rc;
danielk1977662278e2007-11-05 15:30:12 +00002333 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002334 }
2335 }
2336
2337 pBt->nTrunc = iLastPg - 1;
2338 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2339 pBt->nTrunc--;
2340 }
2341 return SQLITE_OK;
2342}
2343
2344/*
2345** A write-transaction must be opened before calling this function.
2346** It performs a single unit of work towards an incremental vacuum.
2347**
2348** If the incremental vacuum is finished after this function has run,
2349** SQLITE_DONE is returned. If it is not finished, but no error occured,
2350** SQLITE_OK is returned. Otherwise an SQLite error code.
2351*/
2352int sqlite3BtreeIncrVacuum(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00002353 int rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002354 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002355
2356 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002357 pBt->db = p->db;
danielk1977dddbcdc2007-04-26 14:42:34 +00002358 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2359 if( !pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00002360 rc = SQLITE_DONE;
2361 }else{
2362 invalidateAllOverflowCache(pBt);
2363 rc = incrVacuumStep(pBt, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002364 }
drhd677b3d2007-08-20 22:48:41 +00002365 sqlite3BtreeLeave(p);
2366 return rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002367}
2368
2369/*
danielk19773b8a05f2007-03-19 17:44:26 +00002370** This routine is called prior to sqlite3PagerCommit when a transaction
danielk1977687566d2004-11-02 12:56:41 +00002371** is commited for an auto-vacuum database.
danielk197724168722007-04-02 05:07:47 +00002372**
2373** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2374** the database file should be truncated to during the commit process.
2375** i.e. the database has been reorganized so that only the first *pnTrunc
2376** pages are in use.
danielk1977687566d2004-11-02 12:56:41 +00002377*/
danielk197724168722007-04-02 05:07:47 +00002378static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
danielk1977dddbcdc2007-04-26 14:42:34 +00002379 int rc = SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002380 Pager *pPager = pBt->pPager;
drhf94a1732008-09-30 17:18:17 +00002381 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002382
drh1fee73e2007-08-29 04:00:57 +00002383 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +00002384 invalidateAllOverflowCache(pBt);
danielk1977dddbcdc2007-04-26 14:42:34 +00002385 assert(pBt->autoVacuum);
2386 if( !pBt->incrVacuum ){
2387 Pgno nFin = 0;
danielk1977687566d2004-11-02 12:56:41 +00002388
danielk1977dddbcdc2007-04-26 14:42:34 +00002389 if( pBt->nTrunc==0 ){
2390 Pgno nFree;
2391 Pgno nPtrmap;
2392 const int pgsz = pBt->pageSize;
danielk1977ad0132d2008-06-07 08:58:22 +00002393 int nOrig = pagerPagecount(pBt->pPager);
danielk1977e5321f02007-04-27 07:05:44 +00002394
2395 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2396 return SQLITE_CORRUPT_BKPT;
2397 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002398 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2399 nOrig--;
danielk1977687566d2004-11-02 12:56:41 +00002400 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002401 nFree = get4byte(&pBt->pPage1->aData[36]);
2402 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2403 nFin = nOrig - nFree - nPtrmap;
2404 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2405 nFin--;
danielk1977ac11ee62005-01-15 12:45:51 +00002406 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002407 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2408 nFin--;
2409 }
2410 }
danielk1977687566d2004-11-02 12:56:41 +00002411
danielk1977dddbcdc2007-04-26 14:42:34 +00002412 while( rc==SQLITE_OK ){
2413 rc = incrVacuumStep(pBt, nFin);
2414 }
2415 if( rc==SQLITE_DONE ){
2416 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2417 rc = SQLITE_OK;
danielk19770ba32df2008-05-07 07:13:16 +00002418 if( pBt->nTrunc && nFin ){
drh67f80b62007-07-23 19:26:17 +00002419 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
danielk1977dddbcdc2007-04-26 14:42:34 +00002420 put4byte(&pBt->pPage1->aData[32], 0);
2421 put4byte(&pBt->pPage1->aData[36], 0);
2422 pBt->nTrunc = nFin;
2423 }
2424 }
2425 if( rc!=SQLITE_OK ){
2426 sqlite3PagerRollback(pPager);
2427 }
danielk1977687566d2004-11-02 12:56:41 +00002428 }
2429
danielk1977dddbcdc2007-04-26 14:42:34 +00002430 if( rc==SQLITE_OK ){
2431 *pnTrunc = pBt->nTrunc;
2432 pBt->nTrunc = 0;
2433 }
danielk19773b8a05f2007-03-19 17:44:26 +00002434 assert( nRef==sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002435 return rc;
2436}
danielk1977dddbcdc2007-04-26 14:42:34 +00002437
shane831c3292008-11-10 17:14:58 +00002438#endif /* ifndef SQLITE_OMIT_AUTOVACUUM */
danielk1977687566d2004-11-02 12:56:41 +00002439
2440/*
drh80e35f42007-03-30 14:06:34 +00002441** This routine does the first phase of a two-phase commit. This routine
2442** causes a rollback journal to be created (if it does not already exist)
2443** and populated with enough information so that if a power loss occurs
2444** the database can be restored to its original state by playing back
2445** the journal. Then the contents of the journal are flushed out to
2446** the disk. After the journal is safely on oxide, the changes to the
2447** database are written into the database file and flushed to oxide.
2448** At the end of this call, the rollback journal still exists on the
2449** disk and we are still holding all locks, so the transaction has not
2450** committed. See sqlite3BtreeCommit() for the second phase of the
2451** commit process.
2452**
2453** This call is a no-op if no write-transaction is currently active on pBt.
2454**
2455** Otherwise, sync the database file for the btree pBt. zMaster points to
2456** the name of a master journal file that should be written into the
2457** individual journal file, or is NULL, indicating no master journal file
2458** (single database transaction).
2459**
2460** When this is called, the master journal should already have been
2461** created, populated with this journal pointer and synced to disk.
2462**
2463** Once this is routine has returned, the only thing required to commit
2464** the write-transaction for this database file is to delete the journal.
2465*/
2466int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2467 int rc = SQLITE_OK;
2468 if( p->inTrans==TRANS_WRITE ){
2469 BtShared *pBt = p->pBt;
2470 Pgno nTrunc = 0;
drhd677b3d2007-08-20 22:48:41 +00002471 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002472 pBt->db = p->db;
drh80e35f42007-03-30 14:06:34 +00002473#ifndef SQLITE_OMIT_AUTOVACUUM
2474 if( pBt->autoVacuum ){
2475 rc = autoVacuumCommit(pBt, &nTrunc);
2476 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002477 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002478 return rc;
2479 }
2480 }
2481#endif
danielk1977f653d782008-03-20 11:04:21 +00002482 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
drhd677b3d2007-08-20 22:48:41 +00002483 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002484 }
2485 return rc;
2486}
2487
2488/*
drh2aa679f2001-06-25 02:11:07 +00002489** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00002490**
drh6e345992007-03-30 11:12:08 +00002491** This routine implements the second phase of a 2-phase commit. The
2492** sqlite3BtreeSync() routine does the first phase and should be invoked
2493** prior to calling this routine. The sqlite3BtreeSync() routine did
2494** all the work of writing information out to disk and flushing the
2495** contents so that they are written onto the disk platter. All this
2496** routine has to do is delete or truncate the rollback journal
2497** (which causes the transaction to commit) and drop locks.
2498**
drh5e00f6c2001-09-13 13:46:56 +00002499** This will release the write lock on the database file. If there
2500** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002501*/
drh80e35f42007-03-30 14:06:34 +00002502int sqlite3BtreeCommitPhaseTwo(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00002503 BtShared *pBt = p->pBt;
2504
drhd677b3d2007-08-20 22:48:41 +00002505 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002506 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002507 btreeIntegrity(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002508
2509 /* If the handle has a write-transaction open, commit the shared-btrees
2510 ** transaction and set the shared state to TRANS_READ.
2511 */
2512 if( p->inTrans==TRANS_WRITE ){
danielk19777f7bc662006-01-23 13:47:47 +00002513 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002514 assert( pBt->inTransaction==TRANS_WRITE );
2515 assert( pBt->nTransaction>0 );
drh80e35f42007-03-30 14:06:34 +00002516 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
danielk19777f7bc662006-01-23 13:47:47 +00002517 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002518 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002519 return rc;
2520 }
danielk1977aef0bf62005-12-30 16:28:01 +00002521 pBt->inTransaction = TRANS_READ;
2522 pBt->inStmt = 0;
danielk1977ee5741e2004-05-31 10:01:34 +00002523 }
danielk19777f7bc662006-01-23 13:47:47 +00002524 unlockAllTables(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002525
2526 /* If the handle has any kind of transaction open, decrement the transaction
2527 ** count of the shared btree. If the transaction count reaches 0, set
2528 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2529 ** will unlock the pager.
2530 */
2531 if( p->inTrans!=TRANS_NONE ){
2532 pBt->nTransaction--;
2533 if( 0==pBt->nTransaction ){
2534 pBt->inTransaction = TRANS_NONE;
2535 }
2536 }
2537
2538 /* Set the handles current transaction state to TRANS_NONE and unlock
2539 ** the pager if this call closed the only read or write transaction.
2540 */
2541 p->inTrans = TRANS_NONE;
drh5e00f6c2001-09-13 13:46:56 +00002542 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002543
2544 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002545 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002546 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002547}
2548
drh80e35f42007-03-30 14:06:34 +00002549/*
2550** Do both phases of a commit.
2551*/
2552int sqlite3BtreeCommit(Btree *p){
2553 int rc;
drhd677b3d2007-08-20 22:48:41 +00002554 sqlite3BtreeEnter(p);
drh80e35f42007-03-30 14:06:34 +00002555 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2556 if( rc==SQLITE_OK ){
2557 rc = sqlite3BtreeCommitPhaseTwo(p);
2558 }
drhd677b3d2007-08-20 22:48:41 +00002559 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002560 return rc;
2561}
2562
danielk1977fbcd5852004-06-15 02:44:18 +00002563#ifndef NDEBUG
2564/*
2565** Return the number of write-cursors open on this handle. This is for use
2566** in assert() expressions, so it is only compiled if NDEBUG is not
2567** defined.
drhfb982642007-08-30 01:19:59 +00002568**
2569** For the purposes of this routine, a write-cursor is any cursor that
2570** is capable of writing to the databse. That means the cursor was
2571** originally opened for writing and the cursor has not be disabled
2572** by having its state changed to CURSOR_FAULT.
danielk1977fbcd5852004-06-15 02:44:18 +00002573*/
danielk1977aef0bf62005-12-30 16:28:01 +00002574static int countWriteCursors(BtShared *pBt){
danielk1977fbcd5852004-06-15 02:44:18 +00002575 BtCursor *pCur;
2576 int r = 0;
2577 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drhfb982642007-08-30 01:19:59 +00002578 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
danielk1977fbcd5852004-06-15 02:44:18 +00002579 }
2580 return r;
2581}
2582#endif
2583
drhc39e0002004-05-07 23:50:57 +00002584/*
drhfb982642007-08-30 01:19:59 +00002585** This routine sets the state to CURSOR_FAULT and the error
2586** code to errCode for every cursor on BtShared that pBtree
2587** references.
2588**
2589** Every cursor is tripped, including cursors that belong
2590** to other database connections that happen to be sharing
2591** the cache with pBtree.
2592**
2593** This routine gets called when a rollback occurs.
2594** All cursors using the same cache must be tripped
2595** to prevent them from trying to use the btree after
2596** the rollback. The rollback may have deleted tables
2597** or moved root pages, so it is not sufficient to
2598** save the state of the cursor. The cursor must be
2599** invalidated.
2600*/
2601void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2602 BtCursor *p;
2603 sqlite3BtreeEnter(pBtree);
2604 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
danielk1977be51a652008-10-08 17:58:48 +00002605 sqlite3BtreeClearCursor(p);
drhfb982642007-08-30 01:19:59 +00002606 p->eState = CURSOR_FAULT;
2607 p->skip = errCode;
2608 }
2609 sqlite3BtreeLeave(pBtree);
2610}
2611
2612/*
drhecdc7532001-09-23 02:35:53 +00002613** Rollback the transaction in progress. All cursors will be
2614** invalided by this operation. Any attempt to use a cursor
2615** that was open at the beginning of this operation will result
2616** in an error.
drh5e00f6c2001-09-13 13:46:56 +00002617**
2618** This will release the write lock on the database file. If there
2619** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002620*/
danielk1977aef0bf62005-12-30 16:28:01 +00002621int sqlite3BtreeRollback(Btree *p){
danielk19778d34dfd2006-01-24 16:37:57 +00002622 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002623 BtShared *pBt = p->pBt;
drh24cd67e2004-05-10 16:18:47 +00002624 MemPage *pPage1;
danielk1977aef0bf62005-12-30 16:28:01 +00002625
drhd677b3d2007-08-20 22:48:41 +00002626 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002627 pBt->db = p->db;
danielk19772b8c13e2006-01-24 14:21:24 +00002628 rc = saveAllCursors(pBt, 0, 0);
danielk19778d34dfd2006-01-24 16:37:57 +00002629#ifndef SQLITE_OMIT_SHARED_CACHE
danielk19772b8c13e2006-01-24 14:21:24 +00002630 if( rc!=SQLITE_OK ){
danielk19778d34dfd2006-01-24 16:37:57 +00002631 /* This is a horrible situation. An IO or malloc() error occured whilst
2632 ** trying to save cursor positions. If this is an automatic rollback (as
2633 ** the result of a constraint, malloc() failure or IO error) then
2634 ** the cache may be internally inconsistent (not contain valid trees) so
2635 ** we cannot simply return the error to the caller. Instead, abort
2636 ** all queries that may be using any of the cursors that failed to save.
2637 */
drhfb982642007-08-30 01:19:59 +00002638 sqlite3BtreeTripAllCursors(p, rc);
danielk19772b8c13e2006-01-24 14:21:24 +00002639 }
danielk19778d34dfd2006-01-24 16:37:57 +00002640#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002641 btreeIntegrity(p);
2642 unlockAllTables(p);
2643
2644 if( p->inTrans==TRANS_WRITE ){
danielk19778d34dfd2006-01-24 16:37:57 +00002645 int rc2;
danielk1977aef0bf62005-12-30 16:28:01 +00002646
danielk1977dddbcdc2007-04-26 14:42:34 +00002647#ifndef SQLITE_OMIT_AUTOVACUUM
2648 pBt->nTrunc = 0;
2649#endif
2650
danielk19778d34dfd2006-01-24 16:37:57 +00002651 assert( TRANS_WRITE==pBt->inTransaction );
danielk19773b8a05f2007-03-19 17:44:26 +00002652 rc2 = sqlite3PagerRollback(pBt->pPager);
danielk19778d34dfd2006-01-24 16:37:57 +00002653 if( rc2!=SQLITE_OK ){
2654 rc = rc2;
2655 }
2656
drh24cd67e2004-05-10 16:18:47 +00002657 /* The rollback may have destroyed the pPage1->aData value. So
drh16a9b832007-05-05 18:39:25 +00002658 ** call sqlite3BtreeGetPage() on page 1 again to make
2659 ** sure pPage1->aData is set correctly. */
2660 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
drh24cd67e2004-05-10 16:18:47 +00002661 releasePage(pPage1);
2662 }
danielk1977fbcd5852004-06-15 02:44:18 +00002663 assert( countWriteCursors(pBt)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00002664 pBt->inTransaction = TRANS_READ;
drh24cd67e2004-05-10 16:18:47 +00002665 }
danielk1977aef0bf62005-12-30 16:28:01 +00002666
2667 if( p->inTrans!=TRANS_NONE ){
2668 assert( pBt->nTransaction>0 );
2669 pBt->nTransaction--;
2670 if( 0==pBt->nTransaction ){
2671 pBt->inTransaction = TRANS_NONE;
2672 }
2673 }
2674
2675 p->inTrans = TRANS_NONE;
danielk1977ee5741e2004-05-31 10:01:34 +00002676 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00002677 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002678
2679 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002680 sqlite3BtreeLeave(p);
drha059ad02001-04-17 20:09:11 +00002681 return rc;
2682}
2683
2684/*
drhab01f612004-05-22 02:55:23 +00002685** Start a statement subtransaction. The subtransaction can
2686** can be rolled back independently of the main transaction.
2687** You must start a transaction before starting a subtransaction.
2688** The subtransaction is ended automatically if the main transaction
drh663fc632002-02-02 18:49:19 +00002689** commits or rolls back.
2690**
drhab01f612004-05-22 02:55:23 +00002691** Only one subtransaction may be active at a time. It is an error to try
2692** to start a new subtransaction if another subtransaction is already active.
2693**
2694** Statement subtransactions are used around individual SQL statements
2695** that are contained within a BEGIN...COMMIT block. If a constraint
2696** error occurs within the statement, the effect of that one statement
2697** can be rolled back without having to rollback the entire transaction.
drh663fc632002-02-02 18:49:19 +00002698*/
danielk1977aef0bf62005-12-30 16:28:01 +00002699int sqlite3BtreeBeginStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002700 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002701 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002702 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002703 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002704 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
drhd677b3d2007-08-20 22:48:41 +00002705 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2706 }else{
2707 assert( pBt->inTransaction==TRANS_WRITE );
2708 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2709 pBt->inStmt = 1;
drh0d65dc02002-02-03 00:56:09 +00002710 }
drhd677b3d2007-08-20 22:48:41 +00002711 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002712 return rc;
2713}
2714
2715
2716/*
drhab01f612004-05-22 02:55:23 +00002717** Commit the statment subtransaction currently in progress. If no
2718** subtransaction is active, this is a no-op.
drh663fc632002-02-02 18:49:19 +00002719*/
danielk1977aef0bf62005-12-30 16:28:01 +00002720int sqlite3BtreeCommitStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002721 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002722 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002723 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002724 pBt->db = p->db;
drh3aac2dd2004-04-26 14:10:20 +00002725 if( pBt->inStmt && !pBt->readOnly ){
danielk19773b8a05f2007-03-19 17:44:26 +00002726 rc = sqlite3PagerStmtCommit(pBt->pPager);
drh663fc632002-02-02 18:49:19 +00002727 }else{
2728 rc = SQLITE_OK;
2729 }
drh3aac2dd2004-04-26 14:10:20 +00002730 pBt->inStmt = 0;
drhd677b3d2007-08-20 22:48:41 +00002731 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002732 return rc;
2733}
2734
2735/*
drhab01f612004-05-22 02:55:23 +00002736** Rollback the active statement subtransaction. If no subtransaction
2737** is active this routine is a no-op.
drh663fc632002-02-02 18:49:19 +00002738**
drhab01f612004-05-22 02:55:23 +00002739** All cursors will be invalidated by this operation. Any attempt
drh663fc632002-02-02 18:49:19 +00002740** to use a cursor that was open at the beginning of this operation
2741** will result in an error.
2742*/
danielk1977aef0bf62005-12-30 16:28:01 +00002743int sqlite3BtreeRollbackStmt(Btree *p){
danielk197797a227c2006-01-20 16:32:04 +00002744 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002745 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002746 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002747 pBt->db = p->db;
danielk197797a227c2006-01-20 16:32:04 +00002748 if( pBt->inStmt && !pBt->readOnly ){
danielk19773b8a05f2007-03-19 17:44:26 +00002749 rc = sqlite3PagerStmtRollback(pBt->pPager);
danielk197797a227c2006-01-20 16:32:04 +00002750 pBt->inStmt = 0;
2751 }
drhd677b3d2007-08-20 22:48:41 +00002752 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002753 return rc;
2754}
2755
2756/*
drh8b2f49b2001-06-08 00:21:52 +00002757** Create a new cursor for the BTree whose root is on the page
2758** iTable. The act of acquiring a cursor gets a read lock on
2759** the database file.
drh1bee3d72001-10-15 00:44:35 +00002760**
2761** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00002762** If wrFlag==1, then the cursor can be used for reading or for
2763** writing if other conditions for writing are also met. These
2764** are the conditions that must be met in order for writing to
2765** be allowed:
drh6446c4d2001-12-15 14:22:18 +00002766**
drhf74b8d92002-09-01 23:20:45 +00002767** 1: The cursor must have been opened with wrFlag==1
2768**
drhfe5d71d2007-03-19 11:54:10 +00002769** 2: Other database connections that share the same pager cache
2770** but which are not in the READ_UNCOMMITTED state may not have
2771** cursors open with wrFlag==0 on the same table. Otherwise
2772** the changes made by this write cursor would be visible to
2773** the read cursors in the other database connection.
drhf74b8d92002-09-01 23:20:45 +00002774**
2775** 3: The database must be writable (not on read-only media)
2776**
2777** 4: There must be an active transaction.
2778**
drh6446c4d2001-12-15 14:22:18 +00002779** No checking is done to make sure that page iTable really is the
2780** root page of a b-tree. If it is not, then the cursor acquired
2781** will not work correctly.
danielk197771d5d2c2008-09-29 11:49:47 +00002782**
2783** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
2784** pointed to by pCur have been zeroed by the caller.
drha059ad02001-04-17 20:09:11 +00002785*/
drhd677b3d2007-08-20 22:48:41 +00002786static int btreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002787 Btree *p, /* The btree */
2788 int iTable, /* Root page of table to open */
2789 int wrFlag, /* 1 to write. 0 read-only */
2790 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2791 BtCursor *pCur /* Space for new cursor */
drh3aac2dd2004-04-26 14:10:20 +00002792){
drha059ad02001-04-17 20:09:11 +00002793 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002794 BtShared *pBt = p->pBt;
drhecdc7532001-09-23 02:35:53 +00002795
drh1fee73e2007-08-29 04:00:57 +00002796 assert( sqlite3BtreeHoldsMutex(p) );
drh8dcd7ca2004-08-08 19:43:29 +00002797 if( wrFlag ){
drh8dcd7ca2004-08-08 19:43:29 +00002798 if( pBt->readOnly ){
2799 return SQLITE_READONLY;
2800 }
danielk19773588ceb2008-06-10 17:30:26 +00002801 if( checkReadLocks(p, iTable, 0, 0) ){
drh8dcd7ca2004-08-08 19:43:29 +00002802 return SQLITE_LOCKED;
2803 }
drha0c9a112004-03-10 13:42:37 +00002804 }
danielk1977aef0bf62005-12-30 16:28:01 +00002805
drh4b70f112004-05-02 21:12:19 +00002806 if( pBt->pPage1==0 ){
danielk1977aef0bf62005-12-30 16:28:01 +00002807 rc = lockBtreeWithRetry(p);
drha059ad02001-04-17 20:09:11 +00002808 if( rc!=SQLITE_OK ){
drha059ad02001-04-17 20:09:11 +00002809 return rc;
2810 }
drh1831f182007-04-24 17:35:59 +00002811 if( pBt->readOnly && wrFlag ){
2812 return SQLITE_READONLY;
2813 }
drha059ad02001-04-17 20:09:11 +00002814 }
drh8b2f49b2001-06-08 00:21:52 +00002815 pCur->pgnoRoot = (Pgno)iTable;
danielk1977ad0132d2008-06-07 08:58:22 +00002816 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
drh24cd67e2004-05-10 16:18:47 +00002817 rc = SQLITE_EMPTY;
2818 goto create_cursor_exception;
2819 }
danielk197771d5d2c2008-09-29 11:49:47 +00002820 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
drhbd03cae2001-06-02 02:40:57 +00002821 if( rc!=SQLITE_OK ){
2822 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00002823 }
danielk1977aef0bf62005-12-30 16:28:01 +00002824
danielk1977aef0bf62005-12-30 16:28:01 +00002825 /* Now that no other errors can occur, finish filling in the BtCursor
2826 ** variables, link the cursor into the BtShared list and set *ppCur (the
2827 ** output argument to this function).
2828 */
drh1e968a02008-03-25 00:22:21 +00002829 pCur->pKeyInfo = pKeyInfo;
danielk1977aef0bf62005-12-30 16:28:01 +00002830 pCur->pBtree = p;
drhd0679ed2007-08-28 22:24:34 +00002831 pCur->pBt = pBt;
drhecdc7532001-09-23 02:35:53 +00002832 pCur->wrFlag = wrFlag;
drha059ad02001-04-17 20:09:11 +00002833 pCur->pNext = pBt->pCursor;
2834 if( pCur->pNext ){
2835 pCur->pNext->pPrev = pCur;
2836 }
2837 pBt->pCursor = pCur;
danielk1977da184232006-01-05 11:34:32 +00002838 pCur->eState = CURSOR_INVALID;
drhbd03cae2001-06-02 02:40:57 +00002839
danielk1977aef0bf62005-12-30 16:28:01 +00002840 return SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00002841
drhbd03cae2001-06-02 02:40:57 +00002842create_cursor_exception:
danielk197771d5d2c2008-09-29 11:49:47 +00002843 releasePage(pCur->apPage[0]);
drh5e00f6c2001-09-13 13:46:56 +00002844 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00002845 return rc;
drha059ad02001-04-17 20:09:11 +00002846}
drhd677b3d2007-08-20 22:48:41 +00002847int sqlite3BtreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002848 Btree *p, /* The btree */
2849 int iTable, /* Root page of table to open */
2850 int wrFlag, /* 1 to write. 0 read-only */
2851 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2852 BtCursor *pCur /* Write new cursor here */
drhd677b3d2007-08-20 22:48:41 +00002853){
2854 int rc;
2855 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002856 p->pBt->db = p->db;
danielk1977cd3e8f72008-03-25 09:47:35 +00002857 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
drhd677b3d2007-08-20 22:48:41 +00002858 sqlite3BtreeLeave(p);
2859 return rc;
2860}
danielk1977cd3e8f72008-03-25 09:47:35 +00002861int sqlite3BtreeCursorSize(){
2862 return sizeof(BtCursor);
2863}
2864
drhd677b3d2007-08-20 22:48:41 +00002865
drha059ad02001-04-17 20:09:11 +00002866
2867/*
drh5e00f6c2001-09-13 13:46:56 +00002868** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00002869** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00002870*/
drh3aac2dd2004-04-26 14:10:20 +00002871int sqlite3BtreeCloseCursor(BtCursor *pCur){
drhff0587c2007-08-29 17:43:19 +00002872 Btree *pBtree = pCur->pBtree;
danielk1977cd3e8f72008-03-25 09:47:35 +00002873 if( pBtree ){
danielk197771d5d2c2008-09-29 11:49:47 +00002874 int i;
danielk1977cd3e8f72008-03-25 09:47:35 +00002875 BtShared *pBt = pCur->pBt;
2876 sqlite3BtreeEnter(pBtree);
2877 pBt->db = pBtree->db;
danielk1977be51a652008-10-08 17:58:48 +00002878 sqlite3BtreeClearCursor(pCur);
danielk1977cd3e8f72008-03-25 09:47:35 +00002879 if( pCur->pPrev ){
2880 pCur->pPrev->pNext = pCur->pNext;
2881 }else{
2882 pBt->pCursor = pCur->pNext;
2883 }
2884 if( pCur->pNext ){
2885 pCur->pNext->pPrev = pCur->pPrev;
2886 }
danielk197771d5d2c2008-09-29 11:49:47 +00002887 for(i=0; i<=pCur->iPage; i++){
2888 releasePage(pCur->apPage[i]);
2889 }
danielk1977cd3e8f72008-03-25 09:47:35 +00002890 unlockBtreeIfUnused(pBt);
2891 invalidateOverflowCache(pCur);
2892 /* sqlite3_free(pCur); */
2893 sqlite3BtreeLeave(pBtree);
drha059ad02001-04-17 20:09:11 +00002894 }
drh8c42ca92001-06-22 19:15:00 +00002895 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002896}
2897
drh7e3b0a02001-04-28 16:52:40 +00002898/*
drh5e2f8b92001-05-28 00:41:15 +00002899** Make a temporary cursor by filling in the fields of pTempCur.
2900** The temporary cursor is not on the cursor list for the Btree.
2901*/
drh16a9b832007-05-05 18:39:25 +00002902void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002903 int i;
drh1fee73e2007-08-29 04:00:57 +00002904 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002905 memcpy(pTempCur, pCur, sizeof(BtCursor));
drh5e2f8b92001-05-28 00:41:15 +00002906 pTempCur->pNext = 0;
2907 pTempCur->pPrev = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00002908 for(i=0; i<=pTempCur->iPage; i++){
2909 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002910 }
drh5e2f8b92001-05-28 00:41:15 +00002911}
2912
2913/*
drhbd03cae2001-06-02 02:40:57 +00002914** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00002915** function above.
2916*/
drh16a9b832007-05-05 18:39:25 +00002917void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002918 int i;
drh1fee73e2007-08-29 04:00:57 +00002919 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002920 for(i=0; i<=pCur->iPage; i++){
2921 sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002922 }
drh5e2f8b92001-05-28 00:41:15 +00002923}
2924
2925/*
drh86057612007-06-26 01:04:48 +00002926** Make sure the BtCursor* given in the argument has a valid
2927** BtCursor.info structure. If it is not already valid, call
danielk19771cc5ed82007-05-16 17:28:43 +00002928** sqlite3BtreeParseCell() to fill it in.
drhab01f612004-05-22 02:55:23 +00002929**
2930** BtCursor.info is a cache of the information in the current cell.
drh16a9b832007-05-05 18:39:25 +00002931** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
drh86057612007-06-26 01:04:48 +00002932**
2933** 2007-06-25: There is a bug in some versions of MSVC that cause the
2934** compiler to crash when getCellInfo() is implemented as a macro.
2935** But there is a measureable speed advantage to using the macro on gcc
2936** (when less compiler optimizations like -Os or -O0 are used and the
2937** compiler is not doing agressive inlining.) So we use a real function
2938** for MSVC and a macro for everything else. Ticket #2457.
drh9188b382004-05-14 21:12:22 +00002939*/
drh9188b382004-05-14 21:12:22 +00002940#ifndef NDEBUG
danielk19771cc5ed82007-05-16 17:28:43 +00002941 static void assertCellInfo(BtCursor *pCur){
drh9188b382004-05-14 21:12:22 +00002942 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00002943 int iPage = pCur->iPage;
drh51c6d962004-06-06 00:42:25 +00002944 memset(&info, 0, sizeof(info));
danielk197771d5d2c2008-09-29 11:49:47 +00002945 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
drh9188b382004-05-14 21:12:22 +00002946 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
drh9188b382004-05-14 21:12:22 +00002947 }
danielk19771cc5ed82007-05-16 17:28:43 +00002948#else
2949 #define assertCellInfo(x)
2950#endif
drh86057612007-06-26 01:04:48 +00002951#ifdef _MSC_VER
2952 /* Use a real function in MSVC to work around bugs in that compiler. */
2953 static void getCellInfo(BtCursor *pCur){
2954 if( pCur->info.nSize==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00002955 int iPage = pCur->iPage;
2956 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
drha2c20e42008-03-29 16:01:04 +00002957 pCur->validNKey = 1;
drh86057612007-06-26 01:04:48 +00002958 }else{
2959 assertCellInfo(pCur);
2960 }
2961 }
2962#else /* if not _MSC_VER */
2963 /* Use a macro in all other compilers so that the function is inlined */
danielk197771d5d2c2008-09-29 11:49:47 +00002964#define getCellInfo(pCur) \
2965 if( pCur->info.nSize==0 ){ \
2966 int iPage = pCur->iPage; \
2967 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
2968 pCur->validNKey = 1; \
2969 }else{ \
2970 assertCellInfo(pCur); \
drh86057612007-06-26 01:04:48 +00002971 }
2972#endif /* _MSC_VER */
drh9188b382004-05-14 21:12:22 +00002973
2974/*
drh3aac2dd2004-04-26 14:10:20 +00002975** Set *pSize to the size of the buffer needed to hold the value of
2976** the key for the current entry. If the cursor is not pointing
2977** to a valid entry, *pSize is set to 0.
2978**
drh4b70f112004-05-02 21:12:19 +00002979** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00002980** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00002981*/
drh4a1c3802004-05-12 15:15:47 +00002982int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
drhd677b3d2007-08-20 22:48:41 +00002983 int rc;
2984
drh1fee73e2007-08-29 04:00:57 +00002985 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00002986 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00002987 if( rc==SQLITE_OK ){
2988 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2989 if( pCur->eState==CURSOR_INVALID ){
2990 *pSize = 0;
2991 }else{
drh86057612007-06-26 01:04:48 +00002992 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00002993 *pSize = pCur->info.nKey;
2994 }
drh72f82862001-05-24 21:06:34 +00002995 }
danielk1977da184232006-01-05 11:34:32 +00002996 return rc;
drha059ad02001-04-17 20:09:11 +00002997}
drh2af926b2001-05-15 00:39:25 +00002998
drh72f82862001-05-24 21:06:34 +00002999/*
drh0e1c19e2004-05-11 00:58:56 +00003000** Set *pSize to the number of bytes of data in the entry the
3001** cursor currently points to. Always return SQLITE_OK.
3002** Failure is not possible. If the cursor is not currently
3003** pointing to an entry (which can happen, for example, if
3004** the database is empty) then *pSize is set to 0.
3005*/
3006int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drhd677b3d2007-08-20 22:48:41 +00003007 int rc;
3008
drh1fee73e2007-08-29 04:00:57 +00003009 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003010 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003011 if( rc==SQLITE_OK ){
3012 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3013 if( pCur->eState==CURSOR_INVALID ){
3014 /* Not pointing at a valid entry - set *pSize to 0. */
3015 *pSize = 0;
3016 }else{
drh86057612007-06-26 01:04:48 +00003017 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00003018 *pSize = pCur->info.nData;
3019 }
drh0e1c19e2004-05-11 00:58:56 +00003020 }
danielk1977da184232006-01-05 11:34:32 +00003021 return rc;
drh0e1c19e2004-05-11 00:58:56 +00003022}
3023
3024/*
danielk1977d04417962007-05-02 13:16:30 +00003025** Given the page number of an overflow page in the database (parameter
3026** ovfl), this function finds the page number of the next page in the
3027** linked list of overflow pages. If possible, it uses the auto-vacuum
3028** pointer-map data instead of reading the content of page ovfl to do so.
3029**
3030** If an error occurs an SQLite error code is returned. Otherwise:
3031**
3032** Unless pPgnoNext is NULL, the page number of the next overflow
3033** page in the linked list is written to *pPgnoNext. If page ovfl
drh85b623f2007-12-13 21:54:09 +00003034** is the last page in its linked list, *pPgnoNext is set to zero.
danielk1977d04417962007-05-02 13:16:30 +00003035**
3036** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3037** for page ovfl. The underlying pager page may have been requested
3038** with the noContent flag set, so the page data accessable via
3039** this handle may not be trusted.
3040*/
3041static int getOverflowPage(
3042 BtShared *pBt,
3043 Pgno ovfl, /* Overflow page */
3044 MemPage **ppPage, /* OUT: MemPage handle */
3045 Pgno *pPgnoNext /* OUT: Next overflow page number */
3046){
3047 Pgno next = 0;
3048 int rc;
3049
drh1fee73e2007-08-29 04:00:57 +00003050 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977d04417962007-05-02 13:16:30 +00003051 /* One of these must not be NULL. Otherwise, why call this function? */
3052 assert(ppPage || pPgnoNext);
3053
3054 /* If pPgnoNext is NULL, then this function is being called to obtain
3055 ** a MemPage* reference only. No page-data is required in this case.
3056 */
3057 if( !pPgnoNext ){
drh16a9b832007-05-05 18:39:25 +00003058 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
danielk1977d04417962007-05-02 13:16:30 +00003059 }
3060
3061#ifndef SQLITE_OMIT_AUTOVACUUM
3062 /* Try to find the next page in the overflow list using the
3063 ** autovacuum pointer-map pages. Guess that the next page in
3064 ** the overflow list is page number (ovfl+1). If that guess turns
3065 ** out to be wrong, fall back to loading the data of page
3066 ** number ovfl to determine the next page number.
3067 */
3068 if( pBt->autoVacuum ){
3069 Pgno pgno;
3070 Pgno iGuess = ovfl+1;
3071 u8 eType;
3072
3073 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3074 iGuess++;
3075 }
3076
danielk1977ad0132d2008-06-07 08:58:22 +00003077 if( iGuess<=pagerPagecount(pBt->pPager) ){
danielk1977d04417962007-05-02 13:16:30 +00003078 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3079 if( rc!=SQLITE_OK ){
3080 return rc;
3081 }
3082 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3083 next = iGuess;
3084 }
3085 }
3086 }
3087#endif
3088
3089 if( next==0 || ppPage ){
3090 MemPage *pPage = 0;
3091
drh16a9b832007-05-05 18:39:25 +00003092 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
danielk1977d04417962007-05-02 13:16:30 +00003093 assert(rc==SQLITE_OK || pPage==0);
3094 if( next==0 && rc==SQLITE_OK ){
3095 next = get4byte(pPage->aData);
3096 }
3097
3098 if( ppPage ){
3099 *ppPage = pPage;
3100 }else{
3101 releasePage(pPage);
3102 }
3103 }
3104 *pPgnoNext = next;
3105
3106 return rc;
3107}
3108
danielk1977da107192007-05-04 08:32:13 +00003109/*
3110** Copy data from a buffer to a page, or from a page to a buffer.
3111**
3112** pPayload is a pointer to data stored on database page pDbPage.
3113** If argument eOp is false, then nByte bytes of data are copied
3114** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3115** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3116** of data are copied from the buffer pBuf to pPayload.
3117**
3118** SQLITE_OK is returned on success, otherwise an error code.
3119*/
3120static int copyPayload(
3121 void *pPayload, /* Pointer to page data */
3122 void *pBuf, /* Pointer to buffer */
3123 int nByte, /* Number of bytes to copy */
3124 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3125 DbPage *pDbPage /* Page containing pPayload */
3126){
3127 if( eOp ){
3128 /* Copy data from buffer to page (a write operation) */
3129 int rc = sqlite3PagerWrite(pDbPage);
3130 if( rc!=SQLITE_OK ){
3131 return rc;
3132 }
3133 memcpy(pPayload, pBuf, nByte);
3134 }else{
3135 /* Copy data from page to buffer (a read operation) */
3136 memcpy(pBuf, pPayload, nByte);
3137 }
3138 return SQLITE_OK;
3139}
danielk1977d04417962007-05-02 13:16:30 +00003140
3141/*
danielk19779f8d6402007-05-02 17:48:45 +00003142** This function is used to read or overwrite payload information
3143** for the entry that the pCur cursor is pointing to. If the eOp
3144** parameter is 0, this is a read operation (data copied into
3145** buffer pBuf). If it is non-zero, a write (data copied from
3146** buffer pBuf).
3147**
3148** A total of "amt" bytes are read or written beginning at "offset".
3149** Data is read to or from the buffer pBuf.
drh72f82862001-05-24 21:06:34 +00003150**
3151** This routine does not make a distinction between key and data.
danielk19779f8d6402007-05-02 17:48:45 +00003152** It just reads or writes bytes from the payload area. Data might
3153** appear on the main page or be scattered out on multiple overflow
3154** pages.
danielk1977da107192007-05-04 08:32:13 +00003155**
danielk1977dcbb5d32007-05-04 18:36:44 +00003156** If the BtCursor.isIncrblobHandle flag is set, and the current
danielk1977da107192007-05-04 08:32:13 +00003157** cursor entry uses one or more overflow pages, this function
3158** allocates space for and lazily popluates the overflow page-list
3159** cache array (BtCursor.aOverflow). Subsequent calls use this
3160** cache to make seeking to the supplied offset more efficient.
3161**
3162** Once an overflow page-list cache has been allocated, it may be
3163** invalidated if some other cursor writes to the same table, or if
3164** the cursor is moved to a different row. Additionally, in auto-vacuum
3165** mode, the following events may invalidate an overflow page-list cache.
3166**
3167** * An incremental vacuum,
3168** * A commit in auto_vacuum="full" mode,
3169** * Creating a table (may require moving an overflow page).
drh72f82862001-05-24 21:06:34 +00003170*/
danielk19779f8d6402007-05-02 17:48:45 +00003171static int accessPayload(
drh3aac2dd2004-04-26 14:10:20 +00003172 BtCursor *pCur, /* Cursor pointing to entry to read from */
3173 int offset, /* Begin reading this far into payload */
3174 int amt, /* Read this many bytes */
3175 unsigned char *pBuf, /* Write the bytes into this buffer */
danielk19779f8d6402007-05-02 17:48:45 +00003176 int skipKey, /* offset begins at data if this is true */
3177 int eOp /* zero to read. non-zero to write. */
drh3aac2dd2004-04-26 14:10:20 +00003178){
3179 unsigned char *aPayload;
danielk1977da107192007-05-04 08:32:13 +00003180 int rc = SQLITE_OK;
drhfa1a98a2004-05-14 19:08:17 +00003181 u32 nKey;
danielk19772dec9702007-05-02 16:48:37 +00003182 int iIdx = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003183 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3184 BtShared *pBt; /* Btree this cursor belongs to */
drh3aac2dd2004-04-26 14:10:20 +00003185
danielk1977da107192007-05-04 08:32:13 +00003186 assert( pPage );
danielk1977da184232006-01-05 11:34:32 +00003187 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003188 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
danielk1977da107192007-05-04 08:32:13 +00003189 assert( offset>=0 );
drh1fee73e2007-08-29 04:00:57 +00003190 assert( cursorHoldsMutex(pCur) );
danielk1977da107192007-05-04 08:32:13 +00003191
drh86057612007-06-26 01:04:48 +00003192 getCellInfo(pCur);
drh366fda62006-01-13 02:35:09 +00003193 aPayload = pCur->info.pCell + pCur->info.nHeader;
danielk1977da107192007-05-04 08:32:13 +00003194 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3195
drh3aac2dd2004-04-26 14:10:20 +00003196 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003197 offset += nKey;
drh3aac2dd2004-04-26 14:10:20 +00003198 }
drhfa1a98a2004-05-14 19:08:17 +00003199 if( offset+amt > nKey+pCur->info.nData ){
danielk1977da107192007-05-04 08:32:13 +00003200 /* Trying to read or write past the end of the data is an error */
danielk197767fd7a92008-09-10 17:53:35 +00003201 return SQLITE_CORRUPT_BKPT;
drh3aac2dd2004-04-26 14:10:20 +00003202 }
danielk1977da107192007-05-04 08:32:13 +00003203
3204 /* Check if data must be read/written to/from the btree page itself. */
drhfa1a98a2004-05-14 19:08:17 +00003205 if( offset<pCur->info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00003206 int a = amt;
drhfa1a98a2004-05-14 19:08:17 +00003207 if( a+offset>pCur->info.nLocal ){
3208 a = pCur->info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00003209 }
danielk1977da107192007-05-04 08:32:13 +00003210 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
drh2aa679f2001-06-25 02:11:07 +00003211 offset = 0;
drha34b6762004-05-07 13:30:42 +00003212 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00003213 amt -= a;
drhdd793422001-06-28 01:54:48 +00003214 }else{
drhfa1a98a2004-05-14 19:08:17 +00003215 offset -= pCur->info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00003216 }
danielk1977da107192007-05-04 08:32:13 +00003217
drh51f015e2007-10-16 19:45:29 +00003218 pBt = pCur->pBt;
danielk1977da107192007-05-04 08:32:13 +00003219 if( rc==SQLITE_OK && amt>0 ){
3220 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3221 Pgno nextPage;
3222
drhfa1a98a2004-05-14 19:08:17 +00003223 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
danielk1977da107192007-05-04 08:32:13 +00003224
danielk19772dec9702007-05-02 16:48:37 +00003225#ifndef SQLITE_OMIT_INCRBLOB
danielk1977dcbb5d32007-05-04 18:36:44 +00003226 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
danielk1977da107192007-05-04 08:32:13 +00003227 ** has not been allocated, allocate it now. The array is sized at
3228 ** one entry for each overflow page in the overflow chain. The
3229 ** page number of the first overflow page is stored in aOverflow[0],
3230 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3231 ** (the cache is lazily populated).
3232 */
danielk1977dcbb5d32007-05-04 18:36:44 +00003233 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
danielk19772dec9702007-05-02 16:48:37 +00003234 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
drh17435752007-08-16 04:30:38 +00003235 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
danielk19772dec9702007-05-02 16:48:37 +00003236 if( nOvfl && !pCur->aOverflow ){
danielk1977da107192007-05-04 08:32:13 +00003237 rc = SQLITE_NOMEM;
danielk19772dec9702007-05-02 16:48:37 +00003238 }
3239 }
danielk1977da107192007-05-04 08:32:13 +00003240
3241 /* If the overflow page-list cache has been allocated and the
3242 ** entry for the first required overflow page is valid, skip
3243 ** directly to it.
3244 */
danielk19772dec9702007-05-02 16:48:37 +00003245 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3246 iIdx = (offset/ovflSize);
3247 nextPage = pCur->aOverflow[iIdx];
3248 offset = (offset%ovflSize);
3249 }
3250#endif
danielk1977da107192007-05-04 08:32:13 +00003251
3252 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3253
3254#ifndef SQLITE_OMIT_INCRBLOB
3255 /* If required, populate the overflow page-list cache. */
3256 if( pCur->aOverflow ){
3257 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3258 pCur->aOverflow[iIdx] = nextPage;
3259 }
3260#endif
3261
danielk1977d04417962007-05-02 13:16:30 +00003262 if( offset>=ovflSize ){
3263 /* The only reason to read this page is to obtain the page
danielk1977da107192007-05-04 08:32:13 +00003264 ** number for the next page in the overflow chain. The page
drhfd131da2007-08-07 17:13:03 +00003265 ** data is not required. So first try to lookup the overflow
3266 ** page-list cache, if any, then fall back to the getOverflowPage()
danielk1977da107192007-05-04 08:32:13 +00003267 ** function.
danielk1977d04417962007-05-02 13:16:30 +00003268 */
danielk19772dec9702007-05-02 16:48:37 +00003269#ifndef SQLITE_OMIT_INCRBLOB
danielk1977da107192007-05-04 08:32:13 +00003270 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3271 nextPage = pCur->aOverflow[iIdx+1];
3272 } else
danielk19772dec9702007-05-02 16:48:37 +00003273#endif
danielk1977da107192007-05-04 08:32:13 +00003274 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
danielk1977da107192007-05-04 08:32:13 +00003275 offset -= ovflSize;
danielk1977d04417962007-05-02 13:16:30 +00003276 }else{
danielk19779f8d6402007-05-02 17:48:45 +00003277 /* Need to read this page properly. It contains some of the
3278 ** range of data that is being read (eOp==0) or written (eOp!=0).
danielk1977d04417962007-05-02 13:16:30 +00003279 */
3280 DbPage *pDbPage;
danielk1977cfe9a692004-06-16 12:00:29 +00003281 int a = amt;
danielk1977d04417962007-05-02 13:16:30 +00003282 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
danielk1977da107192007-05-04 08:32:13 +00003283 if( rc==SQLITE_OK ){
3284 aPayload = sqlite3PagerGetData(pDbPage);
3285 nextPage = get4byte(aPayload);
3286 if( a + offset > ovflSize ){
3287 a = ovflSize - offset;
danielk19779f8d6402007-05-02 17:48:45 +00003288 }
danielk1977da107192007-05-04 08:32:13 +00003289 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3290 sqlite3PagerUnref(pDbPage);
3291 offset = 0;
3292 amt -= a;
3293 pBuf += a;
danielk19779f8d6402007-05-02 17:48:45 +00003294 }
danielk1977cfe9a692004-06-16 12:00:29 +00003295 }
drh2af926b2001-05-15 00:39:25 +00003296 }
drh2af926b2001-05-15 00:39:25 +00003297 }
danielk1977cfe9a692004-06-16 12:00:29 +00003298
danielk1977da107192007-05-04 08:32:13 +00003299 if( rc==SQLITE_OK && amt>0 ){
drh49285702005-09-17 15:20:26 +00003300 return SQLITE_CORRUPT_BKPT;
drha7fcb052001-12-14 15:09:55 +00003301 }
danielk1977da107192007-05-04 08:32:13 +00003302 return rc;
drh2af926b2001-05-15 00:39:25 +00003303}
3304
drh72f82862001-05-24 21:06:34 +00003305/*
drh3aac2dd2004-04-26 14:10:20 +00003306** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003307** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003308** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00003309**
drh3aac2dd2004-04-26 14:10:20 +00003310** Return SQLITE_OK on success or an error code if anything goes
3311** wrong. An error is returned if "offset+amt" is larger than
3312** the available payload.
drh72f82862001-05-24 21:06:34 +00003313*/
drha34b6762004-05-07 13:30:42 +00003314int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003315 int rc;
3316
drh1fee73e2007-08-29 04:00:57 +00003317 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003318 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003319 if( rc==SQLITE_OK ){
3320 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003321 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3322 if( pCur->apPage[0]->intKey ){
danielk1977da184232006-01-05 11:34:32 +00003323 return SQLITE_CORRUPT_BKPT;
3324 }
danielk197771d5d2c2008-09-29 11:49:47 +00003325 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003326 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
drh6575a222005-03-10 17:06:34 +00003327 }
danielk1977da184232006-01-05 11:34:32 +00003328 return rc;
drh3aac2dd2004-04-26 14:10:20 +00003329}
3330
3331/*
drh3aac2dd2004-04-26 14:10:20 +00003332** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003333** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003334** begins at "offset".
3335**
3336** Return SQLITE_OK on success or an error code if anything goes
3337** wrong. An error is returned if "offset+amt" is larger than
3338** the available payload.
drh72f82862001-05-24 21:06:34 +00003339*/
drh3aac2dd2004-04-26 14:10:20 +00003340int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003341 int rc;
3342
danielk19773588ceb2008-06-10 17:30:26 +00003343#ifndef SQLITE_OMIT_INCRBLOB
3344 if ( pCur->eState==CURSOR_INVALID ){
3345 return SQLITE_ABORT;
3346 }
3347#endif
3348
drh1fee73e2007-08-29 04:00:57 +00003349 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003350 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003351 if( rc==SQLITE_OK ){
3352 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003353 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3354 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003355 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
danielk1977da184232006-01-05 11:34:32 +00003356 }
3357 return rc;
drh2af926b2001-05-15 00:39:25 +00003358}
3359
drh72f82862001-05-24 21:06:34 +00003360/*
drh0e1c19e2004-05-11 00:58:56 +00003361** Return a pointer to payload information from the entry that the
3362** pCur cursor is pointing to. The pointer is to the beginning of
3363** the key if skipKey==0 and it points to the beginning of data if
drhe51c44f2004-05-30 20:46:09 +00003364** skipKey==1. The number of bytes of available key/data is written
3365** into *pAmt. If *pAmt==0, then the value returned will not be
3366** a valid pointer.
drh0e1c19e2004-05-11 00:58:56 +00003367**
3368** This routine is an optimization. It is common for the entire key
3369** and data to fit on the local page and for there to be no overflow
3370** pages. When that is so, this routine can be used to access the
3371** key and data without making a copy. If the key and/or data spills
drh16a9b832007-05-05 18:39:25 +00003372** onto overflow pages, then accessPayload() must be used to reassembly
drh0e1c19e2004-05-11 00:58:56 +00003373** the key/data and copy it into a preallocated buffer.
3374**
3375** The pointer returned by this routine looks directly into the cached
3376** page of the database. The data might change or move the next time
3377** any btree routine is called.
3378*/
3379static const unsigned char *fetchPayload(
3380 BtCursor *pCur, /* Cursor pointing to entry to read from */
drhe51c44f2004-05-30 20:46:09 +00003381 int *pAmt, /* Write the number of available bytes here */
drh0e1c19e2004-05-11 00:58:56 +00003382 int skipKey /* read beginning at data if this is true */
3383){
3384 unsigned char *aPayload;
3385 MemPage *pPage;
drhfa1a98a2004-05-14 19:08:17 +00003386 u32 nKey;
3387 int nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003388
danielk197771d5d2c2008-09-29 11:49:47 +00003389 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
danielk1977da184232006-01-05 11:34:32 +00003390 assert( pCur->eState==CURSOR_VALID );
drh1fee73e2007-08-29 04:00:57 +00003391 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00003392 pPage = pCur->apPage[pCur->iPage];
3393 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drh86057612007-06-26 01:04:48 +00003394 getCellInfo(pCur);
drh43605152004-05-29 21:46:49 +00003395 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00003396 aPayload += pCur->info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00003397 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00003398 nKey = 0;
3399 }else{
3400 nKey = pCur->info.nKey;
drh0e1c19e2004-05-11 00:58:56 +00003401 }
drh0e1c19e2004-05-11 00:58:56 +00003402 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003403 aPayload += nKey;
3404 nLocal = pCur->info.nLocal - nKey;
drh0e1c19e2004-05-11 00:58:56 +00003405 }else{
drhfa1a98a2004-05-14 19:08:17 +00003406 nLocal = pCur->info.nLocal;
drhe51c44f2004-05-30 20:46:09 +00003407 if( nLocal>nKey ){
3408 nLocal = nKey;
3409 }
drh0e1c19e2004-05-11 00:58:56 +00003410 }
drhe51c44f2004-05-30 20:46:09 +00003411 *pAmt = nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003412 return aPayload;
3413}
3414
3415
3416/*
drhe51c44f2004-05-30 20:46:09 +00003417** For the entry that cursor pCur is point to, return as
3418** many bytes of the key or data as are available on the local
3419** b-tree page. Write the number of available bytes into *pAmt.
drh0e1c19e2004-05-11 00:58:56 +00003420**
3421** The pointer returned is ephemeral. The key/data may move
drhd677b3d2007-08-20 22:48:41 +00003422** or be destroyed on the next call to any Btree routine,
3423** including calls from other threads against the same cache.
3424** Hence, a mutex on the BtShared should be held prior to calling
3425** this routine.
drh0e1c19e2004-05-11 00:58:56 +00003426**
3427** These routines is used to get quick access to key and data
3428** in the common case where no overflow pages are used.
drh0e1c19e2004-05-11 00:58:56 +00003429*/
drhe51c44f2004-05-30 20:46:09 +00003430const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003431 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003432 if( pCur->eState==CURSOR_VALID ){
3433 return (const void*)fetchPayload(pCur, pAmt, 0);
3434 }
3435 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003436}
drhe51c44f2004-05-30 20:46:09 +00003437const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003438 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003439 if( pCur->eState==CURSOR_VALID ){
3440 return (const void*)fetchPayload(pCur, pAmt, 1);
3441 }
3442 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003443}
3444
3445
3446/*
drh8178a752003-01-05 21:41:40 +00003447** Move the cursor down to a new child page. The newPgno argument is the
drhab01f612004-05-22 02:55:23 +00003448** page number of the child page to move to.
drh72f82862001-05-24 21:06:34 +00003449*/
drh3aac2dd2004-04-26 14:10:20 +00003450static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00003451 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003452 int i = pCur->iPage;
drh72f82862001-05-24 21:06:34 +00003453 MemPage *pNewPage;
drhd0679ed2007-08-28 22:24:34 +00003454 BtShared *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00003455
drh1fee73e2007-08-29 04:00:57 +00003456 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003457 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003458 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3459 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3460 return SQLITE_CORRUPT_BKPT;
3461 }
3462 rc = getAndInitPage(pBt, newPgno, &pNewPage);
drh6019e162001-07-02 17:51:45 +00003463 if( rc ) return rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003464 pCur->apPage[i+1] = pNewPage;
3465 pCur->aiIdx[i+1] = 0;
3466 pCur->iPage++;
3467
drh271efa52004-05-30 19:19:05 +00003468 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003469 pCur->validNKey = 0;
drh4be295b2003-12-16 03:44:47 +00003470 if( pNewPage->nCell<1 ){
drh49285702005-09-17 15:20:26 +00003471 return SQLITE_CORRUPT_BKPT;
drh4be295b2003-12-16 03:44:47 +00003472 }
drh72f82862001-05-24 21:06:34 +00003473 return SQLITE_OK;
3474}
3475
danielk1977bf93c562008-09-29 15:53:25 +00003476#ifndef NDEBUG
3477/*
3478** Page pParent is an internal (non-leaf) tree page. This function
3479** asserts that page number iChild is the left-child if the iIdx'th
3480** cell in page pParent. Or, if iIdx is equal to the total number of
3481** cells in pParent, that page number iChild is the right-child of
3482** the page.
3483*/
3484static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3485 assert( iIdx<=pParent->nCell );
3486 if( iIdx==pParent->nCell ){
3487 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3488 }else{
3489 assert( get4byte(findCell(pParent, iIdx))==iChild );
3490 }
3491}
3492#else
3493# define assertParentIndex(x,y,z)
3494#endif
3495
drh72f82862001-05-24 21:06:34 +00003496/*
drh5e2f8b92001-05-28 00:41:15 +00003497** Move the cursor up to the parent page.
3498**
3499** pCur->idx is set to the cell index that contains the pointer
3500** to the page we are coming from. If we are coming from the
3501** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00003502** the largest cell index.
drh72f82862001-05-24 21:06:34 +00003503*/
drh16a9b832007-05-05 18:39:25 +00003504void sqlite3BtreeMoveToParent(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00003505 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003506 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003507 assert( pCur->iPage>0 );
3508 assert( pCur->apPage[pCur->iPage] );
danielk1977bf93c562008-09-29 15:53:25 +00003509 assertParentIndex(
3510 pCur->apPage[pCur->iPage-1],
3511 pCur->aiIdx[pCur->iPage-1],
3512 pCur->apPage[pCur->iPage]->pgno
3513 );
danielk197771d5d2c2008-09-29 11:49:47 +00003514 releasePage(pCur->apPage[pCur->iPage]);
3515 pCur->iPage--;
drh271efa52004-05-30 19:19:05 +00003516 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003517 pCur->validNKey = 0;
drh72f82862001-05-24 21:06:34 +00003518}
3519
3520/*
3521** Move the cursor to the root page
3522*/
drh5e2f8b92001-05-28 00:41:15 +00003523static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00003524 MemPage *pRoot;
drh777e4c42006-01-13 04:31:58 +00003525 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00003526 Btree *p = pCur->pBtree;
3527 BtShared *pBt = p->pBt;
drhbd03cae2001-06-02 02:40:57 +00003528
drh1fee73e2007-08-29 04:00:57 +00003529 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +00003530 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3531 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3532 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3533 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3534 if( pCur->eState==CURSOR_FAULT ){
3535 return pCur->skip;
3536 }
danielk1977be51a652008-10-08 17:58:48 +00003537 sqlite3BtreeClearCursor(pCur);
drhbf700f32007-03-31 02:36:44 +00003538 }
danielk197771d5d2c2008-09-29 11:49:47 +00003539
3540 if( pCur->iPage>=0 ){
3541 int i;
3542 for(i=1; i<=pCur->iPage; i++){
3543 releasePage(pCur->apPage[i]);
danielk1977d9f6c532008-09-19 16:39:38 +00003544 }
drh777e4c42006-01-13 04:31:58 +00003545 }else{
3546 if(
danielk197771d5d2c2008-09-29 11:49:47 +00003547 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
drh777e4c42006-01-13 04:31:58 +00003548 ){
3549 pCur->eState = CURSOR_INVALID;
3550 return rc;
3551 }
drhc39e0002004-05-07 23:50:57 +00003552 }
danielk197771d5d2c2008-09-29 11:49:47 +00003553
3554 pRoot = pCur->apPage[0];
3555 assert( pRoot->pgno==pCur->pgnoRoot );
3556 pCur->iPage = 0;
3557 pCur->aiIdx[0] = 0;
drh271efa52004-05-30 19:19:05 +00003558 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003559 pCur->atLast = 0;
3560 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003561
drh8856d6a2004-04-29 14:42:46 +00003562 if( pRoot->nCell==0 && !pRoot->leaf ){
3563 Pgno subpage;
3564 assert( pRoot->pgno==1 );
drh43605152004-05-29 21:46:49 +00003565 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
drh8856d6a2004-04-29 14:42:46 +00003566 assert( subpage>0 );
danielk1977da184232006-01-05 11:34:32 +00003567 pCur->eState = CURSOR_VALID;
drh4b70f112004-05-02 21:12:19 +00003568 rc = moveToChild(pCur, subpage);
danielk197771d5d2c2008-09-29 11:49:47 +00003569 }else{
3570 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
drh8856d6a2004-04-29 14:42:46 +00003571 }
3572 return rc;
drh72f82862001-05-24 21:06:34 +00003573}
drh2af926b2001-05-15 00:39:25 +00003574
drh5e2f8b92001-05-28 00:41:15 +00003575/*
3576** Move the cursor down to the left-most leaf entry beneath the
3577** entry to which it is currently pointing.
drh777e4c42006-01-13 04:31:58 +00003578**
3579** The left-most leaf is the one with the smallest key - the first
3580** in ascending order.
drh5e2f8b92001-05-28 00:41:15 +00003581*/
3582static int moveToLeftmost(BtCursor *pCur){
3583 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003584 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00003585 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00003586
drh1fee73e2007-08-29 04:00:57 +00003587 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003588 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003589 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3590 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3591 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
drh8178a752003-01-05 21:41:40 +00003592 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00003593 }
drhd677b3d2007-08-20 22:48:41 +00003594 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003595}
3596
drh2dcc9aa2002-12-04 13:40:25 +00003597/*
3598** Move the cursor down to the right-most leaf entry beneath the
3599** page to which it is currently pointing. Notice the difference
3600** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3601** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3602** finds the right-most entry beneath the *page*.
drh777e4c42006-01-13 04:31:58 +00003603**
3604** The right-most entry is the one with the largest key - the last
3605** key in ascending order.
drh2dcc9aa2002-12-04 13:40:25 +00003606*/
3607static int moveToRightmost(BtCursor *pCur){
3608 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003609 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00003610 MemPage *pPage;
drh2dcc9aa2002-12-04 13:40:25 +00003611
drh1fee73e2007-08-29 04:00:57 +00003612 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003613 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003614 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
drh43605152004-05-29 21:46:49 +00003615 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
danielk197771d5d2c2008-09-29 11:49:47 +00003616 pCur->aiIdx[pCur->iPage] = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00003617 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00003618 }
drhd677b3d2007-08-20 22:48:41 +00003619 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00003620 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
drhd677b3d2007-08-20 22:48:41 +00003621 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003622 pCur->validNKey = 0;
drhd677b3d2007-08-20 22:48:41 +00003623 }
danielk1977518002e2008-09-05 05:02:46 +00003624 return rc;
drh2dcc9aa2002-12-04 13:40:25 +00003625}
3626
drh5e00f6c2001-09-13 13:46:56 +00003627/* Move the cursor to the first entry in the table. Return SQLITE_OK
3628** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003629** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00003630*/
drh3aac2dd2004-04-26 14:10:20 +00003631int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00003632 int rc;
drhd677b3d2007-08-20 22:48:41 +00003633
drh1fee73e2007-08-29 04:00:57 +00003634 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003635 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh5e00f6c2001-09-13 13:46:56 +00003636 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003637 if( rc==SQLITE_OK ){
3638 if( pCur->eState==CURSOR_INVALID ){
danielk197771d5d2c2008-09-29 11:49:47 +00003639 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003640 *pRes = 1;
3641 rc = SQLITE_OK;
3642 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003643 assert( pCur->apPage[pCur->iPage]->nCell>0 );
drhd677b3d2007-08-20 22:48:41 +00003644 *pRes = 0;
3645 rc = moveToLeftmost(pCur);
3646 }
drh5e00f6c2001-09-13 13:46:56 +00003647 }
drh5e00f6c2001-09-13 13:46:56 +00003648 return rc;
3649}
drh5e2f8b92001-05-28 00:41:15 +00003650
drh9562b552002-02-19 15:00:07 +00003651/* Move the cursor to the last entry in the table. Return SQLITE_OK
3652** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003653** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00003654*/
drh3aac2dd2004-04-26 14:10:20 +00003655int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00003656 int rc;
drhd677b3d2007-08-20 22:48:41 +00003657
drh1fee73e2007-08-29 04:00:57 +00003658 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003659 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh9562b552002-02-19 15:00:07 +00003660 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003661 if( rc==SQLITE_OK ){
3662 if( CURSOR_INVALID==pCur->eState ){
danielk197771d5d2c2008-09-29 11:49:47 +00003663 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003664 *pRes = 1;
3665 }else{
3666 assert( pCur->eState==CURSOR_VALID );
3667 *pRes = 0;
3668 rc = moveToRightmost(pCur);
drha2c20e42008-03-29 16:01:04 +00003669 getCellInfo(pCur);
3670 pCur->atLast = rc==SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00003671 }
drh9562b552002-02-19 15:00:07 +00003672 }
drh9562b552002-02-19 15:00:07 +00003673 return rc;
3674}
3675
drhe14006d2008-03-25 17:23:32 +00003676/* Move the cursor so that it points to an entry near the key
drhe63d9992008-08-13 19:11:48 +00003677** specified by pIdxKey or intKey. Return a success code.
drh72f82862001-05-24 21:06:34 +00003678**
drhe63d9992008-08-13 19:11:48 +00003679** For INTKEY tables, the intKey parameter is used. pIdxKey
3680** must be NULL. For index tables, pIdxKey is used and intKey
3681** is ignored.
drh3aac2dd2004-04-26 14:10:20 +00003682**
drh5e2f8b92001-05-28 00:41:15 +00003683** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00003684** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00003685** were present. The cursor might point to an entry that comes
3686** before or after the key.
3687**
drhbd03cae2001-06-02 02:40:57 +00003688** The result of comparing the key with the entry to which the
drhab01f612004-05-22 02:55:23 +00003689** cursor is written to *pRes if pRes!=NULL. The meaning of
drhbd03cae2001-06-02 02:40:57 +00003690** this value is as follows:
3691**
3692** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00003693** is smaller than pKey or if the table is empty
3694** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00003695**
3696** *pRes==0 The cursor is left pointing at an entry that
3697** exactly matches pKey.
3698**
3699** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00003700** is larger than pKey.
drhd677b3d2007-08-20 22:48:41 +00003701**
drha059ad02001-04-17 20:09:11 +00003702*/
drhe63d9992008-08-13 19:11:48 +00003703int sqlite3BtreeMovetoUnpacked(
3704 BtCursor *pCur, /* The cursor to be moved */
3705 UnpackedRecord *pIdxKey, /* Unpacked index key */
3706 i64 intKey, /* The table key */
3707 int biasRight, /* If true, bias the search to the high end */
3708 int *pRes /* Write search results here */
drhe4d90812007-03-29 05:51:49 +00003709){
drh72f82862001-05-24 21:06:34 +00003710 int rc;
drhd677b3d2007-08-20 22:48:41 +00003711
drh1fee73e2007-08-29 04:00:57 +00003712 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003713 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drha2c20e42008-03-29 16:01:04 +00003714
3715 /* If the cursor is already positioned at the point we are trying
3716 ** to move to, then just return without doing any work */
danielk197771d5d2c2008-09-29 11:49:47 +00003717 if( pCur->eState==CURSOR_VALID && pCur->validNKey
3718 && pCur->apPage[0]->intKey
3719 ){
drhe63d9992008-08-13 19:11:48 +00003720 if( pCur->info.nKey==intKey ){
drha2c20e42008-03-29 16:01:04 +00003721 *pRes = 0;
3722 return SQLITE_OK;
3723 }
drhe63d9992008-08-13 19:11:48 +00003724 if( pCur->atLast && pCur->info.nKey<intKey ){
drha2c20e42008-03-29 16:01:04 +00003725 *pRes = -1;
3726 return SQLITE_OK;
3727 }
3728 }
3729
drh5e2f8b92001-05-28 00:41:15 +00003730 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003731 if( rc ){
3732 return rc;
3733 }
danielk197771d5d2c2008-09-29 11:49:47 +00003734 assert( pCur->apPage[pCur->iPage] );
3735 assert( pCur->apPage[pCur->iPage]->isInit );
danielk1977da184232006-01-05 11:34:32 +00003736 if( pCur->eState==CURSOR_INVALID ){
drhf328bc82004-05-10 23:29:49 +00003737 *pRes = -1;
danielk197771d5d2c2008-09-29 11:49:47 +00003738 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhc39e0002004-05-07 23:50:57 +00003739 return SQLITE_OK;
3740 }
danielk197771d5d2c2008-09-29 11:49:47 +00003741 assert( pCur->apPage[0]->intKey || pIdxKey );
drh14684382006-11-30 13:05:29 +00003742 for(;;){
drh72f82862001-05-24 21:06:34 +00003743 int lwr, upr;
3744 Pgno chldPg;
danielk197771d5d2c2008-09-29 11:49:47 +00003745 MemPage *pPage = pCur->apPage[pCur->iPage];
drh1a844c32002-12-04 22:29:28 +00003746 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00003747 lwr = 0;
3748 upr = pPage->nCell-1;
drhe63d9992008-08-13 19:11:48 +00003749 if( !pPage->intKey && pIdxKey==0 ){
drh1e968a02008-03-25 00:22:21 +00003750 rc = SQLITE_CORRUPT_BKPT;
3751 goto moveto_finish;
drh4eec4c12005-01-21 00:22:37 +00003752 }
drhe4d90812007-03-29 05:51:49 +00003753 if( biasRight ){
danielk197771d5d2c2008-09-29 11:49:47 +00003754 pCur->aiIdx[pCur->iPage] = upr;
drhe4d90812007-03-29 05:51:49 +00003755 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003756 pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
drhe4d90812007-03-29 05:51:49 +00003757 }
drhf1d68b32007-03-29 04:43:26 +00003758 if( lwr<=upr ) for(;;){
danielk197713adf8a2004-06-03 16:08:41 +00003759 void *pCellKey;
drh4a1c3802004-05-12 15:15:47 +00003760 i64 nCellKey;
danielk197771d5d2c2008-09-29 11:49:47 +00003761 int idx = pCur->aiIdx[pCur->iPage];
drh366fda62006-01-13 02:35:09 +00003762 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003763 pCur->validNKey = 1;
drh3aac2dd2004-04-26 14:10:20 +00003764 if( pPage->intKey ){
drh777e4c42006-01-13 04:31:58 +00003765 u8 *pCell;
danielk197771d5d2c2008-09-29 11:49:47 +00003766 pCell = findCell(pPage, idx) + pPage->childPtrSize;
drhd172f862006-01-12 15:01:15 +00003767 if( pPage->hasData ){
danielk1977bab45c62006-01-16 15:14:27 +00003768 u32 dummy;
shane3f8d5cf2008-04-24 19:15:09 +00003769 pCell += getVarint32(pCell, dummy);
drhd172f862006-01-12 15:01:15 +00003770 }
drha2c20e42008-03-29 16:01:04 +00003771 getVarint(pCell, (u64*)&nCellKey);
drhe63d9992008-08-13 19:11:48 +00003772 if( nCellKey==intKey ){
drh3aac2dd2004-04-26 14:10:20 +00003773 c = 0;
drhe63d9992008-08-13 19:11:48 +00003774 }else if( nCellKey<intKey ){
drh41eb9e92008-04-02 18:33:07 +00003775 c = -1;
3776 }else{
drhe63d9992008-08-13 19:11:48 +00003777 assert( nCellKey>intKey );
drh41eb9e92008-04-02 18:33:07 +00003778 c = +1;
drh3aac2dd2004-04-26 14:10:20 +00003779 }
drh3aac2dd2004-04-26 14:10:20 +00003780 }else{
drhe51c44f2004-05-30 20:46:09 +00003781 int available;
danielk197713adf8a2004-06-03 16:08:41 +00003782 pCellKey = (void *)fetchPayload(pCur, &available, 0);
drh366fda62006-01-13 02:35:09 +00003783 nCellKey = pCur->info.nKey;
drhe51c44f2004-05-30 20:46:09 +00003784 if( available>=nCellKey ){
drhe63d9992008-08-13 19:11:48 +00003785 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
drhe51c44f2004-05-30 20:46:09 +00003786 }else{
drhfacf0302008-06-17 15:12:00 +00003787 pCellKey = sqlite3Malloc( nCellKey );
danielk19776507ecb2008-03-25 09:56:44 +00003788 if( pCellKey==0 ){
3789 rc = SQLITE_NOMEM;
3790 goto moveto_finish;
3791 }
danielk197713adf8a2004-06-03 16:08:41 +00003792 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
drhe63d9992008-08-13 19:11:48 +00003793 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
drhfacf0302008-06-17 15:12:00 +00003794 sqlite3_free(pCellKey);
drh1e968a02008-03-25 00:22:21 +00003795 if( rc ) goto moveto_finish;
drhe51c44f2004-05-30 20:46:09 +00003796 }
drh3aac2dd2004-04-26 14:10:20 +00003797 }
drh72f82862001-05-24 21:06:34 +00003798 if( c==0 ){
drha2c20e42008-03-29 16:01:04 +00003799 pCur->info.nKey = nCellKey;
drh44845222008-07-17 18:39:57 +00003800 if( pPage->intKey && !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00003801 lwr = idx;
drhfc70e6f2004-05-12 21:11:27 +00003802 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00003803 break;
3804 }else{
drh8b18dd42004-05-12 19:18:15 +00003805 if( pRes ) *pRes = 0;
drh1e968a02008-03-25 00:22:21 +00003806 rc = SQLITE_OK;
3807 goto moveto_finish;
drh8b18dd42004-05-12 19:18:15 +00003808 }
drh72f82862001-05-24 21:06:34 +00003809 }
3810 if( c<0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003811 lwr = idx+1;
drh72f82862001-05-24 21:06:34 +00003812 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003813 upr = idx-1;
drh72f82862001-05-24 21:06:34 +00003814 }
drhf1d68b32007-03-29 04:43:26 +00003815 if( lwr>upr ){
drha2c20e42008-03-29 16:01:04 +00003816 pCur->info.nKey = nCellKey;
drhf1d68b32007-03-29 04:43:26 +00003817 break;
3818 }
danielk197771d5d2c2008-09-29 11:49:47 +00003819 pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
drh72f82862001-05-24 21:06:34 +00003820 }
3821 assert( lwr==upr+1 );
danielk197771d5d2c2008-09-29 11:49:47 +00003822 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00003823 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003824 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00003825 }else if( lwr>=pPage->nCell ){
drh43605152004-05-29 21:46:49 +00003826 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh72f82862001-05-24 21:06:34 +00003827 }else{
danielk19771cc5ed82007-05-16 17:28:43 +00003828 chldPg = get4byte(findCell(pPage, lwr));
drh72f82862001-05-24 21:06:34 +00003829 }
3830 if( chldPg==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003831 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh72f82862001-05-24 21:06:34 +00003832 if( pRes ) *pRes = c;
drh1e968a02008-03-25 00:22:21 +00003833 rc = SQLITE_OK;
3834 goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003835 }
danielk197771d5d2c2008-09-29 11:49:47 +00003836 pCur->aiIdx[pCur->iPage] = lwr;
drh271efa52004-05-30 19:19:05 +00003837 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003838 pCur->validNKey = 0;
drh8178a752003-01-05 21:41:40 +00003839 rc = moveToChild(pCur, chldPg);
drh1e968a02008-03-25 00:22:21 +00003840 if( rc ) goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003841 }
drh1e968a02008-03-25 00:22:21 +00003842moveto_finish:
drhe63d9992008-08-13 19:11:48 +00003843 return rc;
3844}
3845
3846/*
3847** In this version of BtreeMoveto, pKey is a packed index record
3848** such as is generated by the OP_MakeRecord opcode. Unpack the
3849** record and then call BtreeMovetoUnpacked() to do the work.
3850*/
3851int sqlite3BtreeMoveto(
3852 BtCursor *pCur, /* Cursor open on the btree to be searched */
3853 const void *pKey, /* Packed key if the btree is an index */
3854 i64 nKey, /* Integer key for tables. Size of pKey for indices */
3855 int bias, /* Bias search to the high end */
3856 int *pRes /* Write search results here */
3857){
3858 int rc; /* Status code */
3859 UnpackedRecord *pIdxKey; /* Unpacked index key */
drh23f79d02008-08-20 22:06:47 +00003860 UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
drhe63d9992008-08-13 19:11:48 +00003861
drhe14006d2008-03-25 17:23:32 +00003862 if( pKey ){
drhe63d9992008-08-13 19:11:48 +00003863 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
drh23f79d02008-08-20 22:06:47 +00003864 aSpace, sizeof(aSpace));
drhe63d9992008-08-13 19:11:48 +00003865 if( pIdxKey==0 ) return SQLITE_NOMEM;
3866 }else{
3867 pIdxKey = 0;
3868 }
3869 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
3870 if( pKey ){
3871 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
drhe14006d2008-03-25 17:23:32 +00003872 }
drh1e968a02008-03-25 00:22:21 +00003873 return rc;
drh72f82862001-05-24 21:06:34 +00003874}
3875
drhd677b3d2007-08-20 22:48:41 +00003876
drh72f82862001-05-24 21:06:34 +00003877/*
drhc39e0002004-05-07 23:50:57 +00003878** Return TRUE if the cursor is not pointing at an entry of the table.
3879**
3880** TRUE will be returned after a call to sqlite3BtreeNext() moves
3881** past the last entry in the table or sqlite3BtreePrev() moves past
3882** the first entry. TRUE is also returned if the table is empty.
3883*/
3884int sqlite3BtreeEof(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00003885 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3886 ** have been deleted? This API will need to change to return an error code
3887 ** as well as the boolean result value.
3888 */
3889 return (CURSOR_VALID!=pCur->eState);
drhc39e0002004-05-07 23:50:57 +00003890}
3891
3892/*
drhb21c8cd2007-08-21 19:33:56 +00003893** Return the database connection handle for a cursor.
3894*/
3895sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
drhe5fe6902007-12-07 18:55:28 +00003896 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3897 return pCur->pBtree->db;
drhb21c8cd2007-08-21 19:33:56 +00003898}
3899
3900/*
drhbd03cae2001-06-02 02:40:57 +00003901** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00003902** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00003903** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00003904** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00003905*/
drhd094db12008-04-03 21:46:57 +00003906int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00003907 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003908 int idx;
danielk197797a227c2006-01-20 16:32:04 +00003909 MemPage *pPage;
drh8b18dd42004-05-12 19:18:15 +00003910
drh1fee73e2007-08-29 04:00:57 +00003911 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003912 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003913 if( rc!=SQLITE_OK ){
3914 return rc;
3915 }
drh8c4d3a62007-04-06 01:03:32 +00003916 assert( pRes!=0 );
drh8c4d3a62007-04-06 01:03:32 +00003917 if( CURSOR_INVALID==pCur->eState ){
3918 *pRes = 1;
3919 return SQLITE_OK;
3920 }
danielk1977da184232006-01-05 11:34:32 +00003921 if( pCur->skip>0 ){
3922 pCur->skip = 0;
3923 *pRes = 0;
3924 return SQLITE_OK;
3925 }
3926 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00003927
danielk197771d5d2c2008-09-29 11:49:47 +00003928 pPage = pCur->apPage[pCur->iPage];
3929 idx = ++pCur->aiIdx[pCur->iPage];
3930 assert( pPage->isInit );
3931 assert( idx<=pPage->nCell );
danielk19776a43f9b2004-11-16 04:57:24 +00003932
drh271efa52004-05-30 19:19:05 +00003933 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003934 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003935 if( idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00003936 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00003937 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
drh5e2f8b92001-05-28 00:41:15 +00003938 if( rc ) return rc;
3939 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003940 *pRes = 0;
3941 return rc;
drh72f82862001-05-24 21:06:34 +00003942 }
drh5e2f8b92001-05-28 00:41:15 +00003943 do{
danielk197771d5d2c2008-09-29 11:49:47 +00003944 if( pCur->iPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00003945 *pRes = 1;
danielk1977da184232006-01-05 11:34:32 +00003946 pCur->eState = CURSOR_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00003947 return SQLITE_OK;
3948 }
drh16a9b832007-05-05 18:39:25 +00003949 sqlite3BtreeMoveToParent(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00003950 pPage = pCur->apPage[pCur->iPage];
3951 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00003952 *pRes = 0;
drh44845222008-07-17 18:39:57 +00003953 if( pPage->intKey ){
drh8b18dd42004-05-12 19:18:15 +00003954 rc = sqlite3BtreeNext(pCur, pRes);
3955 }else{
3956 rc = SQLITE_OK;
3957 }
3958 return rc;
drh8178a752003-01-05 21:41:40 +00003959 }
3960 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00003961 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00003962 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00003963 }
drh5e2f8b92001-05-28 00:41:15 +00003964 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00003965 return rc;
drh72f82862001-05-24 21:06:34 +00003966}
drhd677b3d2007-08-20 22:48:41 +00003967
drh72f82862001-05-24 21:06:34 +00003968
drh3b7511c2001-05-26 13:15:44 +00003969/*
drh2dcc9aa2002-12-04 13:40:25 +00003970** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00003971** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00003972** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00003973** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00003974*/
drhd094db12008-04-03 21:46:57 +00003975int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00003976 int rc;
drh8178a752003-01-05 21:41:40 +00003977 MemPage *pPage;
danielk1977da184232006-01-05 11:34:32 +00003978
drh1fee73e2007-08-29 04:00:57 +00003979 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003980 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003981 if( rc!=SQLITE_OK ){
3982 return rc;
3983 }
drha2c20e42008-03-29 16:01:04 +00003984 pCur->atLast = 0;
drh8c4d3a62007-04-06 01:03:32 +00003985 if( CURSOR_INVALID==pCur->eState ){
3986 *pRes = 1;
3987 return SQLITE_OK;
3988 }
danielk1977da184232006-01-05 11:34:32 +00003989 if( pCur->skip<0 ){
3990 pCur->skip = 0;
3991 *pRes = 0;
3992 return SQLITE_OK;
3993 }
3994 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00003995
danielk197771d5d2c2008-09-29 11:49:47 +00003996 pPage = pCur->apPage[pCur->iPage];
3997 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00003998 if( !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00003999 int idx = pCur->aiIdx[pCur->iPage];
4000 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
drhd677b3d2007-08-20 22:48:41 +00004001 if( rc ){
4002 return rc;
4003 }
drh2dcc9aa2002-12-04 13:40:25 +00004004 rc = moveToRightmost(pCur);
4005 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00004006 while( pCur->aiIdx[pCur->iPage]==0 ){
4007 if( pCur->iPage==0 ){
danielk1977da184232006-01-05 11:34:32 +00004008 pCur->eState = CURSOR_INVALID;
drhc39e0002004-05-07 23:50:57 +00004009 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00004010 return SQLITE_OK;
4011 }
drh16a9b832007-05-05 18:39:25 +00004012 sqlite3BtreeMoveToParent(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00004013 }
drh271efa52004-05-30 19:19:05 +00004014 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004015 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004016
4017 pCur->aiIdx[pCur->iPage]--;
4018 pPage = pCur->apPage[pCur->iPage];
drh44845222008-07-17 18:39:57 +00004019 if( pPage->intKey && !pPage->leaf ){
drh8b18dd42004-05-12 19:18:15 +00004020 rc = sqlite3BtreePrevious(pCur, pRes);
4021 }else{
4022 rc = SQLITE_OK;
4023 }
drh2dcc9aa2002-12-04 13:40:25 +00004024 }
drh8178a752003-01-05 21:41:40 +00004025 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00004026 return rc;
4027}
4028
4029/*
drh3b7511c2001-05-26 13:15:44 +00004030** Allocate a new page from the database file.
4031**
danielk19773b8a05f2007-03-19 17:44:26 +00004032** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
drh3b7511c2001-05-26 13:15:44 +00004033** has already been called on the new page.) The new page has also
4034** been referenced and the calling routine is responsible for calling
danielk19773b8a05f2007-03-19 17:44:26 +00004035** sqlite3PagerUnref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00004036**
4037** SQLITE_OK is returned on success. Any other return value indicates
4038** an error. *ppPage and *pPgno are undefined in the event of an error.
danielk19773b8a05f2007-03-19 17:44:26 +00004039** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00004040**
drh199e3cf2002-07-18 11:01:47 +00004041** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4042** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00004043** attempt to keep related pages close to each other in the database file,
4044** which in turn can make database access faster.
danielk1977cb1a7eb2004-11-05 12:27:02 +00004045**
4046** If the "exact" parameter is not 0, and the page-number nearby exists
4047** anywhere on the free-list, then it is guarenteed to be returned. This
4048** is only used by auto-vacuum databases when allocating a new table.
drh3b7511c2001-05-26 13:15:44 +00004049*/
drh4f0c5872007-03-26 22:05:01 +00004050static int allocateBtreePage(
danielk1977aef0bf62005-12-30 16:28:01 +00004051 BtShared *pBt,
danielk1977cb1a7eb2004-11-05 12:27:02 +00004052 MemPage **ppPage,
4053 Pgno *pPgno,
4054 Pgno nearby,
4055 u8 exact
4056){
drh3aac2dd2004-04-26 14:10:20 +00004057 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00004058 int rc;
drh3aac2dd2004-04-26 14:10:20 +00004059 int n; /* Number of pages on the freelist */
4060 int k; /* Number of leaves on the trunk of the freelist */
drhd3627af2006-12-18 18:34:51 +00004061 MemPage *pTrunk = 0;
4062 MemPage *pPrevTrunk = 0;
drh30e58752002-03-02 20:41:57 +00004063
drh1fee73e2007-08-29 04:00:57 +00004064 assert( sqlite3_mutex_held(pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004065 pPage1 = pBt->pPage1;
4066 n = get4byte(&pPage1->aData[36]);
4067 if( n>0 ){
drh91025292004-05-03 19:49:32 +00004068 /* There are pages on the freelist. Reuse one of those pages. */
danielk1977cb1a7eb2004-11-05 12:27:02 +00004069 Pgno iTrunk;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004070 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4071
4072 /* If the 'exact' parameter was true and a query of the pointer-map
4073 ** shows that the page 'nearby' is somewhere on the free-list, then
4074 ** the entire-list will be searched for that page.
4075 */
4076#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977ad0132d2008-06-07 08:58:22 +00004077 if( exact && nearby<=pagerPagecount(pBt->pPager) ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004078 u8 eType;
4079 assert( nearby>0 );
4080 assert( pBt->autoVacuum );
4081 rc = ptrmapGet(pBt, nearby, &eType, 0);
4082 if( rc ) return rc;
4083 if( eType==PTRMAP_FREEPAGE ){
4084 searchList = 1;
4085 }
4086 *pPgno = nearby;
4087 }
4088#endif
4089
4090 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4091 ** first free-list trunk page. iPrevTrunk is initially 1.
4092 */
danielk19773b8a05f2007-03-19 17:44:26 +00004093 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3b7511c2001-05-26 13:15:44 +00004094 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004095 put4byte(&pPage1->aData[36], n-1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004096
4097 /* The code within this loop is run only once if the 'searchList' variable
4098 ** is not true. Otherwise, it runs once for each trunk-page on the
4099 ** free-list until the page 'nearby' is located.
4100 */
4101 do {
4102 pPrevTrunk = pTrunk;
4103 if( pPrevTrunk ){
4104 iTrunk = get4byte(&pPrevTrunk->aData[0]);
drhbea00b92002-07-08 10:59:50 +00004105 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00004106 iTrunk = get4byte(&pPage1->aData[32]);
drhbea00b92002-07-08 10:59:50 +00004107 }
drh16a9b832007-05-05 18:39:25 +00004108 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004109 if( rc ){
drhd3627af2006-12-18 18:34:51 +00004110 pTrunk = 0;
4111 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004112 }
4113
4114 k = get4byte(&pTrunk->aData[4]);
4115 if( k==0 && !searchList ){
4116 /* The trunk has no leaves and the list is not being searched.
4117 ** So extract the trunk page itself and use it as the newly
4118 ** allocated page */
4119 assert( pPrevTrunk==0 );
danielk19773b8a05f2007-03-19 17:44:26 +00004120 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004121 if( rc ){
4122 goto end_allocate_page;
4123 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004124 *pPgno = iTrunk;
4125 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4126 *ppPage = pTrunk;
4127 pTrunk = 0;
4128 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh45b1fac2008-07-04 17:52:42 +00004129 }else if( k>pBt->usableSize/4 - 2 ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004130 /* Value of k is out of range. Database corruption */
drhd3627af2006-12-18 18:34:51 +00004131 rc = SQLITE_CORRUPT_BKPT;
4132 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004133#ifndef SQLITE_OMIT_AUTOVACUUM
4134 }else if( searchList && nearby==iTrunk ){
4135 /* The list is being searched and this trunk page is the page
4136 ** to allocate, regardless of whether it has leaves.
4137 */
4138 assert( *pPgno==iTrunk );
4139 *ppPage = pTrunk;
4140 searchList = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00004141 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004142 if( rc ){
4143 goto end_allocate_page;
4144 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004145 if( k==0 ){
4146 if( !pPrevTrunk ){
4147 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4148 }else{
4149 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4150 }
4151 }else{
4152 /* The trunk page is required by the caller but it contains
4153 ** pointers to free-list leaves. The first leaf becomes a trunk
4154 ** page in this case.
4155 */
4156 MemPage *pNewTrunk;
4157 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
drh16a9b832007-05-05 18:39:25 +00004158 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004159 if( rc!=SQLITE_OK ){
drhd3627af2006-12-18 18:34:51 +00004160 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004161 }
danielk19773b8a05f2007-03-19 17:44:26 +00004162 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004163 if( rc!=SQLITE_OK ){
4164 releasePage(pNewTrunk);
drhd3627af2006-12-18 18:34:51 +00004165 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004166 }
4167 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4168 put4byte(&pNewTrunk->aData[4], k-1);
4169 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
drhd3627af2006-12-18 18:34:51 +00004170 releasePage(pNewTrunk);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004171 if( !pPrevTrunk ){
4172 put4byte(&pPage1->aData[32], iNewTrunk);
4173 }else{
danielk19773b8a05f2007-03-19 17:44:26 +00004174 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004175 if( rc ){
4176 goto end_allocate_page;
4177 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004178 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4179 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004180 }
4181 pTrunk = 0;
4182 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4183#endif
4184 }else{
4185 /* Extract a leaf from the trunk */
4186 int closest;
4187 Pgno iPage;
4188 unsigned char *aData = pTrunk->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00004189 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004190 if( rc ){
4191 goto end_allocate_page;
4192 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004193 if( nearby>0 ){
4194 int i, dist;
4195 closest = 0;
4196 dist = get4byte(&aData[8]) - nearby;
4197 if( dist<0 ) dist = -dist;
4198 for(i=1; i<k; i++){
4199 int d2 = get4byte(&aData[8+i*4]) - nearby;
4200 if( d2<0 ) d2 = -d2;
4201 if( d2<dist ){
4202 closest = i;
4203 dist = d2;
4204 }
4205 }
4206 }else{
4207 closest = 0;
4208 }
4209
4210 iPage = get4byte(&aData[8+closest*4]);
4211 if( !searchList || iPage==nearby ){
danielk1977ad0132d2008-06-07 08:58:22 +00004212 int nPage;
shane1f9e6aa2008-06-09 19:27:11 +00004213 *pPgno = iPage;
danielk1977ad0132d2008-06-07 08:58:22 +00004214 nPage = pagerPagecount(pBt->pPager);
4215 if( *pPgno>nPage ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004216 /* Free page off the end of the file */
danielk197743e377a2008-05-05 12:09:32 +00004217 rc = SQLITE_CORRUPT_BKPT;
4218 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004219 }
4220 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4221 ": %d more free pages\n",
4222 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4223 if( closest<k-1 ){
4224 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4225 }
4226 put4byte(&aData[4], k-1);
drh16a9b832007-05-05 18:39:25 +00004227 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004228 if( rc==SQLITE_OK ){
drh538f5702007-04-13 02:14:30 +00004229 sqlite3PagerDontRollback((*ppPage)->pDbPage);
danielk19773b8a05f2007-03-19 17:44:26 +00004230 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004231 if( rc!=SQLITE_OK ){
4232 releasePage(*ppPage);
4233 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004234 }
4235 searchList = 0;
4236 }
drhee696e22004-08-30 16:52:17 +00004237 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004238 releasePage(pPrevTrunk);
drhd3627af2006-12-18 18:34:51 +00004239 pPrevTrunk = 0;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004240 }while( searchList );
drh3b7511c2001-05-26 13:15:44 +00004241 }else{
drh3aac2dd2004-04-26 14:10:20 +00004242 /* There are no pages on the freelist, so create a new page at the
4243 ** end of the file */
danielk1977ad0132d2008-06-07 08:58:22 +00004244 int nPage = pagerPagecount(pBt->pPager);
4245 *pPgno = nPage + 1;
danielk1977afcdd022004-10-31 16:25:42 +00004246
4247#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00004248 if( pBt->nTrunc ){
4249 /* An incr-vacuum has already run within this transaction. So the
4250 ** page to allocate is not from the physical end of the file, but
4251 ** at pBt->nTrunc.
4252 */
4253 *pPgno = pBt->nTrunc+1;
4254 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4255 (*pPgno)++;
4256 }
4257 }
danielk1977266664d2006-02-10 08:24:21 +00004258 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
danielk1977afcdd022004-10-31 16:25:42 +00004259 /* If *pPgno refers to a pointer-map page, allocate two new pages
4260 ** at the end of the file instead of one. The first allocated page
4261 ** becomes a new pointer-map page, the second is used by the caller.
4262 */
4263 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
danielk1977599fcba2004-11-08 07:13:13 +00004264 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk1977afcdd022004-10-31 16:25:42 +00004265 (*pPgno)++;
drh72190432008-01-31 14:54:43 +00004266 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
danielk1977afcdd022004-10-31 16:25:42 +00004267 }
danielk1977dddbcdc2007-04-26 14:42:34 +00004268 if( pBt->nTrunc ){
4269 pBt->nTrunc = *pPgno;
4270 }
danielk1977afcdd022004-10-31 16:25:42 +00004271#endif
4272
danielk1977599fcba2004-11-08 07:13:13 +00004273 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drh16a9b832007-05-05 18:39:25 +00004274 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
drh3b7511c2001-05-26 13:15:44 +00004275 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00004276 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004277 if( rc!=SQLITE_OK ){
4278 releasePage(*ppPage);
4279 }
drh3a4c1412004-05-09 20:40:11 +00004280 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00004281 }
danielk1977599fcba2004-11-08 07:13:13 +00004282
4283 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drhd3627af2006-12-18 18:34:51 +00004284
4285end_allocate_page:
4286 releasePage(pTrunk);
4287 releasePage(pPrevTrunk);
danielk197771d5d2c2008-09-29 11:49:47 +00004288 if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4289 releasePage(*ppPage);
4290 return SQLITE_CORRUPT_BKPT;
danielk1977eaa06f62008-09-18 17:34:44 +00004291 }
drh3b7511c2001-05-26 13:15:44 +00004292 return rc;
4293}
4294
4295/*
drh3aac2dd2004-04-26 14:10:20 +00004296** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00004297**
danielk19773b8a05f2007-03-19 17:44:26 +00004298** sqlite3PagerUnref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00004299*/
drh3aac2dd2004-04-26 14:10:20 +00004300static int freePage(MemPage *pPage){
danielk1977aef0bf62005-12-30 16:28:01 +00004301 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004302 MemPage *pPage1 = pBt->pPage1;
4303 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00004304
drh3aac2dd2004-04-26 14:10:20 +00004305 /* Prepare the page for freeing */
drh1fee73e2007-08-29 04:00:57 +00004306 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004307 assert( pPage->pgno>1 );
4308 pPage->isInit = 0;
drh3aac2dd2004-04-26 14:10:20 +00004309
drha34b6762004-05-07 13:30:42 +00004310 /* Increment the free page count on pPage1 */
danielk19773b8a05f2007-03-19 17:44:26 +00004311 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00004312 if( rc ) return rc;
4313 n = get4byte(&pPage1->aData[36]);
4314 put4byte(&pPage1->aData[36], n+1);
4315
drhfcce93f2006-02-22 03:08:32 +00004316#ifdef SQLITE_SECURE_DELETE
4317 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4318 ** always fully overwrite deleted information with zeros.
4319 */
danielk19773b8a05f2007-03-19 17:44:26 +00004320 rc = sqlite3PagerWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004321 if( rc ) return rc;
4322 memset(pPage->aData, 0, pPage->pBt->pageSize);
4323#endif
4324
danielk1977687566d2004-11-02 12:56:41 +00004325 /* If the database supports auto-vacuum, write an entry in the pointer-map
danielk1977cb1a7eb2004-11-05 12:27:02 +00004326 ** to indicate that the page is free.
danielk1977687566d2004-11-02 12:56:41 +00004327 */
danielk197785d90ca2008-07-19 14:25:15 +00004328 if( ISAUTOVACUUM ){
danielk1977687566d2004-11-02 12:56:41 +00004329 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
danielk1977a64a0352004-11-05 01:45:13 +00004330 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00004331 }
danielk1977687566d2004-11-02 12:56:41 +00004332
drh3aac2dd2004-04-26 14:10:20 +00004333 if( n==0 ){
4334 /* This is the first free page */
danielk19773b8a05f2007-03-19 17:44:26 +00004335 rc = sqlite3PagerWrite(pPage->pDbPage);
drhda200cc2004-05-09 11:51:38 +00004336 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004337 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00004338 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00004339 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004340 }else{
4341 /* Other free pages already exist. Retrive the first trunk page
4342 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00004343 MemPage *pTrunk;
drh16a9b832007-05-05 18:39:25 +00004344 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
drh3b7511c2001-05-26 13:15:44 +00004345 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004346 k = get4byte(&pTrunk->aData[4]);
drhee696e22004-08-30 16:52:17 +00004347 if( k>=pBt->usableSize/4 - 8 ){
drh3aac2dd2004-04-26 14:10:20 +00004348 /* The trunk is full. Turn the page being freed into a new
drh45b1fac2008-07-04 17:52:42 +00004349 ** trunk page with no leaves.
4350 **
4351 ** Note that the trunk page is not really full until it contains
4352 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4353 ** coded. But due to a coding error in versions of SQLite prior to
4354 ** 3.6.0, databases with freelist trunk pages holding more than
4355 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4356 ** to maintain backwards compatibility with older versions of SQLite,
4357 ** we will contain to restrict the number of entries to usableSize/4 - 8
4358 ** for now. At some point in the future (once everyone has upgraded
4359 ** to 3.6.0 or later) we should consider fixing the conditional above
4360 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4361 */
danielk19773b8a05f2007-03-19 17:44:26 +00004362 rc = sqlite3PagerWrite(pPage->pDbPage);
drhb9ee4932007-09-07 14:32:06 +00004363 if( rc==SQLITE_OK ){
4364 put4byte(pPage->aData, pTrunk->pgno);
4365 put4byte(&pPage->aData[4], 0);
4366 put4byte(&pPage1->aData[32], pPage->pgno);
4367 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4368 pPage->pgno, pTrunk->pgno));
4369 }
4370 }else if( k<0 ){
4371 rc = SQLITE_CORRUPT;
drh3aac2dd2004-04-26 14:10:20 +00004372 }else{
4373 /* Add the newly freed page as a leaf on the current trunk */
danielk19773b8a05f2007-03-19 17:44:26 +00004374 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhf5345442007-04-09 12:45:02 +00004375 if( rc==SQLITE_OK ){
4376 put4byte(&pTrunk->aData[4], k+1);
4377 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drhfcce93f2006-02-22 03:08:32 +00004378#ifndef SQLITE_SECURE_DELETE
danielk1977a1fa00d2008-08-27 15:16:33 +00004379 rc = sqlite3PagerDontWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004380#endif
drhf5345442007-04-09 12:45:02 +00004381 }
drh3a4c1412004-05-09 20:40:11 +00004382 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004383 }
4384 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00004385 }
drh3b7511c2001-05-26 13:15:44 +00004386 return rc;
4387}
4388
4389/*
drh3aac2dd2004-04-26 14:10:20 +00004390** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00004391*/
drh3aac2dd2004-04-26 14:10:20 +00004392static int clearCell(MemPage *pPage, unsigned char *pCell){
danielk1977aef0bf62005-12-30 16:28:01 +00004393 BtShared *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00004394 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00004395 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00004396 int rc;
drh94440812007-03-06 11:42:19 +00004397 int nOvfl;
4398 int ovflPageSize;
drh3b7511c2001-05-26 13:15:44 +00004399
drh1fee73e2007-08-29 04:00:57 +00004400 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh16a9b832007-05-05 18:39:25 +00004401 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004402 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00004403 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00004404 }
drh6f11bef2004-05-13 01:12:56 +00004405 ovflPgno = get4byte(&pCell[info.iOverflow]);
drh94440812007-03-06 11:42:19 +00004406 ovflPageSize = pBt->usableSize - 4;
drh72365832007-03-06 15:53:44 +00004407 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4408 assert( ovflPgno==0 || nOvfl>0 );
4409 while( nOvfl-- ){
drh3aac2dd2004-04-26 14:10:20 +00004410 MemPage *pOvfl;
danielk1977ad0132d2008-06-07 08:58:22 +00004411 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00004412 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00004413 }
danielk19778c0a9592007-04-30 16:55:00 +00004414
4415 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
drh3b7511c2001-05-26 13:15:44 +00004416 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00004417 rc = freePage(pOvfl);
danielk19773b8a05f2007-03-19 17:44:26 +00004418 sqlite3PagerUnref(pOvfl->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00004419 if( rc ) return rc;
drh3b7511c2001-05-26 13:15:44 +00004420 }
drh5e2f8b92001-05-28 00:41:15 +00004421 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00004422}
4423
4424/*
drh91025292004-05-03 19:49:32 +00004425** Create the byte sequence used to represent a cell on page pPage
4426** and write that byte sequence into pCell[]. Overflow pages are
4427** allocated and filled in as necessary. The calling procedure
4428** is responsible for making sure sufficient space has been allocated
4429** for pCell[].
4430**
4431** Note that pCell does not necessary need to point to the pPage->aData
4432** area. pCell might point to some temporary storage. The cell will
4433** be constructed in this temporary area then copied into pPage->aData
4434** later.
drh3b7511c2001-05-26 13:15:44 +00004435*/
4436static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00004437 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00004438 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00004439 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00004440 const void *pData,int nData, /* The data */
drhb026e052007-05-02 01:34:31 +00004441 int nZero, /* Extra zero bytes to append to pData */
drh4b70f112004-05-02 21:12:19 +00004442 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00004443){
drh3b7511c2001-05-26 13:15:44 +00004444 int nPayload;
drh8c6fa9b2004-05-26 00:01:53 +00004445 const u8 *pSrc;
drha34b6762004-05-07 13:30:42 +00004446 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00004447 int spaceLeft;
4448 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00004449 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00004450 unsigned char *pPrior;
4451 unsigned char *pPayload;
danielk1977aef0bf62005-12-30 16:28:01 +00004452 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004453 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00004454 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00004455 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00004456
drh1fee73e2007-08-29 04:00:57 +00004457 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004458
drh91025292004-05-03 19:49:32 +00004459 /* Fill in the header. */
drh43605152004-05-29 21:46:49 +00004460 nHeader = 0;
drh91025292004-05-03 19:49:32 +00004461 if( !pPage->leaf ){
4462 nHeader += 4;
4463 }
drh8b18dd42004-05-12 19:18:15 +00004464 if( pPage->hasData ){
drhb026e052007-05-02 01:34:31 +00004465 nHeader += putVarint(&pCell[nHeader], nData+nZero);
drh6f11bef2004-05-13 01:12:56 +00004466 }else{
drhb026e052007-05-02 01:34:31 +00004467 nData = nZero = 0;
drh91025292004-05-03 19:49:32 +00004468 }
drh6f11bef2004-05-13 01:12:56 +00004469 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
drh16a9b832007-05-05 18:39:25 +00004470 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004471 assert( info.nHeader==nHeader );
4472 assert( info.nKey==nKey );
drhb026e052007-05-02 01:34:31 +00004473 assert( info.nData==nData+nZero );
drh6f11bef2004-05-13 01:12:56 +00004474
4475 /* Fill in the payload */
drhb026e052007-05-02 01:34:31 +00004476 nPayload = nData + nZero;
drh3aac2dd2004-04-26 14:10:20 +00004477 if( pPage->intKey ){
4478 pSrc = pData;
4479 nSrc = nData;
drh91025292004-05-03 19:49:32 +00004480 nData = 0;
drh3aac2dd2004-04-26 14:10:20 +00004481 }else{
4482 nPayload += nKey;
4483 pSrc = pKey;
4484 nSrc = nKey;
4485 }
drh6f11bef2004-05-13 01:12:56 +00004486 *pnSize = info.nSize;
4487 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00004488 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00004489 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00004490
drh3b7511c2001-05-26 13:15:44 +00004491 while( nPayload>0 ){
4492 if( spaceLeft==0 ){
danielk1977b39f70b2007-05-17 18:28:11 +00004493 int isExact = 0;
danielk1977afcdd022004-10-31 16:25:42 +00004494#ifndef SQLITE_OMIT_AUTOVACUUM
4495 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
danielk1977b39f70b2007-05-17 18:28:11 +00004496 if( pBt->autoVacuum ){
4497 do{
4498 pgnoOvfl++;
4499 } while(
4500 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4501 );
danielk197789a4be82007-05-23 13:34:32 +00004502 if( pgnoOvfl>1 ){
danielk1977b39f70b2007-05-17 18:28:11 +00004503 /* isExact = 1; */
4504 }
4505 }
danielk1977afcdd022004-10-31 16:25:42 +00004506#endif
danielk1977b39f70b2007-05-17 18:28:11 +00004507 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
danielk1977afcdd022004-10-31 16:25:42 +00004508#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977a19df672004-11-03 11:37:07 +00004509 /* If the database supports auto-vacuum, and the second or subsequent
4510 ** overflow page is being allocated, add an entry to the pointer-map
danielk19774ef24492007-05-23 09:52:41 +00004511 ** for that page now.
4512 **
4513 ** If this is the first overflow page, then write a partial entry
4514 ** to the pointer-map. If we write nothing to this pointer-map slot,
4515 ** then the optimistic overflow chain processing in clearCell()
4516 ** may misinterpret the uninitialised values and delete the
4517 ** wrong pages from the database.
danielk1977afcdd022004-10-31 16:25:42 +00004518 */
danielk19774ef24492007-05-23 09:52:41 +00004519 if( pBt->autoVacuum && rc==SQLITE_OK ){
4520 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4521 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
danielk197789a4be82007-05-23 13:34:32 +00004522 if( rc ){
4523 releasePage(pOvfl);
4524 }
danielk1977afcdd022004-10-31 16:25:42 +00004525 }
4526#endif
drh3b7511c2001-05-26 13:15:44 +00004527 if( rc ){
drh9b171272004-05-08 02:03:22 +00004528 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004529 return rc;
4530 }
drh3aac2dd2004-04-26 14:10:20 +00004531 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00004532 releasePage(pToRelease);
4533 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00004534 pPrior = pOvfl->aData;
4535 put4byte(pPrior, 0);
4536 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00004537 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00004538 }
4539 n = nPayload;
4540 if( n>spaceLeft ) n = spaceLeft;
drhb026e052007-05-02 01:34:31 +00004541 if( nSrc>0 ){
4542 if( n>nSrc ) n = nSrc;
4543 assert( pSrc );
4544 memcpy(pPayload, pSrc, n);
4545 }else{
4546 memset(pPayload, 0, n);
4547 }
drh3b7511c2001-05-26 13:15:44 +00004548 nPayload -= n;
drhde647132004-05-07 17:57:49 +00004549 pPayload += n;
drh9b171272004-05-08 02:03:22 +00004550 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00004551 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00004552 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00004553 if( nSrc==0 ){
4554 nSrc = nData;
4555 pSrc = pData;
4556 }
drhdd793422001-06-28 01:54:48 +00004557 }
drh9b171272004-05-08 02:03:22 +00004558 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004559 return SQLITE_OK;
4560}
4561
drh14acc042001-06-10 19:56:58 +00004562/*
4563** Remove the i-th cell from pPage. This routine effects pPage only.
4564** The cell content is not freed or deallocated. It is assumed that
4565** the cell content has been copied someplace else. This routine just
4566** removes the reference to the cell from pPage.
4567**
4568** "sz" must be the number of bytes in the cell.
drh14acc042001-06-10 19:56:58 +00004569*/
shane0af3f892008-11-12 04:55:34 +00004570static int dropCell(MemPage *pPage, int idx, int sz){
drh43605152004-05-29 21:46:49 +00004571 int i; /* Loop counter */
4572 int pc; /* Offset to cell content of cell being deleted */
4573 u8 *data; /* pPage->aData */
4574 u8 *ptr; /* Used to move bytes around within data[] */
4575
drh8c42ca92001-06-22 19:15:00 +00004576 assert( idx>=0 && idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00004577 assert( sz==cellSize(pPage, idx) );
danielk19773b8a05f2007-03-19 17:44:26 +00004578 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00004579 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhda200cc2004-05-09 11:51:38 +00004580 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00004581 ptr = &data[pPage->cellOffset + 2*idx];
shane0af3f892008-11-12 04:55:34 +00004582 pc = get2byte(ptr);
4583 if ( pc<=10 || pc+sz>pPage->pBt->usableSize ) {
4584 return SQLITE_CORRUPT_BKPT;
4585 }
drhde647132004-05-07 17:57:49 +00004586 freeSpace(pPage, pc, sz);
drh43605152004-05-29 21:46:49 +00004587 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4588 ptr[0] = ptr[2];
4589 ptr[1] = ptr[3];
drh14acc042001-06-10 19:56:58 +00004590 }
4591 pPage->nCell--;
drh43605152004-05-29 21:46:49 +00004592 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4593 pPage->nFree += 2;
shane0af3f892008-11-12 04:55:34 +00004594 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00004595}
4596
4597/*
4598** Insert a new cell on pPage at cell index "i". pCell points to the
4599** content of the cell.
4600**
4601** If the cell content will fit on the page, then put it there. If it
drh43605152004-05-29 21:46:49 +00004602** will not fit, then make a copy of the cell content into pTemp if
4603** pTemp is not null. Regardless of pTemp, allocate a new entry
4604** in pPage->aOvfl[] and make it point to the cell content (either
4605** in pTemp or the original pCell) and also record its index.
4606** Allocating a new entry in pPage->aCell[] implies that
4607** pPage->nOverflow is incremented.
danielk1977a3ad5e72005-01-07 08:56:44 +00004608**
4609** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4610** cell. The caller will overwrite them after this function returns. If
drh4b238df2005-01-08 15:43:18 +00004611** nSkip is non-zero, then pCell may not point to an invalid memory location
danielk1977a3ad5e72005-01-07 08:56:44 +00004612** (but pCell+nSkip is always valid).
drh14acc042001-06-10 19:56:58 +00004613*/
danielk1977e80463b2004-11-03 03:01:16 +00004614static int insertCell(
drh24cd67e2004-05-10 16:18:47 +00004615 MemPage *pPage, /* Page into which we are copying */
drh43605152004-05-29 21:46:49 +00004616 int i, /* New cell becomes the i-th cell of the page */
4617 u8 *pCell, /* Content of the new cell */
4618 int sz, /* Bytes of content in pCell */
danielk1977a3ad5e72005-01-07 08:56:44 +00004619 u8 *pTemp, /* Temp storage space for pCell, if needed */
4620 u8 nSkip /* Do not write the first nSkip bytes of the cell */
drh24cd67e2004-05-10 16:18:47 +00004621){
drh43605152004-05-29 21:46:49 +00004622 int idx; /* Where to write new cell content in data[] */
4623 int j; /* Loop counter */
4624 int top; /* First byte of content for any cell in data[] */
4625 int end; /* First byte past the last cell pointer in data[] */
4626 int ins; /* Index in data[] where new cell pointer is inserted */
4627 int hdr; /* Offset into data[] of the page header */
4628 int cellOffset; /* Address of first cell pointer in data[] */
4629 u8 *data; /* The content of the whole page */
4630 u8 *ptr; /* Used for moving information around in data[] */
4631
4632 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4633 assert( sz==cellSizePtr(pPage, pCell) );
drh1fee73e2007-08-29 04:00:57 +00004634 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +00004635 if( pPage->nOverflow || sz+2>pPage->nFree ){
drh24cd67e2004-05-10 16:18:47 +00004636 if( pTemp ){
danielk1977a3ad5e72005-01-07 08:56:44 +00004637 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004638 pCell = pTemp;
drh24cd67e2004-05-10 16:18:47 +00004639 }
drh43605152004-05-29 21:46:49 +00004640 j = pPage->nOverflow++;
4641 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4642 pPage->aOvfl[j].pCell = pCell;
4643 pPage->aOvfl[j].idx = i;
4644 pPage->nFree = 0;
drh14acc042001-06-10 19:56:58 +00004645 }else{
danielk19776e465eb2007-08-21 13:11:00 +00004646 int rc = sqlite3PagerWrite(pPage->pDbPage);
4647 if( rc!=SQLITE_OK ){
4648 return rc;
4649 }
4650 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +00004651 data = pPage->aData;
4652 hdr = pPage->hdrOffset;
4653 top = get2byte(&data[hdr+5]);
4654 cellOffset = pPage->cellOffset;
4655 end = cellOffset + 2*pPage->nCell + 2;
4656 ins = cellOffset + 2*i;
4657 if( end > top - sz ){
shane0af3f892008-11-12 04:55:34 +00004658 rc = defragmentPage(pPage);
4659 if( rc!=SQLITE_OK ){
4660 return rc;
4661 }
drh43605152004-05-29 21:46:49 +00004662 top = get2byte(&data[hdr+5]);
4663 assert( end + sz <= top );
4664 }
4665 idx = allocateSpace(pPage, sz);
4666 assert( idx>0 );
4667 assert( end <= get2byte(&data[hdr+5]) );
shane0af3f892008-11-12 04:55:34 +00004668 if (idx+sz > pPage->pBt->usableSize) {
shane34ac18d2008-11-11 22:18:20 +00004669 return SQLITE_CORRUPT_BKPT;
shane0af3f892008-11-12 04:55:34 +00004670 }
drh43605152004-05-29 21:46:49 +00004671 pPage->nCell++;
4672 pPage->nFree -= 2;
danielk1977a3ad5e72005-01-07 08:56:44 +00004673 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004674 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4675 ptr[0] = ptr[-2];
4676 ptr[1] = ptr[-1];
drhda200cc2004-05-09 11:51:38 +00004677 }
drh43605152004-05-29 21:46:49 +00004678 put2byte(&data[ins], idx);
4679 put2byte(&data[hdr+3], pPage->nCell);
danielk1977a19df672004-11-03 11:37:07 +00004680#ifndef SQLITE_OMIT_AUTOVACUUM
4681 if( pPage->pBt->autoVacuum ){
4682 /* The cell may contain a pointer to an overflow page. If so, write
4683 ** the entry for the overflow page into the pointer map.
4684 */
4685 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00004686 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh72365832007-03-06 15:53:44 +00004687 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
danielk1977a19df672004-11-03 11:37:07 +00004688 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4689 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
danielk19776e465eb2007-08-21 13:11:00 +00004690 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977a19df672004-11-03 11:37:07 +00004691 if( rc!=SQLITE_OK ) return rc;
4692 }
4693 }
4694#endif
drh14acc042001-06-10 19:56:58 +00004695 }
danielk1977e80463b2004-11-03 03:01:16 +00004696
danielk1977e80463b2004-11-03 03:01:16 +00004697 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00004698}
4699
4700/*
drhfa1a98a2004-05-14 19:08:17 +00004701** Add a list of cells to a page. The page should be initially empty.
4702** The cells are guaranteed to fit on the page.
4703*/
4704static void assemblePage(
4705 MemPage *pPage, /* The page to be assemblied */
4706 int nCell, /* The number of cells to add to this page */
drh43605152004-05-29 21:46:49 +00004707 u8 **apCell, /* Pointers to cell bodies */
drha9121e42008-02-19 14:59:35 +00004708 u16 *aSize /* Sizes of the cells */
drhfa1a98a2004-05-14 19:08:17 +00004709){
4710 int i; /* Loop counter */
4711 int totalSize; /* Total size of all cells */
4712 int hdr; /* Index of page header */
drh43605152004-05-29 21:46:49 +00004713 int cellptr; /* Address of next cell pointer */
4714 int cellbody; /* Address of next cell body */
drhfa1a98a2004-05-14 19:08:17 +00004715 u8 *data; /* Data for the page */
4716
drh43605152004-05-29 21:46:49 +00004717 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +00004718 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa1a98a2004-05-14 19:08:17 +00004719 totalSize = 0;
4720 for(i=0; i<nCell; i++){
4721 totalSize += aSize[i];
4722 }
drh43605152004-05-29 21:46:49 +00004723 assert( totalSize+2*nCell<=pPage->nFree );
drhfa1a98a2004-05-14 19:08:17 +00004724 assert( pPage->nCell==0 );
drh43605152004-05-29 21:46:49 +00004725 cellptr = pPage->cellOffset;
drhfa1a98a2004-05-14 19:08:17 +00004726 data = pPage->aData;
4727 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00004728 put2byte(&data[hdr+3], nCell);
drh09d0deb2005-08-02 17:13:09 +00004729 if( nCell ){
4730 cellbody = allocateSpace(pPage, totalSize);
4731 assert( cellbody>0 );
4732 assert( pPage->nFree >= 2*nCell );
4733 pPage->nFree -= 2*nCell;
4734 for(i=0; i<nCell; i++){
4735 put2byte(&data[cellptr], cellbody);
4736 memcpy(&data[cellbody], apCell[i], aSize[i]);
4737 cellptr += 2;
4738 cellbody += aSize[i];
4739 }
4740 assert( cellbody==pPage->pBt->usableSize );
drhfa1a98a2004-05-14 19:08:17 +00004741 }
4742 pPage->nCell = nCell;
drhfa1a98a2004-05-14 19:08:17 +00004743}
4744
/*
** Tuning parameters that control how many adjacent pages take part in
** a balancing operation.  NN is the number of neighbors on each side
** of the target page that participate.  NB is the total number of
** participating pages: the target page plus NN neighbors on either
** side.
**
** NN can be no smaller than 1.  Raising NN above 1 (to 2 or 3) gives
** a modest improvement in SELECT and DELETE performance in exchange for
** a larger degradation in INSERT and UPDATE performance.  The current
** setting of NN (1) appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (2*NN+1)      /* Total pages involved in the balance */
4759
drh43605152004-05-29 21:46:49 +00004760/* Forward reference */
danielk197771d5d2c2008-09-29 11:49:47 +00004761static int balance(BtCursor*, int);
danielk1977ac245ec2005-01-14 13:50:11 +00004762
drh615ae552005-01-16 23:21:00 +00004763#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004764/*
4765** This version of balance() handles the common special case where
4766** a new entry is being inserted on the extreme right-end of the
4767** tree, in other words, when the new entry will become the largest
4768** entry in the tree.
4769**
4770** Instead of trying balance the 3 right-most leaf pages, just add
4771** a new page to the right-hand side and put the one new entry in
4772** that page. This leaves the right side of the tree somewhat
4773** unbalanced. But odds are that we will be inserting new entries
4774** at the end soon afterwards so the nearly empty page will quickly
4775** fill up. On average.
4776**
4777** pPage is the leaf page which is the right-most page in the tree.
4778** pParent is its parent. pPage must have a single overflow entry
4779** which is also the right-most entry on the page.
4780*/
danielk197771d5d2c2008-09-29 11:49:47 +00004781static int balance_quick(BtCursor *pCur){
danielk1977ac245ec2005-01-14 13:50:11 +00004782 int rc;
danielk1977eaa06f62008-09-18 17:34:44 +00004783 MemPage *pNew = 0;
danielk1977ac245ec2005-01-14 13:50:11 +00004784 Pgno pgnoNew;
4785 u8 *pCell;
drha9121e42008-02-19 14:59:35 +00004786 u16 szCell;
danielk1977ac245ec2005-01-14 13:50:11 +00004787 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00004788 MemPage *pPage = pCur->apPage[pCur->iPage];
4789 MemPage *pParent = pCur->apPage[pCur->iPage-1];
danielk1977aef0bf62005-12-30 16:28:01 +00004790 BtShared *pBt = pPage->pBt;
danielk197779a40da2005-01-16 08:00:01 +00004791 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4792 int parentSize; /* Size of new divider cell */
4793 u8 parentCell[64]; /* Space for the new divider cell */
danielk1977ac245ec2005-01-14 13:50:11 +00004794
drh1fee73e2007-08-29 04:00:57 +00004795 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004796
danielk1977ac245ec2005-01-14 13:50:11 +00004797 /* Allocate a new page. Insert the overflow cell from pPage
4798 ** into it. Then remove the overflow cell from pPage.
4799 */
drh4f0c5872007-03-26 22:05:01 +00004800 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004801 if( rc==SQLITE_OK ){
4802 pCell = pPage->aOvfl[0].pCell;
4803 szCell = cellSizePtr(pPage, pCell);
4804 zeroPage(pNew, pPage->aData[0]);
4805 assemblePage(pNew, 1, &pCell, &szCell);
4806 pPage->nOverflow = 0;
4807
danielk1977eaa06f62008-09-18 17:34:44 +00004808 /* pPage is currently the right-child of pParent. Change this
4809 ** so that the right-child is the new page allocated above and
4810 ** pPage is the next-to-right child.
4811 **
4812 ** Ignore the return value of the call to fillInCell(). fillInCell()
4813 ** may only return other than SQLITE_OK if it is required to allocate
4814 ** one or more overflow pages. Since an internal table B-Tree cell
4815 ** may never spill over onto an overflow page (it is a maximum of
4816 ** 13 bytes in size), it is not neccessary to check the return code.
4817 **
4818 ** Similarly, the insertCell() function cannot fail if the page
4819 ** being inserted into is already writable and the cell does not
4820 ** contain an overflow pointer. So ignore this return code too.
4821 */
4822 assert( pPage->nCell>0 );
4823 pCell = findCell(pPage, pPage->nCell-1);
4824 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4825 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4826 assert( parentSize<64 );
4827 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4828 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4829 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4830 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4831
4832 /* If this is an auto-vacuum database, update the pointer map
4833 ** with entries for the new page, and any pointer from the
4834 ** cell on the page to an overflow page.
4835 */
4836 if( ISAUTOVACUUM ){
4837 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4838 if( rc==SQLITE_OK ){
4839 rc = ptrmapPutOvfl(pNew, 0);
4840 }
danielk1977ac11ee62005-01-15 12:45:51 +00004841 }
danielk1977e08a3c42008-09-18 18:17:03 +00004842
4843 /* Release the reference to the new page. */
4844 releasePage(pNew);
danielk1977ac11ee62005-01-15 12:45:51 +00004845 }
4846
danielk1977eaa06f62008-09-18 17:34:44 +00004847 /* At this point the pPage->nFree variable is not set correctly with
4848 ** respect to the content of the page (because it was set to 0 by
4849 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
4850 ** correct.
4851 **
4852 ** This has to be done even if an error will be returned. Normally, if
4853 ** an error occurs during tree balancing, the contents of MemPage are
4854 ** not important, as they will be recalculated when the page is rolled
4855 ** back. But here, in balance_quick(), it is possible that pPage has
4856 ** not yet been marked dirty or written into the journal file. Therefore
4857 ** it will not be rolled back and so it is important to make sure that
4858 ** the page data and contents of MemPage are consistent.
4859 */
4860 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004861 sqlite3BtreeInitPage(pPage);
danielk1977eaa06f62008-09-18 17:34:44 +00004862
danielk1977e08a3c42008-09-18 18:17:03 +00004863 /* If everything else succeeded, balance the parent page, in
4864 ** case the divider cell inserted caused it to become overfull.
danielk197779a40da2005-01-16 08:00:01 +00004865 */
danielk1977eaa06f62008-09-18 17:34:44 +00004866 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00004867 releasePage(pPage);
4868 pCur->iPage--;
4869 rc = balance(pCur, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004870 }
4871 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00004872}
drh615ae552005-01-16 23:21:00 +00004873#endif /* SQLITE_OMIT_QUICKBALANCE */
drh43605152004-05-29 21:46:49 +00004874
drhc3b70572003-01-04 19:44:07 +00004875/*
drhab01f612004-05-22 02:55:23 +00004876** This routine redistributes Cells on pPage and up to NN*2 siblings
drh8b2f49b2001-06-08 00:21:52 +00004877** of pPage so that all pages have about the same amount of free space.
drh0c6cc4e2004-06-15 02:13:26 +00004878** Usually NN siblings on either side of pPage is used in the balancing,
4879** though more siblings might come from one side if pPage is the first
drhab01f612004-05-22 02:55:23 +00004880** or last child of its parent. If pPage has fewer than 2*NN siblings
drh8b2f49b2001-06-08 00:21:52 +00004881** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00004882** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00004883**
drh0c6cc4e2004-06-15 02:13:26 +00004884** The number of siblings of pPage might be increased or decreased by one or
4885** two in an effort to keep pages nearly full but not over full. The root page
drhab01f612004-05-22 02:55:23 +00004886** is special and is allowed to be nearly empty. If pPage is
drh8c42ca92001-06-22 19:15:00 +00004887** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00004888** or decreased by one, as necessary, to keep the root page from being
drhab01f612004-05-22 02:55:23 +00004889** overfull or completely empty.
drh14acc042001-06-10 19:56:58 +00004890**
drh8b2f49b2001-06-08 00:21:52 +00004891** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00004892** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00004893** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00004894** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00004895**
drh8c42ca92001-06-22 19:15:00 +00004896** In the course of balancing the siblings of pPage, the parent of pPage
4897** might become overfull or underfull. If that happens, then this routine
4898** is called recursively on the parent.
4899**
drh5e00f6c2001-09-13 13:46:56 +00004900** If this routine fails for any reason, it might leave the database
4901** in a corrupted state. So if this routine fails, the database should
4902** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00004903*/
danielk197771d5d2c2008-09-29 11:49:47 +00004904static int balance_nonroot(BtCursor *pCur){
4905 MemPage *pPage; /* The over or underfull page to balance */
drh8b2f49b2001-06-08 00:21:52 +00004906 MemPage *pParent; /* The parent of pPage */
drh16a9b832007-05-05 18:39:25 +00004907 BtShared *pBt; /* The whole database */
danielk1977634f2982005-03-28 08:44:07 +00004908 int nCell = 0; /* Number of cells in apCell[] */
4909 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
drh8b2f49b2001-06-08 00:21:52 +00004910 int nOld; /* Number of pages in apOld[] */
4911 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00004912 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00004913 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00004914 int idx; /* Index of pPage in pParent->aCell[] */
4915 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00004916 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00004917 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00004918 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00004919 int usableSpace; /* Bytes in pPage beyond the header */
4920 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00004921 int subtotal; /* Subtotal of bytes in cells on one page */
drhe5ae5732008-06-15 02:51:47 +00004922 int iSpace1 = 0; /* First unused byte of aSpace1[] */
4923 int iSpace2 = 0; /* First unused byte of aSpace2[] */
drhfacf0302008-06-17 15:12:00 +00004924 int szScratch; /* Size of scratch memory requested */
drhc3b70572003-01-04 19:44:07 +00004925 MemPage *apOld[NB]; /* pPage and up to two siblings */
4926 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00004927 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drha2fce642004-06-05 00:01:44 +00004928 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4929 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
drh4b70f112004-05-02 21:12:19 +00004930 u8 *apDiv[NB]; /* Divider cells in pParent */
drha2fce642004-06-05 00:01:44 +00004931 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4932 int szNew[NB+2]; /* Combined size of cells place on i-th page */
danielk197750f059b2005-03-29 02:54:03 +00004933 u8 **apCell = 0; /* All cells begin balanced */
drha9121e42008-02-19 14:59:35 +00004934 u16 *szCell; /* Local size of all cells in apCell[] */
drhe5ae5732008-06-15 02:51:47 +00004935 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4936 u8 *aSpace1; /* Space for copies of dividers cells before balance */
4937 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
danielk1977ac11ee62005-01-15 12:45:51 +00004938 u8 *aFrom = 0;
drh8b2f49b2001-06-08 00:21:52 +00004939
danielk197771d5d2c2008-09-29 11:49:47 +00004940 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00004941 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf94a1732008-09-30 17:18:17 +00004942 VVA_ONLY( pCur->pagesShuffled = 1 );
drhd677b3d2007-08-20 22:48:41 +00004943
drh14acc042001-06-10 19:56:58 +00004944 /*
drh43605152004-05-29 21:46:49 +00004945 ** Find the parent page.
drh8b2f49b2001-06-08 00:21:52 +00004946 */
danielk197771d5d2c2008-09-29 11:49:47 +00004947 assert( pCur->iPage>0 );
4948 assert( pPage->isInit );
danielk19776e465eb2007-08-21 13:11:00 +00004949 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
drh4b70f112004-05-02 21:12:19 +00004950 pBt = pPage->pBt;
danielk197771d5d2c2008-09-29 11:49:47 +00004951 pParent = pCur->apPage[pCur->iPage-1];
drh43605152004-05-29 21:46:49 +00004952 assert( pParent );
danielk19773b8a05f2007-03-19 17:44:26 +00004953 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
danielk197707cb5602006-01-20 10:55:05 +00004954 return rc;
4955 }
danielk1977474b7cc2008-07-09 11:49:46 +00004956
drh43605152004-05-29 21:46:49 +00004957 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
drh2e38c322004-09-03 18:38:44 +00004958
drh615ae552005-01-16 23:21:00 +00004959#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004960 /*
4961 ** A special case: If a new entry has just been inserted into a
4962 ** table (that is, a btree with integer keys and all data at the leaves)
drh09d0deb2005-08-02 17:13:09 +00004963 ** and the new entry is the right-most entry in the tree (it has the
drhf222e712005-01-14 22:55:49 +00004964 ** largest key) then use the special balance_quick() routine for
4965 ** balancing. balance_quick() is much faster and results in a tighter
4966 ** packing of data in the common case.
4967 */
danielk1977ac245ec2005-01-14 13:50:11 +00004968 if( pPage->leaf &&
4969 pPage->intKey &&
danielk1977ac245ec2005-01-14 13:50:11 +00004970 pPage->nOverflow==1 &&
4971 pPage->aOvfl[0].idx==pPage->nCell &&
danielk197771d5d2c2008-09-29 11:49:47 +00004972 pParent->pgno!=1 &&
danielk1977ac245ec2005-01-14 13:50:11 +00004973 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4974 ){
drh44845222008-07-17 18:39:57 +00004975 assert( pPage->intKey );
danielk1977ac11ee62005-01-15 12:45:51 +00004976 /*
4977 ** TODO: Check the siblings to the left of pPage. It may be that
4978 ** they are not full and no new page is required.
4979 */
danielk197771d5d2c2008-09-29 11:49:47 +00004980 return balance_quick(pCur);
danielk1977ac245ec2005-01-14 13:50:11 +00004981 }
4982#endif
4983
danielk19776e465eb2007-08-21 13:11:00 +00004984 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4985 return rc;
4986 }
4987
drh2e38c322004-09-03 18:38:44 +00004988 /*
drh4b70f112004-05-02 21:12:19 +00004989 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00004990 ** to pPage. The "idx" variable is the index of that cell. If pPage
4991 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00004992 */
danielk1977bf93c562008-09-29 15:53:25 +00004993 idx = pCur->aiIdx[pCur->iPage-1];
4994 assertParentIndex(pParent, idx, pPage->pgno);
drh8b2f49b2001-06-08 00:21:52 +00004995
4996 /*
drh14acc042001-06-10 19:56:58 +00004997 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00004998 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00004999 */
drh14acc042001-06-10 19:56:58 +00005000 nOld = nNew = 0;
drh14acc042001-06-10 19:56:58 +00005001
5002 /*
drh4b70f112004-05-02 21:12:19 +00005003 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00005004 ** the siblings. An attempt is made to find NN siblings on either
  ** side of pPage.  More siblings are taken from one side, however, if
  ** there are fewer than NN siblings on the other side.  If pParent
5007 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00005008 */
drhc3b70572003-01-04 19:44:07 +00005009 nxDiv = idx - NN;
5010 if( nxDiv + NB > pParent->nCell ){
5011 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00005012 }
drhc3b70572003-01-04 19:44:07 +00005013 if( nxDiv<0 ){
5014 nxDiv = 0;
5015 }
drh8b2f49b2001-06-08 00:21:52 +00005016 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00005017 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00005018 if( k<pParent->nCell ){
danielk19771cc5ed82007-05-16 17:28:43 +00005019 apDiv[i] = findCell(pParent, k);
drh8b2f49b2001-06-08 00:21:52 +00005020 nDiv++;
drha34b6762004-05-07 13:30:42 +00005021 assert( !pParent->leaf );
drh43605152004-05-29 21:46:49 +00005022 pgnoOld[i] = get4byte(apDiv[i]);
drh14acc042001-06-10 19:56:58 +00005023 }else if( k==pParent->nCell ){
drh43605152004-05-29 21:46:49 +00005024 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
drh14acc042001-06-10 19:56:58 +00005025 }else{
5026 break;
drh8b2f49b2001-06-08 00:21:52 +00005027 }
danielk197771d5d2c2008-09-29 11:49:47 +00005028 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
drh6019e162001-07-02 17:51:45 +00005029 if( rc ) goto balance_cleanup;
danielk197771d5d2c2008-09-29 11:49:47 +00005030 /* apOld[i]->idxParent = k; */
drh91025292004-05-03 19:49:32 +00005031 apCopy[i] = 0;
5032 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00005033 nOld++;
danielk1977634f2982005-03-28 08:44:07 +00005034 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
drh8b2f49b2001-06-08 00:21:52 +00005035 }
5036
drha9121e42008-02-19 14:59:35 +00005037 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
drh8d97f1f2005-05-05 18:14:13 +00005038 ** alignment */
drha9121e42008-02-19 14:59:35 +00005039 nMaxCells = (nMaxCells + 3)&~3;
drh8d97f1f2005-05-05 18:14:13 +00005040
drh8b2f49b2001-06-08 00:21:52 +00005041 /*
danielk1977634f2982005-03-28 08:44:07 +00005042 ** Allocate space for memory structures
5043 */
drhfacf0302008-06-17 15:12:00 +00005044 szScratch =
drha9121e42008-02-19 14:59:35 +00005045 nMaxCells*sizeof(u8*) /* apCell */
5046 + nMaxCells*sizeof(u16) /* szCell */
5047 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
drhe5ae5732008-06-15 02:51:47 +00005048 + pBt->pageSize /* aSpace1 */
drhfacf0302008-06-17 15:12:00 +00005049 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5050 apCell = sqlite3ScratchMalloc( szScratch );
danielk1977634f2982005-03-28 08:44:07 +00005051 if( apCell==0 ){
5052 rc = SQLITE_NOMEM;
5053 goto balance_cleanup;
5054 }
drha9121e42008-02-19 14:59:35 +00005055 szCell = (u16*)&apCell[nMaxCells];
danielk1977634f2982005-03-28 08:44:07 +00005056 aCopy[0] = (u8*)&szCell[nMaxCells];
drhc96d8532005-05-03 12:30:33 +00005057 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005058 for(i=1; i<NB; i++){
drhc96d8532005-05-03 12:30:33 +00005059 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5060 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005061 }
drhe5ae5732008-06-15 02:51:47 +00005062 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5063 assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
danielk197785d90ca2008-07-19 14:25:15 +00005064 if( ISAUTOVACUUM ){
drhe5ae5732008-06-15 02:51:47 +00005065 aFrom = &aSpace1[pBt->pageSize];
danielk1977634f2982005-03-28 08:44:07 +00005066 }
drhfacf0302008-06-17 15:12:00 +00005067 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
drhe5ae5732008-06-15 02:51:47 +00005068 if( aSpace2==0 ){
5069 rc = SQLITE_NOMEM;
5070 goto balance_cleanup;
5071 }
danielk1977634f2982005-03-28 08:44:07 +00005072
5073 /*
  ** Make copies of the content of pPage and its siblings into apCopy[].
  ** The rest of this function will use data from the copies rather
  ** than the original pages since the original pages will be in the
5077 ** process of being overwritten.
5078 */
5079 for(i=0; i<nOld; i++){
drhbf4bca52007-09-06 22:19:14 +00005080 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5081 memcpy(p, apOld[i], sizeof(MemPage));
5082 p->aData = (void*)&p[1];
5083 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
drh14acc042001-06-10 19:56:58 +00005084 }
5085
5086 /*
5087 ** Load pointers to all cells on sibling pages and the divider cells
5088 ** into the local apCell[] array. Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider Cells
drhb6f41482004-05-14 01:58:11 +00005090 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00005091 **
5092 ** If the siblings are on leaf pages, then the child pointers of the
5093 ** divider cells are stripped from the cells before they are copied
drhe5ae5732008-06-15 02:51:47 +00005094 ** into aSpace1[]. In this way, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00005095 ** child pointers. If siblings are not leaves, then all cell in
5096 ** apCell[] include child pointers. Either way, all cells in apCell[]
5097 ** are alike.
drh96f5b762004-05-16 16:24:36 +00005098 **
5099 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5100 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
drh8b2f49b2001-06-08 00:21:52 +00005101 */
5102 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00005103 leafCorrection = pPage->leaf*4;
drh44845222008-07-17 18:39:57 +00005104 leafData = pPage->hasData;
drh8b2f49b2001-06-08 00:21:52 +00005105 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00005106 MemPage *pOld = apCopy[i];
drh43605152004-05-29 21:46:49 +00005107 int limit = pOld->nCell+pOld->nOverflow;
5108 for(j=0; j<limit; j++){
danielk1977634f2982005-03-28 08:44:07 +00005109 assert( nCell<nMaxCells );
drh43605152004-05-29 21:46:49 +00005110 apCell[nCell] = findOverflowCell(pOld, j);
5111 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
danielk197785d90ca2008-07-19 14:25:15 +00005112 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005113 int a;
5114 aFrom[nCell] = i;
5115 for(a=0; a<pOld->nOverflow; a++){
5116 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5117 aFrom[nCell] = 0xFF;
5118 break;
5119 }
5120 }
5121 }
drh14acc042001-06-10 19:56:58 +00005122 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00005123 }
5124 if( i<nOld-1 ){
drha9121e42008-02-19 14:59:35 +00005125 u16 sz = cellSizePtr(pParent, apDiv[i]);
drh8b18dd42004-05-12 19:18:15 +00005126 if( leafData ){
drh96f5b762004-05-16 16:24:36 +00005127 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5128 ** are duplicates of keys on the child pages. We need to remove
5129 ** the divider cells from pParent, but the dividers cells are not
5130 ** added to apCell[] because they are duplicates of child cells.
5131 */
drh8b18dd42004-05-12 19:18:15 +00005132 dropCell(pParent, nxDiv, sz);
drh4b70f112004-05-02 21:12:19 +00005133 }else{
drhb6f41482004-05-14 01:58:11 +00005134 u8 *pTemp;
danielk1977634f2982005-03-28 08:44:07 +00005135 assert( nCell<nMaxCells );
drhb6f41482004-05-14 01:58:11 +00005136 szCell[nCell] = sz;
drhe5ae5732008-06-15 02:51:47 +00005137 pTemp = &aSpace1[iSpace1];
5138 iSpace1 += sz;
5139 assert( sz<=pBt->pageSize/4 );
5140 assert( iSpace1<=pBt->pageSize );
drhb6f41482004-05-14 01:58:11 +00005141 memcpy(pTemp, apDiv[i], sz);
5142 apCell[nCell] = pTemp+leafCorrection;
danielk197785d90ca2008-07-19 14:25:15 +00005143 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005144 aFrom[nCell] = 0xFF;
5145 }
drhb6f41482004-05-14 01:58:11 +00005146 dropCell(pParent, nxDiv, sz);
drh8b18dd42004-05-12 19:18:15 +00005147 szCell[nCell] -= leafCorrection;
drh43605152004-05-29 21:46:49 +00005148 assert( get4byte(pTemp)==pgnoOld[i] );
drh8b18dd42004-05-12 19:18:15 +00005149 if( !pOld->leaf ){
5150 assert( leafCorrection==0 );
5151 /* The right pointer of the child page pOld becomes the left
5152 ** pointer of the divider cell */
drh43605152004-05-29 21:46:49 +00005153 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
drh8b18dd42004-05-12 19:18:15 +00005154 }else{
5155 assert( leafCorrection==4 );
danielk197739c96042007-05-12 10:41:47 +00005156 if( szCell[nCell]<4 ){
5157 /* Do not allow any cells smaller than 4 bytes. */
5158 szCell[nCell] = 4;
5159 }
drh8b18dd42004-05-12 19:18:15 +00005160 }
5161 nCell++;
drh4b70f112004-05-02 21:12:19 +00005162 }
drh8b2f49b2001-06-08 00:21:52 +00005163 }
5164 }
5165
5166 /*
drh6019e162001-07-02 17:51:45 +00005167 ** Figure out the number of pages needed to hold all nCell cells.
5168 ** Store this number in "k". Also compute szNew[] which is the total
5169 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00005170 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00005171 ** cntNew[k] should equal nCell.
5172 **
drh96f5b762004-05-16 16:24:36 +00005173 ** Values computed by this block:
5174 **
5175 ** k: The total number of sibling pages
5176 ** szNew[i]: Space used on the i-th sibling page.
5177 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5178 ** the right of the i-th sibling page.
5179 ** usableSpace: Number of bytes of space available on each sibling.
5180 **
drh8b2f49b2001-06-08 00:21:52 +00005181 */
drh43605152004-05-29 21:46:49 +00005182 usableSpace = pBt->usableSize - 12 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00005183 for(subtotal=k=i=0; i<nCell; i++){
danielk1977634f2982005-03-28 08:44:07 +00005184 assert( i<nMaxCells );
drh43605152004-05-29 21:46:49 +00005185 subtotal += szCell[i] + 2;
drh4b70f112004-05-02 21:12:19 +00005186 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00005187 szNew[k] = subtotal - szCell[i];
5188 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00005189 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00005190 subtotal = 0;
5191 k++;
5192 }
5193 }
5194 szNew[k] = subtotal;
5195 cntNew[k] = nCell;
5196 k++;
drh96f5b762004-05-16 16:24:36 +00005197
5198 /*
5199 ** The packing computed by the previous block is biased toward the siblings
5200 ** on the left side. The left siblings are always nearly full, while the
5201 ** right-most sibling might be nearly empty. This block of code attempts
5202 ** to adjust the packing of siblings to get a better balance.
5203 **
5204 ** This adjustment is more than an optimization. The packing above might
5205 ** be so out of balance as to be illegal. For example, the right-most
5206 ** sibling might be completely empty. This adjustment is not optional.
5207 */
drh6019e162001-07-02 17:51:45 +00005208 for(i=k-1; i>0; i--){
drh96f5b762004-05-16 16:24:36 +00005209 int szRight = szNew[i]; /* Size of sibling on the right */
5210 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5211 int r; /* Index of right-most cell in left sibling */
5212 int d; /* Index of first cell to the left of right sibling */
5213
5214 r = cntNew[i-1] - 1;
5215 d = r + 1 - leafData;
danielk1977634f2982005-03-28 08:44:07 +00005216 assert( d<nMaxCells );
5217 assert( r<nMaxCells );
drh43605152004-05-29 21:46:49 +00005218 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5219 szRight += szCell[d] + 2;
5220 szLeft -= szCell[r] + 2;
drh6019e162001-07-02 17:51:45 +00005221 cntNew[i-1]--;
drh96f5b762004-05-16 16:24:36 +00005222 r = cntNew[i-1] - 1;
5223 d = r + 1 - leafData;
drh6019e162001-07-02 17:51:45 +00005224 }
drh96f5b762004-05-16 16:24:36 +00005225 szNew[i] = szRight;
5226 szNew[i-1] = szLeft;
drh6019e162001-07-02 17:51:45 +00005227 }
drh09d0deb2005-08-02 17:13:09 +00005228
5229 /* Either we found one or more cells (cntNew[0]>0) or we are
5230 ** a virtual root page. A virtual root page is when the real root
5231 ** page is page 1 and we are the only child of that page.
5232 */
5233 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
drh8b2f49b2001-06-08 00:21:52 +00005234
5235 /*
drh6b308672002-07-08 02:16:37 +00005236 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00005237 */
drh4b70f112004-05-02 21:12:19 +00005238 assert( pPage->pgno>1 );
5239 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00005240 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00005241 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00005242 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00005243 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00005244 pgnoNew[i] = pgnoOld[i];
5245 apOld[i] = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00005246 rc = sqlite3PagerWrite(pNew->pDbPage);
drhf5345442007-04-09 12:45:02 +00005247 nNew++;
danielk197728129562005-01-11 10:25:06 +00005248 if( rc ) goto balance_cleanup;
drh6b308672002-07-08 02:16:37 +00005249 }else{
drh7aa8f852006-03-28 00:24:44 +00005250 assert( i>0 );
drh4f0c5872007-03-26 22:05:01 +00005251 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
drh6b308672002-07-08 02:16:37 +00005252 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00005253 apNew[i] = pNew;
drhf5345442007-04-09 12:45:02 +00005254 nNew++;
drh6b308672002-07-08 02:16:37 +00005255 }
drh8b2f49b2001-06-08 00:21:52 +00005256 }
5257
danielk1977299b1872004-11-22 10:02:10 +00005258 /* Free any old pages that were not reused as new pages.
5259 */
5260 while( i<nOld ){
5261 rc = freePage(apOld[i]);
5262 if( rc ) goto balance_cleanup;
5263 releasePage(apOld[i]);
5264 apOld[i] = 0;
5265 i++;
5266 }
5267
drh8b2f49b2001-06-08 00:21:52 +00005268 /*
drhf9ffac92002-03-02 19:00:31 +00005269 ** Put the new pages in ascending order. This helps to
5270 ** keep entries in the disk file in order so that a scan
5271 ** of the table is a linear scan through the file. That
5272 ** in turn helps the operating system to deliver pages
5273 ** from the disk more rapidly.
5274 **
5275 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00005276 ** n is never more than NB (a small constant), that should
5277 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00005278 **
drhc3b70572003-01-04 19:44:07 +00005279 ** When NB==3, this one optimization makes the database
5280 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00005281 */
5282 for(i=0; i<k-1; i++){
5283 int minV = pgnoNew[i];
5284 int minI = i;
5285 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00005286 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00005287 minI = j;
5288 minV = pgnoNew[j];
5289 }
5290 }
5291 if( minI>i ){
5292 int t;
5293 MemPage *pT;
5294 t = pgnoNew[i];
5295 pT = apNew[i];
5296 pgnoNew[i] = pgnoNew[minI];
5297 apNew[i] = apNew[minI];
5298 pgnoNew[minI] = t;
5299 apNew[minI] = pT;
5300 }
5301 }
drha2fce642004-06-05 00:01:44 +00005302 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
drh24cd67e2004-05-10 16:18:47 +00005303 pgnoOld[0],
5304 nOld>=2 ? pgnoOld[1] : 0,
5305 nOld>=3 ? pgnoOld[2] : 0,
drh10c0fa62004-05-18 12:50:17 +00005306 pgnoNew[0], szNew[0],
5307 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5308 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
drha2fce642004-06-05 00:01:44 +00005309 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5310 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
drh24cd67e2004-05-10 16:18:47 +00005311
drhf9ffac92002-03-02 19:00:31 +00005312 /*
drh14acc042001-06-10 19:56:58 +00005313 ** Evenly distribute the data in apCell[] across the new pages.
5314 ** Insert divider cells into pParent as necessary.
5315 */
5316 j = 0;
5317 for(i=0; i<nNew; i++){
danielk1977ac11ee62005-01-15 12:45:51 +00005318 /* Assemble the new sibling page. */
drh14acc042001-06-10 19:56:58 +00005319 MemPage *pNew = apNew[i];
drh19642e52005-03-29 13:17:45 +00005320 assert( j<nMaxCells );
drh4b70f112004-05-02 21:12:19 +00005321 assert( pNew->pgno==pgnoNew[i] );
drh10131482008-07-11 03:34:09 +00005322 zeroPage(pNew, pageFlags);
drhfa1a98a2004-05-14 19:08:17 +00005323 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
drh09d0deb2005-08-02 17:13:09 +00005324 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
drh43605152004-05-29 21:46:49 +00005325 assert( pNew->nOverflow==0 );
danielk1977ac11ee62005-01-15 12:45:51 +00005326
danielk1977ac11ee62005-01-15 12:45:51 +00005327 /* If this is an auto-vacuum database, update the pointer map entries
5328 ** that point to the siblings that were rearranged. These can be: left
5329 ** children of cells, the right-child of the page, or overflow pages
5330 ** pointed to by cells.
5331 */
danielk197785d90ca2008-07-19 14:25:15 +00005332 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005333 for(k=j; k<cntNew[i]; k++){
danielk1977634f2982005-03-28 08:44:07 +00005334 assert( k<nMaxCells );
danielk1977ac11ee62005-01-15 12:45:51 +00005335 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
danielk197779a40da2005-01-16 08:00:01 +00005336 rc = ptrmapPutOvfl(pNew, k-j);
danielk197787c52b52008-07-19 11:49:07 +00005337 if( rc==SQLITE_OK && leafCorrection==0 ){
5338 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5339 }
danielk197779a40da2005-01-16 08:00:01 +00005340 if( rc!=SQLITE_OK ){
5341 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005342 }
5343 }
5344 }
5345 }
danielk1977ac11ee62005-01-15 12:45:51 +00005346
5347 j = cntNew[i];
5348
5349 /* If the sibling page assembled above was not the right-most sibling,
5350 ** insert a divider cell into the parent page.
5351 */
drh14acc042001-06-10 19:56:58 +00005352 if( i<nNew-1 && j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00005353 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00005354 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00005355 int sz;
danielk1977634f2982005-03-28 08:44:07 +00005356
5357 assert( j<nMaxCells );
drh8b18dd42004-05-12 19:18:15 +00005358 pCell = apCell[j];
5359 sz = szCell[j] + leafCorrection;
drhe5ae5732008-06-15 02:51:47 +00005360 pTemp = &aSpace2[iSpace2];
drh4b70f112004-05-02 21:12:19 +00005361 if( !pNew->leaf ){
drh43605152004-05-29 21:46:49 +00005362 memcpy(&pNew->aData[8], pCell, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005363 if( ISAUTOVACUUM
danielk197787c52b52008-07-19 11:49:07 +00005364 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5365 ){
5366 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5367 if( rc!=SQLITE_OK ){
5368 goto balance_cleanup;
5369 }
5370 }
drh8b18dd42004-05-12 19:18:15 +00005371 }else if( leafData ){
drhfd131da2007-08-07 17:13:03 +00005372 /* If the tree is a leaf-data tree, and the siblings are leaves,
danielk1977ac11ee62005-01-15 12:45:51 +00005373 ** then there is no divider cell in apCell[]. Instead, the divider
5374 ** cell consists of the integer key for the right-most cell of
5375 ** the sibling-page assembled above only.
5376 */
drh6f11bef2004-05-13 01:12:56 +00005377 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00005378 j--;
drh16a9b832007-05-05 18:39:25 +00005379 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
drhe5ae5732008-06-15 02:51:47 +00005380 pCell = pTemp;
drhb026e052007-05-02 01:34:31 +00005381 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
drh8b18dd42004-05-12 19:18:15 +00005382 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00005383 }else{
5384 pCell -= 4;
danielk19774aeff622007-05-12 09:30:47 +00005385 /* Obscure case for non-leaf-data trees: If the cell at pCell was
drh85b623f2007-12-13 21:54:09 +00005386 ** previously stored on a leaf node, and its reported size was 4
danielk19774aeff622007-05-12 09:30:47 +00005387 ** bytes, then it may actually be smaller than this
5388 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
drh85b623f2007-12-13 21:54:09 +00005389 ** any cell). But it is important to pass the correct size to
danielk19774aeff622007-05-12 09:30:47 +00005390 ** insertCell(), so reparse the cell now.
5391 **
5392 ** Note that this can never happen in an SQLite data file, as all
5393 ** cells are at least 4 bytes. It only happens in b-trees used
5394 ** to evaluate "IN (SELECT ...)" and similar clauses.
5395 */
5396 if( szCell[j]==4 ){
5397 assert(leafCorrection==4);
5398 sz = cellSizePtr(pParent, pCell);
5399 }
drh4b70f112004-05-02 21:12:19 +00005400 }
drhe5ae5732008-06-15 02:51:47 +00005401 iSpace2 += sz;
5402 assert( sz<=pBt->pageSize/4 );
5403 assert( iSpace2<=pBt->pageSize );
danielk1977a3ad5e72005-01-07 08:56:44 +00005404 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
danielk1977e80463b2004-11-03 03:01:16 +00005405 if( rc!=SQLITE_OK ) goto balance_cleanup;
drh43605152004-05-29 21:46:49 +00005406 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
danielk197785d90ca2008-07-19 14:25:15 +00005407
danielk1977ac11ee62005-01-15 12:45:51 +00005408 /* If this is an auto-vacuum database, and not a leaf-data tree,
5409 ** then update the pointer map with an entry for the overflow page
5410 ** that the cell just inserted points to (if any).
5411 */
danielk197785d90ca2008-07-19 14:25:15 +00005412 if( ISAUTOVACUUM && !leafData ){
danielk197779a40da2005-01-16 08:00:01 +00005413 rc = ptrmapPutOvfl(pParent, nxDiv);
5414 if( rc!=SQLITE_OK ){
5415 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005416 }
5417 }
drh14acc042001-06-10 19:56:58 +00005418 j++;
5419 nxDiv++;
5420 }
danielk197787c52b52008-07-19 11:49:07 +00005421
danielk197787c52b52008-07-19 11:49:07 +00005422 /* Set the pointer-map entry for the new sibling page. */
danielk197785d90ca2008-07-19 14:25:15 +00005423 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005424 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5425 if( rc!=SQLITE_OK ){
5426 goto balance_cleanup;
5427 }
5428 }
drh14acc042001-06-10 19:56:58 +00005429 }
drh6019e162001-07-02 17:51:45 +00005430 assert( j==nCell );
drh7aa8f852006-03-28 00:24:44 +00005431 assert( nOld>0 );
5432 assert( nNew>0 );
drh4b70f112004-05-02 21:12:19 +00005433 if( (pageFlags & PTF_LEAF)==0 ){
danielk197787c52b52008-07-19 11:49:07 +00005434 u8 *zChild = &apCopy[nOld-1]->aData[8];
5435 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005436 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005437 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5438 if( rc!=SQLITE_OK ){
5439 goto balance_cleanup;
5440 }
5441 }
drh14acc042001-06-10 19:56:58 +00005442 }
drh43605152004-05-29 21:46:49 +00005443 if( nxDiv==pParent->nCell+pParent->nOverflow ){
drh4b70f112004-05-02 21:12:19 +00005444 /* Right-most sibling is the right-most child of pParent */
drh43605152004-05-29 21:46:49 +00005445 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
drh4b70f112004-05-02 21:12:19 +00005446 }else{
5447 /* Right-most sibling is the left child of the first entry in pParent
5448 ** past the right-most divider entry */
drh43605152004-05-29 21:46:49 +00005449 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00005450 }
5451
5452 /*
drh3a4c1412004-05-09 20:40:11 +00005453 ** Balance the parent page. Note that the current page (pPage) might
danielk1977ac11ee62005-01-15 12:45:51 +00005454 ** have been added to the freelist so it might no longer be initialized.
drh3a4c1412004-05-09 20:40:11 +00005455 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00005456 */
danielk197771d5d2c2008-09-29 11:49:47 +00005457 assert( pParent->isInit );
drhfacf0302008-06-17 15:12:00 +00005458 sqlite3ScratchFree(apCell);
drhe5ae5732008-06-15 02:51:47 +00005459 apCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005460 releasePage(pPage);
5461 pCur->iPage--;
5462 rc = balance(pCur, 0);
drhda200cc2004-05-09 11:51:38 +00005463
drh8b2f49b2001-06-08 00:21:52 +00005464 /*
drh14acc042001-06-10 19:56:58 +00005465 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00005466 */
drh14acc042001-06-10 19:56:58 +00005467balance_cleanup:
drhfacf0302008-06-17 15:12:00 +00005468 sqlite3PageFree(aSpace2);
5469 sqlite3ScratchFree(apCell);
drh8b2f49b2001-06-08 00:21:52 +00005470 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00005471 releasePage(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00005472 }
drh14acc042001-06-10 19:56:58 +00005473 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00005474 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00005475 }
danielk1977eaa06f62008-09-18 17:34:44 +00005476
danielk197771d5d2c2008-09-29 11:49:47 +00005477 /* releasePage(pParent); */
drh3a4c1412004-05-09 20:40:11 +00005478 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5479 pPage->pgno, nOld, nNew, nCell));
danielk1977eaa06f62008-09-18 17:34:44 +00005480
drh8b2f49b2001-06-08 00:21:52 +00005481 return rc;
5482}
5483
5484/*
drh43605152004-05-29 21:46:49 +00005485** This routine is called for the root page of a btree when the root
5486** page contains no cells. This is an opportunity to make the tree
5487** shallower by one level.
5488*/
danielk197771d5d2c2008-09-29 11:49:47 +00005489static int balance_shallower(BtCursor *pCur){
5490 MemPage *pPage; /* Root page of B-Tree */
drh43605152004-05-29 21:46:49 +00005491 MemPage *pChild; /* The only child page of pPage */
5492 Pgno pgnoChild; /* Page number for pChild */
drh2e38c322004-09-03 18:38:44 +00005493 int rc = SQLITE_OK; /* Return code from subprocedures */
danielk1977aef0bf62005-12-30 16:28:01 +00005494 BtShared *pBt; /* The main BTree structure */
drh2e38c322004-09-03 18:38:44 +00005495 int mxCellPerPage; /* Maximum number of cells per page */
5496 u8 **apCell; /* All cells from pages being balanced */
drha9121e42008-02-19 14:59:35 +00005497 u16 *szCell; /* Local size of all cells */
drh43605152004-05-29 21:46:49 +00005498
danielk197771d5d2c2008-09-29 11:49:47 +00005499 assert( pCur->iPage==0 );
5500 pPage = pCur->apPage[0];
5501
drh43605152004-05-29 21:46:49 +00005502 assert( pPage->nCell==0 );
drh1fee73e2007-08-29 04:00:57 +00005503 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh2e38c322004-09-03 18:38:44 +00005504 pBt = pPage->pBt;
5505 mxCellPerPage = MX_CELL(pBt);
drhe5ae5732008-06-15 02:51:47 +00005506 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
drh2e38c322004-09-03 18:38:44 +00005507 if( apCell==0 ) return SQLITE_NOMEM;
drha9121e42008-02-19 14:59:35 +00005508 szCell = (u16*)&apCell[mxCellPerPage];
drh43605152004-05-29 21:46:49 +00005509 if( pPage->leaf ){
5510 /* The table is completely empty */
5511 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5512 }else{
5513 /* The root page is empty but has one child. Transfer the
5514 ** information from that one child into the root page if it
5515 ** will fit. This reduces the depth of the tree by one.
5516 **
5517 ** If the root page is page 1, it has less space available than
5518 ** its child (due to the 100 byte header that occurs at the beginning
5519 ** of the database fle), so it might not be able to hold all of the
5520 ** information currently contained in the child. If this is the
5521 ** case, then do not do the transfer. Leave page 1 empty except
5522 ** for the right-pointer to the child page. The child page becomes
5523 ** the virtual root of the tree.
5524 */
drhf94a1732008-09-30 17:18:17 +00005525 VVA_ONLY( pCur->pagesShuffled = 1 );
drh43605152004-05-29 21:46:49 +00005526 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5527 assert( pgnoChild>0 );
danielk1977ad0132d2008-06-07 08:58:22 +00005528 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
drh16a9b832007-05-05 18:39:25 +00005529 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
drh2e38c322004-09-03 18:38:44 +00005530 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005531 if( pPage->pgno==1 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005532 rc = sqlite3BtreeInitPage(pChild);
drh2e38c322004-09-03 18:38:44 +00005533 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005534 assert( pChild->nOverflow==0 );
5535 if( pChild->nFree>=100 ){
5536 /* The child information will fit on the root page, so do the
5537 ** copy */
5538 int i;
5539 zeroPage(pPage, pChild->aData[0]);
5540 for(i=0; i<pChild->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00005541 apCell[i] = findCell(pChild,i);
drh43605152004-05-29 21:46:49 +00005542 szCell[i] = cellSizePtr(pChild, apCell[i]);
5543 }
5544 assemblePage(pPage, pChild->nCell, apCell, szCell);
danielk1977ae825582004-11-23 09:06:55 +00005545 /* Copy the right-pointer of the child to the parent. */
5546 put4byte(&pPage->aData[pPage->hdrOffset+8],
5547 get4byte(&pChild->aData[pChild->hdrOffset+8]));
drh43605152004-05-29 21:46:49 +00005548 freePage(pChild);
5549 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5550 }else{
5551 /* The child has more information that will fit on the root.
5552 ** The tree is already balanced. Do nothing. */
5553 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5554 }
5555 }else{
5556 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5557 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005558 rc = sqlite3BtreeInitPage(pPage);
drh43605152004-05-29 21:46:49 +00005559 assert( rc==SQLITE_OK );
5560 freePage(pChild);
5561 TRACE(("BALANCE: transfer child %d into root %d\n",
5562 pChild->pgno, pPage->pgno));
5563 }
danielk1977ac11ee62005-01-15 12:45:51 +00005564 assert( pPage->nOverflow==0 );
shane831c3292008-11-10 17:14:58 +00005565#ifndef SQLITE_OMIT_AUTOVACUUM
danielk197785d90ca2008-07-19 14:25:15 +00005566 if( ISAUTOVACUUM ){
danielk197700a696d2008-09-29 16:41:31 +00005567 rc = setChildPtrmaps(pPage);
danielk1977ac11ee62005-01-15 12:45:51 +00005568 }
shane831c3292008-11-10 17:14:58 +00005569#endif
drh43605152004-05-29 21:46:49 +00005570 releasePage(pChild);
5571 }
drh2e38c322004-09-03 18:38:44 +00005572end_shallow_balance:
drh17435752007-08-16 04:30:38 +00005573 sqlite3_free(apCell);
drh2e38c322004-09-03 18:38:44 +00005574 return rc;
drh43605152004-05-29 21:46:49 +00005575}
5576
5577
5578/*
5579** The root page is overfull
5580**
5581** When this happens, Create a new child page and copy the
5582** contents of the root into the child. Then make the root
5583** page an empty page with rightChild pointing to the new
5584** child. Finally, call balance_internal() on the new child
5585** to cause it to split.
5586*/
danielk197771d5d2c2008-09-29 11:49:47 +00005587static int balance_deeper(BtCursor *pCur){
drh43605152004-05-29 21:46:49 +00005588 int rc; /* Return value from subprocedures */
danielk197771d5d2c2008-09-29 11:49:47 +00005589 MemPage *pPage; /* Pointer to the root page */
drh43605152004-05-29 21:46:49 +00005590 MemPage *pChild; /* Pointer to a new child page */
5591 Pgno pgnoChild; /* Page number of the new child page */
danielk1977aef0bf62005-12-30 16:28:01 +00005592 BtShared *pBt; /* The BTree */
drh43605152004-05-29 21:46:49 +00005593 int usableSize; /* Total usable size of a page */
5594 u8 *data; /* Content of the parent page */
5595 u8 *cdata; /* Content of the child page */
5596 int hdr; /* Offset to page header in parent */
drh281b21d2008-08-22 12:57:08 +00005597 int cbrk; /* Offset to content of first cell in parent */
drh43605152004-05-29 21:46:49 +00005598
danielk197771d5d2c2008-09-29 11:49:47 +00005599 assert( pCur->iPage==0 );
5600 assert( pCur->apPage[0]->nOverflow>0 );
5601
drhf94a1732008-09-30 17:18:17 +00005602 VVA_ONLY( pCur->pagesShuffled = 1 );
danielk197771d5d2c2008-09-29 11:49:47 +00005603 pPage = pCur->apPage[0];
drh43605152004-05-29 21:46:49 +00005604 pBt = pPage->pBt;
drh1fee73e2007-08-29 04:00:57 +00005605 assert( sqlite3_mutex_held(pBt->mutex) );
drh4f0c5872007-03-26 22:05:01 +00005606 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
drh43605152004-05-29 21:46:49 +00005607 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00005608 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
drh43605152004-05-29 21:46:49 +00005609 usableSize = pBt->usableSize;
5610 data = pPage->aData;
5611 hdr = pPage->hdrOffset;
drh281b21d2008-08-22 12:57:08 +00005612 cbrk = get2byte(&data[hdr+5]);
drh43605152004-05-29 21:46:49 +00005613 cdata = pChild->aData;
5614 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
drh281b21d2008-08-22 12:57:08 +00005615 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
danielk197771d5d2c2008-09-29 11:49:47 +00005616
5617 rc = sqlite3BtreeInitPage(pChild);
5618 if( rc==SQLITE_OK ){
5619 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5620 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5621 pChild->nOverflow = pPage->nOverflow;
5622 if( pChild->nOverflow ){
5623 pChild->nFree = 0;
5624 }
5625 assert( pChild->nCell==pPage->nCell );
5626 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5627 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5628 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5629 if( ISAUTOVACUUM ){
danielk197771d5d2c2008-09-29 11:49:47 +00005630 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
shane831c3292008-11-10 17:14:58 +00005631#ifndef SQLITE_OMIT_AUTOVACUUM
danielk197771d5d2c2008-09-29 11:49:47 +00005632 if( rc==SQLITE_OK ){
danielk197700a696d2008-09-29 16:41:31 +00005633 rc = setChildPtrmaps(pChild);
danielk1977ac11ee62005-01-15 12:45:51 +00005634 }
shane831c3292008-11-10 17:14:58 +00005635#endif
danielk1977ac11ee62005-01-15 12:45:51 +00005636 }
danielk197787c52b52008-07-19 11:49:07 +00005637 }
danielk19776b456a22005-03-21 04:04:02 +00005638
danielk197771d5d2c2008-09-29 11:49:47 +00005639 if( rc==SQLITE_OK ){
5640 pCur->iPage++;
5641 pCur->apPage[1] = pChild;
danielk1977bf93c562008-09-29 15:53:25 +00005642 pCur->aiIdx[0] = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005643 rc = balance_nonroot(pCur);
5644 }else{
5645 releasePage(pChild);
5646 }
5647
drh43605152004-05-29 21:46:49 +00005648 return rc;
5649}
5650
5651/*
danielk197771d5d2c2008-09-29 11:49:47 +00005652** The page that pCur currently points to has just been modified in
5653** some way. This function figures out if this modification means the
5654** tree needs to be balanced, and if so calls the appropriate balancing
5655** routine.
5656**
5657** Parameter isInsert is true if a new cell was just inserted into the
5658** page, or false otherwise.
drh43605152004-05-29 21:46:49 +00005659*/
danielk197771d5d2c2008-09-29 11:49:47 +00005660static int balance(BtCursor *pCur, int isInsert){
drh43605152004-05-29 21:46:49 +00005661 int rc = SQLITE_OK;
danielk197771d5d2c2008-09-29 11:49:47 +00005662 MemPage *pPage = pCur->apPage[pCur->iPage];
5663
drh1fee73e2007-08-29 04:00:57 +00005664 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00005665 if( pCur->iPage==0 ){
danielk19776e465eb2007-08-21 13:11:00 +00005666 rc = sqlite3PagerWrite(pPage->pDbPage);
5667 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005668 rc = balance_deeper(pCur);
drh43605152004-05-29 21:46:49 +00005669 }
danielk1977687566d2004-11-02 12:56:41 +00005670 if( rc==SQLITE_OK && pPage->nCell==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005671 rc = balance_shallower(pCur);
drh43605152004-05-29 21:46:49 +00005672 }
5673 }else{
danielk1977ac245ec2005-01-14 13:50:11 +00005674 if( pPage->nOverflow>0 ||
danielk197771d5d2c2008-09-29 11:49:47 +00005675 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5676 rc = balance_nonroot(pCur);
drh43605152004-05-29 21:46:49 +00005677 }
5678 }
5679 return rc;
5680}
5681
5682/*
drh8dcd7ca2004-08-08 19:43:29 +00005683** This routine checks all cursors that point to table pgnoRoot.
drh980b1a72006-08-16 16:42:48 +00005684** If any of those cursors were opened with wrFlag==0 in a different
5685** database connection (a database connection that shares the pager
5686** cache with the current connection) and that other connection
5687** is not in the ReadUncommmitted state, then this routine returns
5688** SQLITE_LOCKED.
danielk1977299b1872004-11-22 10:02:10 +00005689**
danielk19773588ceb2008-06-10 17:30:26 +00005690** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5691** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5692** blob cursors are used for both reading and writing.
5693**
5694** When pgnoRoot is the root page of an intkey table, this function is also
5695** responsible for invalidating incremental blob cursors when the table row
5696** on which they are opened is deleted or modified. Cursors are invalidated
5697** according to the following rules:
5698**
5699** 1) When BtreeClearTable() is called to completely delete the contents
5700** of a B-Tree table, pExclude is set to zero and parameter iRow is
5701** set to non-zero. In this case all incremental blob cursors open
5702** on the table rooted at pgnoRoot are invalidated.
5703**
5704** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5705** modify a table row via an SQL statement, pExclude is set to the
5706** write cursor used to do the modification and parameter iRow is set
5707** to the integer row id of the B-Tree entry being modified. Unless
5708** pExclude is itself an incremental blob cursor, then all incremental
5709** blob cursors open on row iRow of the B-Tree are invalidated.
5710**
5711** 3) If both pExclude and iRow are set to zero, no incremental blob
5712** cursors are invalidated.
drhf74b8d92002-09-01 23:20:45 +00005713*/
danielk19773588ceb2008-06-10 17:30:26 +00005714static int checkReadLocks(
5715 Btree *pBtree,
5716 Pgno pgnoRoot,
5717 BtCursor *pExclude,
5718 i64 iRow
5719){
danielk1977299b1872004-11-22 10:02:10 +00005720 BtCursor *p;
drh980b1a72006-08-16 16:42:48 +00005721 BtShared *pBt = pBtree->pBt;
drhe5fe6902007-12-07 18:55:28 +00005722 sqlite3 *db = pBtree->db;
drh1fee73e2007-08-29 04:00:57 +00005723 assert( sqlite3BtreeHoldsMutex(pBtree) );
danielk1977299b1872004-11-22 10:02:10 +00005724 for(p=pBt->pCursor; p; p=p->pNext){
drh980b1a72006-08-16 16:42:48 +00005725 if( p==pExclude ) continue;
drh980b1a72006-08-16 16:42:48 +00005726 if( p->pgnoRoot!=pgnoRoot ) continue;
danielk19773588ceb2008-06-10 17:30:26 +00005727#ifndef SQLITE_OMIT_INCRBLOB
5728 if( p->isIncrblobHandle && (
5729 (!pExclude && iRow)
5730 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5731 )){
5732 p->eState = CURSOR_INVALID;
5733 }
5734#endif
5735 if( p->eState!=CURSOR_VALID ) continue;
5736 if( p->wrFlag==0
5737#ifndef SQLITE_OMIT_INCRBLOB
5738 || p->isIncrblobHandle
5739#endif
5740 ){
drhe5fe6902007-12-07 18:55:28 +00005741 sqlite3 *dbOther = p->pBtree->db;
drh980b1a72006-08-16 16:42:48 +00005742 if( dbOther==0 ||
5743 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5744 return SQLITE_LOCKED;
5745 }
danielk1977299b1872004-11-22 10:02:10 +00005746 }
5747 }
drhf74b8d92002-09-01 23:20:45 +00005748 return SQLITE_OK;
5749}
5750
5751/*
drh3b7511c2001-05-26 13:15:44 +00005752** Insert a new record into the BTree. The key is given by (pKey,nKey)
5753** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00005754** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00005755** is left pointing at a random location.
5756**
5757** For an INTKEY table, only the nKey value of the key is used. pKey is
5758** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00005759*/
drh3aac2dd2004-04-26 14:10:20 +00005760int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00005761 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00005762 const void *pKey, i64 nKey, /* The key of the new record */
drhe4d90812007-03-29 05:51:49 +00005763 const void *pData, int nData, /* The data of the new record */
drhb026e052007-05-02 01:34:31 +00005764 int nZero, /* Number of extra 0 bytes to append to data */
drhe4d90812007-03-29 05:51:49 +00005765 int appendBias /* True if this is likely an append */
drh3b7511c2001-05-26 13:15:44 +00005766){
drh3b7511c2001-05-26 13:15:44 +00005767 int rc;
5768 int loc;
drh14acc042001-06-10 19:56:58 +00005769 int szNew;
danielk197771d5d2c2008-09-29 11:49:47 +00005770 int idx;
drh3b7511c2001-05-26 13:15:44 +00005771 MemPage *pPage;
drhd677b3d2007-08-20 22:48:41 +00005772 Btree *p = pCur->pBtree;
5773 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00005774 unsigned char *oldCell;
drh2e38c322004-09-03 18:38:44 +00005775 unsigned char *newCell = 0;
drh3b7511c2001-05-26 13:15:44 +00005776
drh1fee73e2007-08-29 04:00:57 +00005777 assert( cursorHoldsMutex(pCur) );
danielk1977aef0bf62005-12-30 16:28:01 +00005778 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005779 /* Must start a transaction before doing an insert */
drhd677b3d2007-08-20 22:48:41 +00005780 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005781 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005782 }
drhf74b8d92002-09-01 23:20:45 +00005783 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00005784 if( !pCur->wrFlag ){
5785 return SQLITE_PERM; /* Cursor not open for writing */
5786 }
danielk19773588ceb2008-06-10 17:30:26 +00005787 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
drhf74b8d92002-09-01 23:20:45 +00005788 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5789 }
drhfb982642007-08-30 01:19:59 +00005790 if( pCur->eState==CURSOR_FAULT ){
5791 return pCur->skip;
5792 }
danielk1977da184232006-01-05 11:34:32 +00005793
5794 /* Save the positions of any other cursors open on this table */
danielk1977be51a652008-10-08 17:58:48 +00005795 sqlite3BtreeClearCursor(pCur);
danielk19772e94d4d2006-01-09 05:36:27 +00005796 if(
danielk19772e94d4d2006-01-09 05:36:27 +00005797 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
drhe63d9992008-08-13 19:11:48 +00005798 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
danielk19772e94d4d2006-01-09 05:36:27 +00005799 ){
danielk1977da184232006-01-05 11:34:32 +00005800 return rc;
5801 }
5802
danielk197771d5d2c2008-09-29 11:49:47 +00005803 pPage = pCur->apPage[pCur->iPage];
drh4a1c3802004-05-12 15:15:47 +00005804 assert( pPage->intKey || nKey>=0 );
drh44845222008-07-17 18:39:57 +00005805 assert( pPage->leaf || !pPage->intKey );
drh3a4c1412004-05-09 20:40:11 +00005806 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5807 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5808 loc==0 ? "overwrite" : "new entry"));
danielk197771d5d2c2008-09-29 11:49:47 +00005809 assert( pPage->isInit );
danielk197752ae7242008-03-25 14:24:56 +00005810 allocateTempSpace(pBt);
5811 newCell = pBt->pTmpSpace;
drh2e38c322004-09-03 18:38:44 +00005812 if( newCell==0 ) return SQLITE_NOMEM;
drhb026e052007-05-02 01:34:31 +00005813 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
drh2e38c322004-09-03 18:38:44 +00005814 if( rc ) goto end_insert;
drh43605152004-05-29 21:46:49 +00005815 assert( szNew==cellSizePtr(pPage, newCell) );
drh2e38c322004-09-03 18:38:44 +00005816 assert( szNew<=MX_CELL_SIZE(pBt) );
danielk197771d5d2c2008-09-29 11:49:47 +00005817 idx = pCur->aiIdx[pCur->iPage];
danielk1977da184232006-01-05 11:34:32 +00005818 if( loc==0 && CURSOR_VALID==pCur->eState ){
drha9121e42008-02-19 14:59:35 +00005819 u16 szOld;
danielk197771d5d2c2008-09-29 11:49:47 +00005820 assert( idx<pPage->nCell );
danielk19776e465eb2007-08-21 13:11:00 +00005821 rc = sqlite3PagerWrite(pPage->pDbPage);
5822 if( rc ){
5823 goto end_insert;
5824 }
danielk197771d5d2c2008-09-29 11:49:47 +00005825 oldCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00005826 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005827 memcpy(newCell, oldCell, 4);
drh4b70f112004-05-02 21:12:19 +00005828 }
drh43605152004-05-29 21:46:49 +00005829 szOld = cellSizePtr(pPage, oldCell);
drh4b70f112004-05-02 21:12:19 +00005830 rc = clearCell(pPage, oldCell);
drh2e38c322004-09-03 18:38:44 +00005831 if( rc ) goto end_insert;
shane0af3f892008-11-12 04:55:34 +00005832 rc = dropCell(pPage, idx, szOld);
5833 if( rc!=SQLITE_OK ) {
5834 goto end_insert;
5835 }
drh7c717f72001-06-24 20:39:41 +00005836 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00005837 assert( pPage->leaf );
danielk197771d5d2c2008-09-29 11:49:47 +00005838 idx = ++pCur->aiIdx[pCur->iPage];
drh271efa52004-05-30 19:19:05 +00005839 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00005840 pCur->validNKey = 0;
drh14acc042001-06-10 19:56:58 +00005841 }else{
drh4b70f112004-05-02 21:12:19 +00005842 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00005843 }
danielk197771d5d2c2008-09-29 11:49:47 +00005844 rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
danielk1977e80463b2004-11-03 03:01:16 +00005845 if( rc!=SQLITE_OK ) goto end_insert;
danielk197771d5d2c2008-09-29 11:49:47 +00005846 rc = balance(pCur, 1);
danielk1977299b1872004-11-22 10:02:10 +00005847 if( rc==SQLITE_OK ){
5848 moveToRoot(pCur);
5849 }
drh2e38c322004-09-03 18:38:44 +00005850end_insert:
drh5e2f8b92001-05-28 00:41:15 +00005851 return rc;
5852}
5853
5854/*
drh4b70f112004-05-02 21:12:19 +00005855** Delete the entry that the cursor is pointing to. The cursor
drhf94a1732008-09-30 17:18:17 +00005856** is left pointing at a arbitrary location.
drh3b7511c2001-05-26 13:15:44 +00005857*/
drh3aac2dd2004-04-26 14:10:20 +00005858int sqlite3BtreeDelete(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00005859 MemPage *pPage = pCur->apPage[pCur->iPage];
5860 int idx;
drh4b70f112004-05-02 21:12:19 +00005861 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00005862 int rc;
danielk1977cfe9a692004-06-16 12:00:29 +00005863 Pgno pgnoChild = 0;
drhd677b3d2007-08-20 22:48:41 +00005864 Btree *p = pCur->pBtree;
5865 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00005866
drh1fee73e2007-08-29 04:00:57 +00005867 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00005868 assert( pPage->isInit );
danielk1977aef0bf62005-12-30 16:28:01 +00005869 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005870 /* Must start a transaction before doing a delete */
drhd677b3d2007-08-20 22:48:41 +00005871 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005872 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005873 }
drhf74b8d92002-09-01 23:20:45 +00005874 assert( !pBt->readOnly );
drhfb982642007-08-30 01:19:59 +00005875 if( pCur->eState==CURSOR_FAULT ){
5876 return pCur->skip;
5877 }
danielk197771d5d2c2008-09-29 11:49:47 +00005878 if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
drhbd03cae2001-06-02 02:40:57 +00005879 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5880 }
drhecdc7532001-09-23 02:35:53 +00005881 if( !pCur->wrFlag ){
5882 return SQLITE_PERM; /* Did not open this cursor for writing */
5883 }
danielk19773588ceb2008-06-10 17:30:26 +00005884 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
drhf74b8d92002-09-01 23:20:45 +00005885 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5886 }
danielk1977da184232006-01-05 11:34:32 +00005887
5888 /* Restore the current cursor position (a no-op if the cursor is not in
5889 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
danielk19773b8a05f2007-03-19 17:44:26 +00005890 ** open on the same table. Then call sqlite3PagerWrite() on the page
danielk1977da184232006-01-05 11:34:32 +00005891 ** that the entry will be deleted from.
5892 */
5893 if(
drha3460582008-07-11 21:02:53 +00005894 (rc = restoreCursorPosition(pCur))!=0 ||
drhd1167392006-01-23 13:00:35 +00005895 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
danielk19773b8a05f2007-03-19 17:44:26 +00005896 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
danielk1977da184232006-01-05 11:34:32 +00005897 ){
5898 return rc;
5899 }
danielk1977e6efa742004-11-10 11:55:10 +00005900
drh85b623f2007-12-13 21:54:09 +00005901 /* Locate the cell within its page and leave pCell pointing to the
danielk1977e6efa742004-11-10 11:55:10 +00005902 ** data. The clearCell() call frees any overflow pages associated with the
5903 ** cell. The cell itself is still intact.
5904 */
danielk197771d5d2c2008-09-29 11:49:47 +00005905 idx = pCur->aiIdx[pCur->iPage];
5906 pCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00005907 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005908 pgnoChild = get4byte(pCell);
drh4b70f112004-05-02 21:12:19 +00005909 }
danielk197728129562005-01-11 10:25:06 +00005910 rc = clearCell(pPage, pCell);
drhd677b3d2007-08-20 22:48:41 +00005911 if( rc ){
drhd677b3d2007-08-20 22:48:41 +00005912 return rc;
5913 }
danielk1977e6efa742004-11-10 11:55:10 +00005914
drh4b70f112004-05-02 21:12:19 +00005915 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00005916 /*
drh5e00f6c2001-09-13 13:46:56 +00005917 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00005918 ** do something we will leave a hole on an internal page.
5919 ** We have to fill the hole by moving in a cell from a leaf. The
5920 ** next Cell after the one to be deleted is guaranteed to exist and
danielk1977299b1872004-11-22 10:02:10 +00005921 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00005922 */
drh14acc042001-06-10 19:56:58 +00005923 BtCursor leafCur;
danielk197771d5d2c2008-09-29 11:49:47 +00005924 MemPage *pLeafPage;
danielk197771d5d2c2008-09-29 11:49:47 +00005925
drh4b70f112004-05-02 21:12:19 +00005926 unsigned char *pNext;
danielk1977299b1872004-11-22 10:02:10 +00005927 int notUsed;
danielk19776b456a22005-03-21 04:04:02 +00005928 unsigned char *tempCell = 0;
drh44845222008-07-17 18:39:57 +00005929 assert( !pPage->intKey );
drh16a9b832007-05-05 18:39:25 +00005930 sqlite3BtreeGetTempCursor(pCur, &leafCur);
danielk1977299b1872004-11-22 10:02:10 +00005931 rc = sqlite3BtreeNext(&leafCur, &notUsed);
danielk19776b456a22005-03-21 04:04:02 +00005932 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00005933 assert( leafCur.aiIdx[leafCur.iPage]==0 );
danielk197771d5d2c2008-09-29 11:49:47 +00005934 pLeafPage = leafCur.apPage[leafCur.iPage];
danielk197771d5d2c2008-09-29 11:49:47 +00005935 rc = sqlite3PagerWrite(pLeafPage->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00005936 }
5937 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00005938 int leafCursorInvalid = 0;
drha9121e42008-02-19 14:59:35 +00005939 u16 szNext;
danielk19776b456a22005-03-21 04:04:02 +00005940 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
danielk197771d5d2c2008-09-29 11:49:47 +00005941 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
5942 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
danielk19772f78fc62008-09-30 09:31:45 +00005943 pNext = findCell(pLeafPage, 0);
danielk197771d5d2c2008-09-29 11:49:47 +00005944 szNext = cellSizePtr(pLeafPage, pNext);
danielk19776b456a22005-03-21 04:04:02 +00005945 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
danielk197752ae7242008-03-25 14:24:56 +00005946 allocateTempSpace(pBt);
5947 tempCell = pBt->pTmpSpace;
danielk19776b456a22005-03-21 04:04:02 +00005948 if( tempCell==0 ){
5949 rc = SQLITE_NOMEM;
5950 }
danielk19778ea1cfa2008-01-01 06:19:02 +00005951 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00005952 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00005953 }
danielk19772f78fc62008-09-30 09:31:45 +00005954
drhf94a1732008-09-30 17:18:17 +00005955
5956 /* The "if" statement in the next code block is critical. The
5957 ** slightest error in that statement would allow SQLite to operate
5958 ** correctly most of the time but produce very rare failures. To
5959 ** guard against this, the following macros help to verify that
5960 ** the "if" statement is well tested.
5961 */
5962 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
5963 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5964 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
5965 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5966 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
5967 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5968 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
5969 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5970 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
5971 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
5972
5973
danielk19772f78fc62008-09-30 09:31:45 +00005974 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
5975 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
5976 ){
drhf94a1732008-09-30 17:18:17 +00005977 /* This branch is taken if the internal node is now either overflowing
5978 ** or underfull and the leaf node will be underfull after the just cell
danielk19772f78fc62008-09-30 09:31:45 +00005979 ** copied to the internal node is deleted from it. This is a special
5980 ** case because the call to balance() to correct the internal node
5981 ** may change the tree structure and invalidate the contents of
5982 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
5983 ** used by the balance() required to correct the underfull leaf
5984 ** node.
5985 **
5986 ** The formula used in the expression above are based on facets of
5987 ** the SQLite file-format that do not change over time.
5988 */
drhf94a1732008-09-30 17:18:17 +00005989 testcase( pPage->nFree==pBt->usableSize*2/3+1 );
5990 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
danielk19772f78fc62008-09-30 09:31:45 +00005991 leafCursorInvalid = 1;
5992 }
5993
danielk19778ea1cfa2008-01-01 06:19:02 +00005994 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00005995 put4byte(findOverflowCell(pPage, idx), pgnoChild);
drhf94a1732008-09-30 17:18:17 +00005996 VVA_ONLY( pCur->pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00005997 rc = balance(pCur, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00005998 }
danielk19772f78fc62008-09-30 09:31:45 +00005999
6000 if( rc==SQLITE_OK && leafCursorInvalid ){
6001 /* The leaf-node is now underfull and so the tree needs to be
6002 ** rebalanced. However, the balance() operation on the internal
6003 ** node above may have modified the structure of the B-Tree and
6004 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
6005 ** may not be trusted.
6006 **
6007 ** It is not possible to copy the ancestry from pCur, as the same
6008 ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
6009 ** arrays.
drh7b682802008-09-30 14:06:28 +00006010 **
6011 ** The call to saveCursorPosition() below internally saves the
6012 ** key that leafCur is currently pointing to. Currently, there
6013 ** are two copies of that key in the tree - one here on the leaf
6014 ** page and one on some internal node in the tree. The copy on
6015 ** the leaf node is always the next key in tree-order after the
6016 ** copy on the internal node. So, the call to sqlite3BtreeNext()
6017 ** calls restoreCursorPosition() to point the cursor to the copy
6018 ** stored on the internal node, then advances to the next entry,
6019 ** which happens to be the copy of the key on the internal node.
danielk1977a69fda22008-09-30 16:48:10 +00006020 ** Net effect: leafCur is pointing back to the duplicate cell
6021 ** that needs to be removed, and the leafCur.apPage[] and
6022 ** leafCur.aiIdx[] arrays are correct.
danielk19772f78fc62008-09-30 09:31:45 +00006023 */
drhf94a1732008-09-30 17:18:17 +00006024 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
danielk19772f78fc62008-09-30 09:31:45 +00006025 rc = saveCursorPosition(&leafCur);
6026 if( rc==SQLITE_OK ){
6027 rc = sqlite3BtreeNext(&leafCur, &notUsed);
6028 }
6029 pLeafPage = leafCur.apPage[leafCur.iPage];
6030 assert( pLeafPage->pgno==leafPgno );
6031 assert( leafCur.aiIdx[leafCur.iPage]==0 );
6032 }
6033
danielk19778ea1cfa2008-01-01 06:19:02 +00006034 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00006035 dropCell(pLeafPage, 0, szNext);
drhf94a1732008-09-30 17:18:17 +00006036 VVA_ONLY( leafCur.pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00006037 rc = balance(&leafCur, 0);
drhf94a1732008-09-30 17:18:17 +00006038 assert( leafCursorInvalid || !leafCur.pagesShuffled
6039 || !pCur->pagesShuffled );
danielk19778ea1cfa2008-01-01 06:19:02 +00006040 }
danielk19776b456a22005-03-21 04:04:02 +00006041 }
drh16a9b832007-05-05 18:39:25 +00006042 sqlite3BtreeReleaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00006043 }else{
danielk1977299b1872004-11-22 10:02:10 +00006044 TRACE(("DELETE: table=%d delete from leaf %d\n",
6045 pCur->pgnoRoot, pPage->pgno));
danielk197771d5d2c2008-09-29 11:49:47 +00006046 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6047 rc = balance(pCur, 0);
drh5e2f8b92001-05-28 00:41:15 +00006048 }
danielk19776b456a22005-03-21 04:04:02 +00006049 if( rc==SQLITE_OK ){
6050 moveToRoot(pCur);
6051 }
drh5e2f8b92001-05-28 00:41:15 +00006052 return rc;
drh3b7511c2001-05-26 13:15:44 +00006053}
drh8b2f49b2001-06-08 00:21:52 +00006054
6055/*
drhc6b52df2002-01-04 03:09:29 +00006056** Create a new BTree table. Write into *piTable the page
6057** number for the root page of the new table.
6058**
drhab01f612004-05-22 02:55:23 +00006059** The type of type is determined by the flags parameter. Only the
6060** following values of flags are currently in use. Other values for
6061** flags might not work:
6062**
6063** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6064** BTREE_ZERODATA Used for SQL indices
drh8b2f49b2001-06-08 00:21:52 +00006065*/
drhd677b3d2007-08-20 22:48:41 +00006066static int btreeCreateTable(Btree *p, int *piTable, int flags){
danielk1977aef0bf62005-12-30 16:28:01 +00006067 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006068 MemPage *pRoot;
6069 Pgno pgnoRoot;
6070 int rc;
drhd677b3d2007-08-20 22:48:41 +00006071
drh1fee73e2007-08-29 04:00:57 +00006072 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006073 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006074 /* Must start a transaction first */
drhd677b3d2007-08-20 22:48:41 +00006075 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6076 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006077 }
danielk197728129562005-01-11 10:25:06 +00006078 assert( !pBt->readOnly );
danielk1977e6efa742004-11-10 11:55:10 +00006079
danielk1977003ba062004-11-04 02:57:33 +00006080#ifdef SQLITE_OMIT_AUTOVACUUM
drh4f0c5872007-03-26 22:05:01 +00006081 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
drhd677b3d2007-08-20 22:48:41 +00006082 if( rc ){
6083 return rc;
6084 }
danielk1977003ba062004-11-04 02:57:33 +00006085#else
danielk1977687566d2004-11-02 12:56:41 +00006086 if( pBt->autoVacuum ){
danielk1977003ba062004-11-04 02:57:33 +00006087 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6088 MemPage *pPageMove; /* The page to move to. */
6089
danielk197720713f32007-05-03 11:43:33 +00006090 /* Creating a new table may probably require moving an existing database
6091 ** to make room for the new tables root page. In case this page turns
6092 ** out to be an overflow page, delete all overflow page-map caches
6093 ** held by open cursors.
6094 */
danielk197792d4d7a2007-05-04 12:05:56 +00006095 invalidateAllOverflowCache(pBt);
danielk197720713f32007-05-03 11:43:33 +00006096
danielk1977003ba062004-11-04 02:57:33 +00006097 /* Read the value of meta[3] from the database to determine where the
6098 ** root page of the new table should go. meta[3] is the largest root-page
6099 ** created so far, so the new root-page is (meta[3]+1).
6100 */
danielk1977aef0bf62005-12-30 16:28:01 +00006101 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
drhd677b3d2007-08-20 22:48:41 +00006102 if( rc!=SQLITE_OK ){
6103 return rc;
6104 }
danielk1977003ba062004-11-04 02:57:33 +00006105 pgnoRoot++;
6106
danielk1977599fcba2004-11-08 07:13:13 +00006107 /* The new root-page may not be allocated on a pointer-map page, or the
6108 ** PENDING_BYTE page.
6109 */
drh72190432008-01-31 14:54:43 +00006110 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
danielk1977599fcba2004-11-08 07:13:13 +00006111 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
danielk1977003ba062004-11-04 02:57:33 +00006112 pgnoRoot++;
6113 }
6114 assert( pgnoRoot>=3 );
6115
6116 /* Allocate a page. The page that currently resides at pgnoRoot will
6117 ** be moved to the allocated page (unless the allocated page happens
6118 ** to reside at pgnoRoot).
6119 */
drh4f0c5872007-03-26 22:05:01 +00006120 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
danielk1977003ba062004-11-04 02:57:33 +00006121 if( rc!=SQLITE_OK ){
danielk1977687566d2004-11-02 12:56:41 +00006122 return rc;
6123 }
danielk1977003ba062004-11-04 02:57:33 +00006124
6125 if( pgnoMove!=pgnoRoot ){
danielk1977f35843b2007-04-07 15:03:17 +00006126 /* pgnoRoot is the page that will be used for the root-page of
6127 ** the new table (assuming an error did not occur). But we were
6128 ** allocated pgnoMove. If required (i.e. if it was not allocated
6129 ** by extending the file), the current page at position pgnoMove
6130 ** is already journaled.
6131 */
danielk1977003ba062004-11-04 02:57:33 +00006132 u8 eType;
6133 Pgno iPtrPage;
6134
6135 releasePage(pPageMove);
danielk1977f35843b2007-04-07 15:03:17 +00006136
6137 /* Move the page currently at pgnoRoot to pgnoMove. */
drh16a9b832007-05-05 18:39:25 +00006138 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006139 if( rc!=SQLITE_OK ){
6140 return rc;
6141 }
6142 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
drhccae6022005-02-26 17:31:26 +00006143 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00006144 releasePage(pRoot);
6145 return rc;
6146 }
drhccae6022005-02-26 17:31:26 +00006147 assert( eType!=PTRMAP_ROOTPAGE );
6148 assert( eType!=PTRMAP_FREEPAGE );
danielk19773b8a05f2007-03-19 17:44:26 +00006149 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk19775fd057a2005-03-09 13:09:43 +00006150 if( rc!=SQLITE_OK ){
6151 releasePage(pRoot);
6152 return rc;
6153 }
danielk19774c999992008-07-16 18:17:55 +00006154 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
danielk1977003ba062004-11-04 02:57:33 +00006155 releasePage(pRoot);
danielk1977f35843b2007-04-07 15:03:17 +00006156
6157 /* Obtain the page at pgnoRoot */
danielk1977003ba062004-11-04 02:57:33 +00006158 if( rc!=SQLITE_OK ){
6159 return rc;
6160 }
drh16a9b832007-05-05 18:39:25 +00006161 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006162 if( rc!=SQLITE_OK ){
6163 return rc;
6164 }
danielk19773b8a05f2007-03-19 17:44:26 +00006165 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk1977003ba062004-11-04 02:57:33 +00006166 if( rc!=SQLITE_OK ){
6167 releasePage(pRoot);
6168 return rc;
6169 }
6170 }else{
6171 pRoot = pPageMove;
6172 }
6173
danielk197742741be2005-01-08 12:42:39 +00006174 /* Update the pointer-map and meta-data with the new root-page number. */
danielk1977003ba062004-11-04 02:57:33 +00006175 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6176 if( rc ){
6177 releasePage(pRoot);
6178 return rc;
6179 }
danielk1977aef0bf62005-12-30 16:28:01 +00006180 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00006181 if( rc ){
6182 releasePage(pRoot);
6183 return rc;
6184 }
danielk197742741be2005-01-08 12:42:39 +00006185
danielk1977003ba062004-11-04 02:57:33 +00006186 }else{
drh4f0c5872007-03-26 22:05:01 +00006187 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
danielk1977003ba062004-11-04 02:57:33 +00006188 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00006189 }
6190#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006191 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
drhde647132004-05-07 17:57:49 +00006192 zeroPage(pRoot, flags | PTF_LEAF);
danielk19773b8a05f2007-03-19 17:44:26 +00006193 sqlite3PagerUnref(pRoot->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00006194 *piTable = (int)pgnoRoot;
6195 return SQLITE_OK;
6196}
drhd677b3d2007-08-20 22:48:41 +00006197int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6198 int rc;
6199 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006200 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006201 rc = btreeCreateTable(p, piTable, flags);
6202 sqlite3BtreeLeave(p);
6203 return rc;
6204}
drh8b2f49b2001-06-08 00:21:52 +00006205
6206/*
6207** Erase the given database page and all its children. Return
6208** the page to the freelist.
6209*/
drh4b70f112004-05-02 21:12:19 +00006210static int clearDatabasePage(
danielk1977aef0bf62005-12-30 16:28:01 +00006211 BtShared *pBt, /* The BTree that contains the table */
drh4b70f112004-05-02 21:12:19 +00006212 Pgno pgno, /* Page number to clear */
6213 MemPage *pParent, /* Parent page. NULL for the root */
danielk1977c7af4842008-10-27 13:59:33 +00006214 int freePageFlag, /* Deallocate page if true */
6215 int *pnChange
drh4b70f112004-05-02 21:12:19 +00006216){
danielk19776b456a22005-03-21 04:04:02 +00006217 MemPage *pPage = 0;
drh8b2f49b2001-06-08 00:21:52 +00006218 int rc;
drh4b70f112004-05-02 21:12:19 +00006219 unsigned char *pCell;
6220 int i;
drh8b2f49b2001-06-08 00:21:52 +00006221
drh1fee73e2007-08-29 04:00:57 +00006222 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977ad0132d2008-06-07 08:58:22 +00006223 if( pgno>pagerPagecount(pBt->pPager) ){
drh49285702005-09-17 15:20:26 +00006224 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00006225 }
6226
danielk197771d5d2c2008-09-29 11:49:47 +00006227 rc = getAndInitPage(pBt, pgno, &pPage);
danielk19776b456a22005-03-21 04:04:02 +00006228 if( rc ) goto cleardatabasepage_out;
drh4b70f112004-05-02 21:12:19 +00006229 for(i=0; i<pPage->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00006230 pCell = findCell(pPage, i);
drh4b70f112004-05-02 21:12:19 +00006231 if( !pPage->leaf ){
danielk1977c7af4842008-10-27 13:59:33 +00006232 rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006233 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006234 }
drh4b70f112004-05-02 21:12:19 +00006235 rc = clearCell(pPage, pCell);
danielk19776b456a22005-03-21 04:04:02 +00006236 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006237 }
drha34b6762004-05-07 13:30:42 +00006238 if( !pPage->leaf ){
danielk1977c7af4842008-10-27 13:59:33 +00006239 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006240 if( rc ) goto cleardatabasepage_out;
danielk1977c7af4842008-10-27 13:59:33 +00006241 }else if( pnChange ){
6242 assert( pPage->intKey );
6243 *pnChange += pPage->nCell;
drh2aa679f2001-06-25 02:11:07 +00006244 }
6245 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00006246 rc = freePage(pPage);
danielk19773b8a05f2007-03-19 17:44:26 +00006247 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
drh3a4c1412004-05-09 20:40:11 +00006248 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00006249 }
danielk19776b456a22005-03-21 04:04:02 +00006250
6251cleardatabasepage_out:
drh4b70f112004-05-02 21:12:19 +00006252 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00006253 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006254}
6255
6256/*
drhab01f612004-05-22 02:55:23 +00006257** Delete all information from a single table in the database. iTable is
6258** the page number of the root of the table. After this routine returns,
6259** the root page is empty, but still exists.
6260**
6261** This routine will fail with SQLITE_LOCKED if there are any open
6262** read cursors on the table. Open write cursors are moved to the
6263** root of the table.
danielk1977c7af4842008-10-27 13:59:33 +00006264**
6265** If pnChange is not NULL, then table iTable must be an intkey table. The
6266** integer value pointed to by pnChange is incremented by the number of
6267** entries in the table.
drh8b2f49b2001-06-08 00:21:52 +00006268*/
danielk1977c7af4842008-10-27 13:59:33 +00006269int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
drh8b2f49b2001-06-08 00:21:52 +00006270 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00006271 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00006272 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006273 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006274 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006275 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
danielk19773588ceb2008-06-10 17:30:26 +00006276 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006277 /* nothing to do */
6278 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6279 /* nothing to do */
6280 }else{
danielk1977c7af4842008-10-27 13:59:33 +00006281 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0, pnChange);
drh8b2f49b2001-06-08 00:21:52 +00006282 }
drhd677b3d2007-08-20 22:48:41 +00006283 sqlite3BtreeLeave(p);
6284 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006285}
6286
6287/*
6288** Erase all information in a table and add the root of the table to
6289** the freelist. Except, the root of the principle table (the one on
drhab01f612004-05-22 02:55:23 +00006290** page 1) is never added to the freelist.
6291**
6292** This routine will fail with SQLITE_LOCKED if there are any open
6293** cursors on the table.
drh205f48e2004-11-05 00:43:11 +00006294**
6295** If AUTOVACUUM is enabled and the page at iTable is not the last
6296** root page in the database file, then the last root page
6297** in the database file is moved into the slot formerly occupied by
6298** iTable and that last slot formerly occupied by the last root page
6299** is added to the freelist instead of iTable. In this say, all
6300** root pages are kept at the beginning of the database file, which
6301** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6302** page number that used to be the last root page in the file before
6303** the move. If no page gets moved, *piMoved is set to 0.
6304** The last root page is recorded in meta[3] and the value of
6305** meta[3] is updated by this procedure.
drh8b2f49b2001-06-08 00:21:52 +00006306*/
drhd677b3d2007-08-20 22:48:41 +00006307static int btreeDropTable(Btree *p, int iTable, int *piMoved){
drh8b2f49b2001-06-08 00:21:52 +00006308 int rc;
danielk1977a0bf2652004-11-04 14:30:04 +00006309 MemPage *pPage = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00006310 BtShared *pBt = p->pBt;
danielk1977a0bf2652004-11-04 14:30:04 +00006311
drh1fee73e2007-08-29 04:00:57 +00006312 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006313 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006314 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00006315 }
danielk1977a0bf2652004-11-04 14:30:04 +00006316
danielk1977e6efa742004-11-10 11:55:10 +00006317 /* It is illegal to drop a table if any cursors are open on the
6318 ** database. This is because in auto-vacuum mode the backend may
6319 ** need to move another root-page to fill a gap left by the deleted
6320 ** root page. If an open cursor was using this page a problem would
6321 ** occur.
6322 */
6323 if( pBt->pCursor ){
6324 return SQLITE_LOCKED;
drh5df72a52002-06-06 23:16:05 +00006325 }
danielk1977a0bf2652004-11-04 14:30:04 +00006326
drh16a9b832007-05-05 18:39:25 +00006327 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
drh2aa679f2001-06-25 02:11:07 +00006328 if( rc ) return rc;
danielk1977c7af4842008-10-27 13:59:33 +00006329 rc = sqlite3BtreeClearTable(p, iTable, 0);
danielk19776b456a22005-03-21 04:04:02 +00006330 if( rc ){
6331 releasePage(pPage);
6332 return rc;
6333 }
danielk1977a0bf2652004-11-04 14:30:04 +00006334
drh205f48e2004-11-05 00:43:11 +00006335 *piMoved = 0;
danielk1977a0bf2652004-11-04 14:30:04 +00006336
drh4b70f112004-05-02 21:12:19 +00006337 if( iTable>1 ){
danielk1977a0bf2652004-11-04 14:30:04 +00006338#ifdef SQLITE_OMIT_AUTOVACUUM
drha34b6762004-05-07 13:30:42 +00006339 rc = freePage(pPage);
danielk1977a0bf2652004-11-04 14:30:04 +00006340 releasePage(pPage);
6341#else
6342 if( pBt->autoVacuum ){
6343 Pgno maxRootPgno;
danielk1977aef0bf62005-12-30 16:28:01 +00006344 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006345 if( rc!=SQLITE_OK ){
6346 releasePage(pPage);
6347 return rc;
6348 }
6349
6350 if( iTable==maxRootPgno ){
6351 /* If the table being dropped is the table with the largest root-page
6352 ** number in the database, put the root page on the free list.
6353 */
6354 rc = freePage(pPage);
6355 releasePage(pPage);
6356 if( rc!=SQLITE_OK ){
6357 return rc;
6358 }
6359 }else{
6360 /* The table being dropped does not have the largest root-page
6361 ** number in the database. So move the page that does into the
6362 ** gap left by the deleted root-page.
6363 */
6364 MemPage *pMove;
6365 releasePage(pPage);
drh16a9b832007-05-05 18:39:25 +00006366 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006367 if( rc!=SQLITE_OK ){
6368 return rc;
6369 }
danielk19774c999992008-07-16 18:17:55 +00006370 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006371 releasePage(pMove);
6372 if( rc!=SQLITE_OK ){
6373 return rc;
6374 }
drh16a9b832007-05-05 18:39:25 +00006375 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006376 if( rc!=SQLITE_OK ){
6377 return rc;
6378 }
6379 rc = freePage(pMove);
6380 releasePage(pMove);
6381 if( rc!=SQLITE_OK ){
6382 return rc;
6383 }
6384 *piMoved = maxRootPgno;
6385 }
6386
danielk1977599fcba2004-11-08 07:13:13 +00006387 /* Set the new 'max-root-page' value in the database header. This
6388 ** is the old value less one, less one more if that happens to
6389 ** be a root-page number, less one again if that is the
6390 ** PENDING_BYTE_PAGE.
6391 */
danielk197787a6e732004-11-05 12:58:25 +00006392 maxRootPgno--;
danielk1977599fcba2004-11-08 07:13:13 +00006393 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6394 maxRootPgno--;
6395 }
danielk1977266664d2006-02-10 08:24:21 +00006396 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
danielk197787a6e732004-11-05 12:58:25 +00006397 maxRootPgno--;
6398 }
danielk1977599fcba2004-11-08 07:13:13 +00006399 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6400
danielk1977aef0bf62005-12-30 16:28:01 +00006401 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006402 }else{
6403 rc = freePage(pPage);
6404 releasePage(pPage);
6405 }
6406#endif
drh2aa679f2001-06-25 02:11:07 +00006407 }else{
danielk1977a0bf2652004-11-04 14:30:04 +00006408 /* If sqlite3BtreeDropTable was called on page 1. */
drha34b6762004-05-07 13:30:42 +00006409 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
danielk1977a0bf2652004-11-04 14:30:04 +00006410 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00006411 }
drh8b2f49b2001-06-08 00:21:52 +00006412 return rc;
6413}
drhd677b3d2007-08-20 22:48:41 +00006414int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6415 int rc;
6416 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006417 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006418 rc = btreeDropTable(p, iTable, piMoved);
6419 sqlite3BtreeLeave(p);
6420 return rc;
6421}
drh8b2f49b2001-06-08 00:21:52 +00006422
drh001bbcb2003-03-19 03:14:00 +00006423
drh8b2f49b2001-06-08 00:21:52 +00006424/*
drh23e11ca2004-05-04 17:27:28 +00006425** Read the meta-information out of a database file. Meta[0]
6426** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00006427** through meta[15] are available for use by higher layers. Meta[0]
6428** is read-only, the others are read/write.
6429**
6430** The schema layer numbers meta values differently. At the schema
6431** layer (and the SetCookie and ReadCookie opcodes) the number of
6432** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00006433*/
danielk1977aef0bf62005-12-30 16:28:01 +00006434int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
danielk19773b8a05f2007-03-19 17:44:26 +00006435 DbPage *pDbPage;
drh8b2f49b2001-06-08 00:21:52 +00006436 int rc;
drh4b70f112004-05-02 21:12:19 +00006437 unsigned char *pP1;
danielk1977aef0bf62005-12-30 16:28:01 +00006438 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006439
drhd677b3d2007-08-20 22:48:41 +00006440 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006441 pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006442
danielk1977da184232006-01-05 11:34:32 +00006443 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6444 ** the sqlite_master table. We grab this lock regardless of whether or
6445 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6446 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6447 */
6448 rc = queryTableLock(p, 1, READ_LOCK);
6449 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006450 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006451 return rc;
6452 }
6453
drh23e11ca2004-05-04 17:27:28 +00006454 assert( idx>=0 && idx<=15 );
danielk1977d9f6c532008-09-19 16:39:38 +00006455 if( pBt->pPage1 ){
6456 /* The b-tree is already holding a reference to page 1 of the database
6457 ** file. In this case the required meta-data value can be read directly
6458 ** from the page data of this reference. This is slightly faster than
6459 ** requesting a new reference from the pager layer.
6460 */
6461 pP1 = (unsigned char *)pBt->pPage1->aData;
6462 }else{
6463 /* The b-tree does not have a reference to page 1 of the database file.
6464 ** Obtain one from the pager layer.
6465 */
danielk1977ea897302008-09-19 15:10:58 +00006466 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6467 if( rc ){
6468 sqlite3BtreeLeave(p);
6469 return rc;
6470 }
6471 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
drhd677b3d2007-08-20 22:48:41 +00006472 }
drh23e11ca2004-05-04 17:27:28 +00006473 *pMeta = get4byte(&pP1[36 + idx*4]);
danielk1977ea897302008-09-19 15:10:58 +00006474
danielk1977d9f6c532008-09-19 16:39:38 +00006475 /* If the b-tree is not holding a reference to page 1, then one was
6476 ** requested from the pager layer in the above block. Release it now.
6477 */
danielk1977ea897302008-09-19 15:10:58 +00006478 if( !pBt->pPage1 ){
6479 sqlite3PagerUnref(pDbPage);
6480 }
drhae157872004-08-14 19:20:09 +00006481
danielk1977599fcba2004-11-08 07:13:13 +00006482 /* If autovacuumed is disabled in this build but we are trying to
6483 ** access an autovacuumed database, then make the database readonly.
6484 */
danielk1977003ba062004-11-04 02:57:33 +00006485#ifdef SQLITE_OMIT_AUTOVACUUM
drhae157872004-08-14 19:20:09 +00006486 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
danielk1977003ba062004-11-04 02:57:33 +00006487#endif
drhae157872004-08-14 19:20:09 +00006488
danielk1977da184232006-01-05 11:34:32 +00006489 /* Grab the read-lock on page 1. */
6490 rc = lockTable(p, 1, READ_LOCK);
drhd677b3d2007-08-20 22:48:41 +00006491 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006492 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006493}
6494
6495/*
drh23e11ca2004-05-04 17:27:28 +00006496** Write meta-information back into the database. Meta[0] is
6497** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00006498*/
danielk1977aef0bf62005-12-30 16:28:01 +00006499int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6500 BtShared *pBt = p->pBt;
drh4b70f112004-05-02 21:12:19 +00006501 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00006502 int rc;
drh23e11ca2004-05-04 17:27:28 +00006503 assert( idx>=1 && idx<=15 );
drhd677b3d2007-08-20 22:48:41 +00006504 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006505 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006506 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006507 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6508 }else{
6509 assert( pBt->pPage1!=0 );
6510 pP1 = pBt->pPage1->aData;
6511 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6512 if( rc==SQLITE_OK ){
6513 put4byte(&pP1[36 + idx*4], iMeta);
danielk19774152e672007-09-12 17:01:45 +00006514#ifndef SQLITE_OMIT_AUTOVACUUM
drhd677b3d2007-08-20 22:48:41 +00006515 if( idx==7 ){
6516 assert( pBt->autoVacuum || iMeta==0 );
6517 assert( iMeta==0 || iMeta==1 );
6518 pBt->incrVacuum = iMeta;
6519 }
danielk19774152e672007-09-12 17:01:45 +00006520#endif
drhd677b3d2007-08-20 22:48:41 +00006521 }
drh5df72a52002-06-06 23:16:05 +00006522 }
drhd677b3d2007-08-20 22:48:41 +00006523 sqlite3BtreeLeave(p);
6524 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006525}
drh8c42ca92001-06-22 19:15:00 +00006526
drhf328bc82004-05-10 23:29:49 +00006527/*
6528** Return the flag byte at the beginning of the page that the cursor
6529** is currently pointing to.
6530*/
6531int sqlite3BtreeFlags(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00006532 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
drha3460582008-07-11 21:02:53 +00006533 ** restoreCursorPosition() here.
danielk1977da184232006-01-05 11:34:32 +00006534 */
danielk1977e448dc42008-01-02 11:50:51 +00006535 MemPage *pPage;
drha3460582008-07-11 21:02:53 +00006536 restoreCursorPosition(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00006537 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00006538 assert( cursorHoldsMutex(pCur) );
drhd0679ed2007-08-28 22:24:34 +00006539 assert( pPage->pBt==pCur->pBt );
drhf328bc82004-05-10 23:29:49 +00006540 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6541}
6542
drhdd793422001-06-28 01:54:48 +00006543
drhdd793422001-06-28 01:54:48 +00006544/*
drh5eddca62001-06-30 21:53:53 +00006545** Return the pager associated with a BTree. This routine is used for
6546** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00006547*/
danielk1977aef0bf62005-12-30 16:28:01 +00006548Pager *sqlite3BtreePager(Btree *p){
6549 return p->pBt->pPager;
drhdd793422001-06-28 01:54:48 +00006550}
drh5eddca62001-06-30 21:53:53 +00006551
drhb7f91642004-10-31 02:22:47 +00006552#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006553/*
6554** Append a message to the error message string.
6555*/
drh2e38c322004-09-03 18:38:44 +00006556static void checkAppendMsg(
6557 IntegrityCk *pCheck,
6558 char *zMsg1,
6559 const char *zFormat,
6560 ...
6561){
6562 va_list ap;
drh1dcdbc02007-01-27 02:24:54 +00006563 if( !pCheck->mxErr ) return;
6564 pCheck->mxErr--;
6565 pCheck->nErr++;
drh2e38c322004-09-03 18:38:44 +00006566 va_start(ap, zFormat);
drhf089aa42008-07-08 19:34:06 +00006567 if( pCheck->errMsg.nChar ){
6568 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
drh5eddca62001-06-30 21:53:53 +00006569 }
drhf089aa42008-07-08 19:34:06 +00006570 if( zMsg1 ){
6571 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6572 }
6573 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6574 va_end(ap);
drhc890fec2008-08-01 20:10:08 +00006575 if( pCheck->errMsg.mallocFailed ){
6576 pCheck->mallocFailed = 1;
6577 }
drh5eddca62001-06-30 21:53:53 +00006578}
drhb7f91642004-10-31 02:22:47 +00006579#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006580
drhb7f91642004-10-31 02:22:47 +00006581#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006582/*
6583** Add 1 to the reference count for page iPage. If this is the second
6584** reference to the page, add an error message to pCheck->zErrMsg.
6585** Return 1 if there are 2 ore more references to the page and 0 if
6586** if this is the first reference to the page.
6587**
6588** Also check that the page number is in bounds.
6589*/
drhaaab5722002-02-19 13:39:21 +00006590static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00006591 if( iPage==0 ) return 1;
drh0de8c112002-07-06 16:32:14 +00006592 if( iPage>pCheck->nPage || iPage<0 ){
drh2e38c322004-09-03 18:38:44 +00006593 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006594 return 1;
6595 }
6596 if( pCheck->anRef[iPage]==1 ){
drh2e38c322004-09-03 18:38:44 +00006597 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006598 return 1;
6599 }
6600 return (pCheck->anRef[iPage]++)>1;
6601}
6602
danielk1977afcdd022004-10-31 16:25:42 +00006603#ifndef SQLITE_OMIT_AUTOVACUUM
6604/*
6605** Check that the entry in the pointer-map for page iChild maps to
6606** page iParent, pointer type ptrType. If not, append an error message
6607** to pCheck.
6608*/
6609static void checkPtrmap(
6610 IntegrityCk *pCheck, /* Integrity check context */
6611 Pgno iChild, /* Child page number */
6612 u8 eType, /* Expected pointer map type */
6613 Pgno iParent, /* Expected pointer map parent page number */
6614 char *zContext /* Context description (used for error msg) */
6615){
6616 int rc;
6617 u8 ePtrmapType;
6618 Pgno iPtrmapParent;
6619
6620 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6621 if( rc!=SQLITE_OK ){
6622 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6623 return;
6624 }
6625
6626 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6627 checkAppendMsg(pCheck, zContext,
6628 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6629 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6630 }
6631}
6632#endif
6633
drh5eddca62001-06-30 21:53:53 +00006634/*
6635** Check the integrity of the freelist or of an overflow page list.
6636** Verify that the number of pages on the list is N.
6637*/
drh30e58752002-03-02 20:41:57 +00006638static void checkList(
6639 IntegrityCk *pCheck, /* Integrity checking context */
6640 int isFreeList, /* True for a freelist. False for overflow page list */
6641 int iPage, /* Page number for first page in the list */
6642 int N, /* Expected number of pages in the list */
6643 char *zContext /* Context for error messages */
6644){
6645 int i;
drh3a4c1412004-05-09 20:40:11 +00006646 int expected = N;
6647 int iFirst = iPage;
drh1dcdbc02007-01-27 02:24:54 +00006648 while( N-- > 0 && pCheck->mxErr ){
danielk19773b8a05f2007-03-19 17:44:26 +00006649 DbPage *pOvflPage;
6650 unsigned char *pOvflData;
drh5eddca62001-06-30 21:53:53 +00006651 if( iPage<1 ){
drh2e38c322004-09-03 18:38:44 +00006652 checkAppendMsg(pCheck, zContext,
6653 "%d of %d pages missing from overflow list starting at %d",
drh3a4c1412004-05-09 20:40:11 +00006654 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00006655 break;
6656 }
6657 if( checkRef(pCheck, iPage, zContext) ) break;
danielk19773b8a05f2007-03-19 17:44:26 +00006658 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
drh2e38c322004-09-03 18:38:44 +00006659 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006660 break;
6661 }
danielk19773b8a05f2007-03-19 17:44:26 +00006662 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
drh30e58752002-03-02 20:41:57 +00006663 if( isFreeList ){
danielk19773b8a05f2007-03-19 17:44:26 +00006664 int n = get4byte(&pOvflData[4]);
danielk1977687566d2004-11-02 12:56:41 +00006665#ifndef SQLITE_OMIT_AUTOVACUUM
6666 if( pCheck->pBt->autoVacuum ){
6667 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6668 }
6669#endif
drh45b1fac2008-07-04 17:52:42 +00006670 if( n>pCheck->pBt->usableSize/4-2 ){
drh2e38c322004-09-03 18:38:44 +00006671 checkAppendMsg(pCheck, zContext,
6672 "freelist leaf count too big on page %d", iPage);
drhee696e22004-08-30 16:52:17 +00006673 N--;
6674 }else{
6675 for(i=0; i<n; i++){
danielk19773b8a05f2007-03-19 17:44:26 +00006676 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
danielk1977687566d2004-11-02 12:56:41 +00006677#ifndef SQLITE_OMIT_AUTOVACUUM
6678 if( pCheck->pBt->autoVacuum ){
6679 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6680 }
6681#endif
6682 checkRef(pCheck, iFreePage, zContext);
drhee696e22004-08-30 16:52:17 +00006683 }
6684 N -= n;
drh30e58752002-03-02 20:41:57 +00006685 }
drh30e58752002-03-02 20:41:57 +00006686 }
danielk1977afcdd022004-10-31 16:25:42 +00006687#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006688 else{
6689 /* If this database supports auto-vacuum and iPage is not the last
6690 ** page in this overflow list, check that the pointer-map entry for
6691 ** the following page matches iPage.
6692 */
6693 if( pCheck->pBt->autoVacuum && N>0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00006694 i = get4byte(pOvflData);
danielk1977687566d2004-11-02 12:56:41 +00006695 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6696 }
danielk1977afcdd022004-10-31 16:25:42 +00006697 }
6698#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006699 iPage = get4byte(pOvflData);
6700 sqlite3PagerUnref(pOvflPage);
drh5eddca62001-06-30 21:53:53 +00006701 }
6702}
drhb7f91642004-10-31 02:22:47 +00006703#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006704
drhb7f91642004-10-31 02:22:47 +00006705#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006706/*
6707** Do various sanity checks on a single page of a tree. Return
6708** the tree depth. Root pages return 0. Parents of root pages
6709** return 1, and so forth.
6710**
6711** These checks are done:
6712**
6713** 1. Make sure that cells and freeblocks do not overlap
6714** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00006715** NO 2. Make sure cell keys are in order.
6716** NO 3. Make sure no key is less than or equal to zLowerBound.
6717** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00006718** 5. Check the integrity of overflow pages.
6719** 6. Recursively call checkTreePage on all children.
6720** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00006721** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00006722** the root of the tree.
6723*/
6724static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00006725 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00006726 int iPage, /* Page number of the page to check */
6727 MemPage *pParent, /* Parent page */
drh74161702006-02-24 02:53:49 +00006728 char *zParentContext /* Parent context */
drh5eddca62001-06-30 21:53:53 +00006729){
6730 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00006731 int i, rc, depth, d2, pgno, cnt;
drh43605152004-05-29 21:46:49 +00006732 int hdr, cellStart;
6733 int nCell;
drhda200cc2004-05-09 11:51:38 +00006734 u8 *data;
danielk1977aef0bf62005-12-30 16:28:01 +00006735 BtShared *pBt;
drh4f26bb62005-09-08 14:17:20 +00006736 int usableSize;
drh5eddca62001-06-30 21:53:53 +00006737 char zContext[100];
shane0af3f892008-11-12 04:55:34 +00006738 char *hit = 0;
drh5eddca62001-06-30 21:53:53 +00006739
drh5bb3eb92007-05-04 13:15:55 +00006740 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
danielk1977ef73ee92004-11-06 12:26:07 +00006741
drh5eddca62001-06-30 21:53:53 +00006742 /* Check that the page exists
6743 */
drhd9cb6ac2005-10-20 07:28:17 +00006744 pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00006745 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00006746 if( iPage==0 ) return 0;
6747 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh16a9b832007-05-05 18:39:25 +00006748 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
drh2e38c322004-09-03 18:38:44 +00006749 checkAppendMsg(pCheck, zContext,
6750 "unable to get the page. error code=%d", rc);
drh5eddca62001-06-30 21:53:53 +00006751 return 0;
6752 }
danielk197771d5d2c2008-09-29 11:49:47 +00006753 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
drh16a9b832007-05-05 18:39:25 +00006754 checkAppendMsg(pCheck, zContext,
6755 "sqlite3BtreeInitPage() returns error code %d", rc);
drh91025292004-05-03 19:49:32 +00006756 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00006757 return 0;
6758 }
6759
6760 /* Check out all the cells.
6761 */
6762 depth = 0;
drh1dcdbc02007-01-27 02:24:54 +00006763 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
drh6f11bef2004-05-13 01:12:56 +00006764 u8 *pCell;
6765 int sz;
6766 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00006767
6768 /* Check payload overflow pages
6769 */
drh5bb3eb92007-05-04 13:15:55 +00006770 sqlite3_snprintf(sizeof(zContext), zContext,
6771 "On tree page %d cell %d: ", iPage, i);
danielk19771cc5ed82007-05-16 17:28:43 +00006772 pCell = findCell(pPage,i);
drh16a9b832007-05-05 18:39:25 +00006773 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00006774 sz = info.nData;
6775 if( !pPage->intKey ) sz += info.nKey;
drh72365832007-03-06 15:53:44 +00006776 assert( sz==info.nPayload );
drh6f11bef2004-05-13 01:12:56 +00006777 if( sz>info.nLocal ){
drhb6f41482004-05-14 01:58:11 +00006778 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
danielk1977afcdd022004-10-31 16:25:42 +00006779 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6780#ifndef SQLITE_OMIT_AUTOVACUUM
6781 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006782 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
danielk1977afcdd022004-10-31 16:25:42 +00006783 }
6784#endif
6785 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00006786 }
6787
6788 /* Check sanity of left child page.
6789 */
drhda200cc2004-05-09 11:51:38 +00006790 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006791 pgno = get4byte(pCell);
danielk1977afcdd022004-10-31 16:25:42 +00006792#ifndef SQLITE_OMIT_AUTOVACUUM
6793 if( pBt->autoVacuum ){
6794 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6795 }
6796#endif
drh74161702006-02-24 02:53:49 +00006797 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
drhda200cc2004-05-09 11:51:38 +00006798 if( i>0 && d2!=depth ){
6799 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6800 }
6801 depth = d2;
drh5eddca62001-06-30 21:53:53 +00006802 }
drh5eddca62001-06-30 21:53:53 +00006803 }
drhda200cc2004-05-09 11:51:38 +00006804 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006805 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh5bb3eb92007-05-04 13:15:55 +00006806 sqlite3_snprintf(sizeof(zContext), zContext,
6807 "On page %d at right child: ", iPage);
danielk1977afcdd022004-10-31 16:25:42 +00006808#ifndef SQLITE_OMIT_AUTOVACUUM
6809 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006810 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
danielk1977afcdd022004-10-31 16:25:42 +00006811 }
6812#endif
drh74161702006-02-24 02:53:49 +00006813 checkTreePage(pCheck, pgno, pPage, zContext);
drhda200cc2004-05-09 11:51:38 +00006814 }
drh5eddca62001-06-30 21:53:53 +00006815
6816 /* Check for complete coverage of the page
6817 */
drhda200cc2004-05-09 11:51:38 +00006818 data = pPage->aData;
6819 hdr = pPage->hdrOffset;
drhf7141992008-06-19 00:16:08 +00006820 hit = sqlite3PageMalloc( pBt->pageSize );
drhc890fec2008-08-01 20:10:08 +00006821 if( hit==0 ){
6822 pCheck->mallocFailed = 1;
6823 }else{
shane5780ebd2008-11-11 17:36:30 +00006824 u16 contentOffset = get2byte(&data[hdr+5]);
6825 if (contentOffset > usableSize) {
6826 checkAppendMsg(pCheck, 0,
6827 "Corruption detected in header on page %d",iPage,0);
shane0af3f892008-11-12 04:55:34 +00006828 goto check_page_abort;
shane5780ebd2008-11-11 17:36:30 +00006829 }
6830 memset(hit+contentOffset, 0, usableSize-contentOffset);
6831 memset(hit, 1, contentOffset);
drh2e38c322004-09-03 18:38:44 +00006832 nCell = get2byte(&data[hdr+3]);
6833 cellStart = hdr + 12 - 4*pPage->leaf;
6834 for(i=0; i<nCell; i++){
6835 int pc = get2byte(&data[cellStart+i*2]);
danielk1977daca5432008-08-25 11:57:16 +00006836 u16 size = 1024;
drh2e38c322004-09-03 18:38:44 +00006837 int j;
danielk1977daca5432008-08-25 11:57:16 +00006838 if( pc<=usableSize ){
6839 size = cellSizePtr(pPage, &data[pc]);
6840 }
danielk19777701e812005-01-10 12:59:51 +00006841 if( (pc+size-1)>=usableSize || pc<0 ){
6842 checkAppendMsg(pCheck, 0,
6843 "Corruption detected in cell %d on page %d",i,iPage,0);
6844 }else{
6845 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6846 }
drh2e38c322004-09-03 18:38:44 +00006847 }
6848 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6849 cnt++){
6850 int size = get2byte(&data[i+2]);
6851 int j;
danielk19777701e812005-01-10 12:59:51 +00006852 if( (i+size-1)>=usableSize || i<0 ){
6853 checkAppendMsg(pCheck, 0,
6854 "Corruption detected in cell %d on page %d",i,iPage,0);
6855 }else{
6856 for(j=i+size-1; j>=i; j--) hit[j]++;
6857 }
drh2e38c322004-09-03 18:38:44 +00006858 i = get2byte(&data[i]);
6859 }
6860 for(i=cnt=0; i<usableSize; i++){
6861 if( hit[i]==0 ){
6862 cnt++;
6863 }else if( hit[i]>1 ){
6864 checkAppendMsg(pCheck, 0,
6865 "Multiple uses for byte %d of page %d", i, iPage);
6866 break;
6867 }
6868 }
6869 if( cnt!=data[hdr+7] ){
6870 checkAppendMsg(pCheck, 0,
6871 "Fragmented space is %d byte reported as %d on page %d",
6872 cnt, data[hdr+7], iPage);
drh5eddca62001-06-30 21:53:53 +00006873 }
6874 }
shane0af3f892008-11-12 04:55:34 +00006875check_page_abort:
6876 if (hit) sqlite3PageFree(hit);
drh6019e162001-07-02 17:51:45 +00006877
drh4b70f112004-05-02 21:12:19 +00006878 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00006879 return depth+1;
drh5eddca62001-06-30 21:53:53 +00006880}
drhb7f91642004-10-31 02:22:47 +00006881#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006882
drhb7f91642004-10-31 02:22:47 +00006883#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006884/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.
**
** Write the number of errors seen to *pnErr.  Except for some memory
** allocation errors, an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.
drh5eddca62001-06-30 21:53:53 +00006893*/
drh1dcdbc02007-01-27 02:24:54 +00006894char *sqlite3BtreeIntegrityCheck(
6895 Btree *p, /* The btree to be checked */
6896 int *aRoot, /* An array of root pages numbers for individual trees */
6897 int nRoot, /* Number of entries in aRoot[] */
6898 int mxErr, /* Stop reporting errors after this many */
6899 int *pnErr /* Write number of errors seen to this variable */
6900){
drh5eddca62001-06-30 21:53:53 +00006901 int i;
6902 int nRef;
drhaaab5722002-02-19 13:39:21 +00006903 IntegrityCk sCheck;
danielk1977aef0bf62005-12-30 16:28:01 +00006904 BtShared *pBt = p->pBt;
drhf089aa42008-07-08 19:34:06 +00006905 char zErr[100];
drh5eddca62001-06-30 21:53:53 +00006906
drhd677b3d2007-08-20 22:48:41 +00006907 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006908 pBt->db = p->db;
danielk19773b8a05f2007-03-19 17:44:26 +00006909 nRef = sqlite3PagerRefcount(pBt->pPager);
danielk1977aef0bf62005-12-30 16:28:01 +00006910 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
drhc890fec2008-08-01 20:10:08 +00006911 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00006912 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00006913 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
drhefc251d2001-07-01 22:12:01 +00006914 }
drh5eddca62001-06-30 21:53:53 +00006915 sCheck.pBt = pBt;
6916 sCheck.pPager = pBt->pPager;
danielk1977ad0132d2008-06-07 08:58:22 +00006917 sCheck.nPage = pagerPagecount(sCheck.pPager);
drh1dcdbc02007-01-27 02:24:54 +00006918 sCheck.mxErr = mxErr;
6919 sCheck.nErr = 0;
drhc890fec2008-08-01 20:10:08 +00006920 sCheck.mallocFailed = 0;
drh1dcdbc02007-01-27 02:24:54 +00006921 *pnErr = 0;
danielk1977e5321f02007-04-27 07:05:44 +00006922#ifndef SQLITE_OMIT_AUTOVACUUM
6923 if( pBt->nTrunc!=0 ){
6924 sCheck.nPage = pBt->nTrunc;
6925 }
6926#endif
drh0de8c112002-07-06 16:32:14 +00006927 if( sCheck.nPage==0 ){
6928 unlockBtreeIfUnused(pBt);
drhd677b3d2007-08-20 22:48:41 +00006929 sqlite3BtreeLeave(p);
drh0de8c112002-07-06 16:32:14 +00006930 return 0;
6931 }
drhe5ae5732008-06-15 02:51:47 +00006932 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
danielk1977ac245ec2005-01-14 13:50:11 +00006933 if( !sCheck.anRef ){
6934 unlockBtreeIfUnused(pBt);
drh1dcdbc02007-01-27 02:24:54 +00006935 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00006936 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00006937 return 0;
danielk1977ac245ec2005-01-14 13:50:11 +00006938 }
drhda200cc2004-05-09 11:51:38 +00006939 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh42cac6d2004-11-20 20:31:11 +00006940 i = PENDING_BYTE_PAGE(pBt);
drh1f595712004-06-15 01:40:29 +00006941 if( i<=sCheck.nPage ){
6942 sCheck.anRef[i] = 1;
6943 }
drhf089aa42008-07-08 19:34:06 +00006944 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
drh5eddca62001-06-30 21:53:53 +00006945
6946 /* Check the integrity of the freelist
6947 */
drha34b6762004-05-07 13:30:42 +00006948 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6949 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00006950
6951 /* Check all the tables.
6952 */
drh1dcdbc02007-01-27 02:24:54 +00006953 for(i=0; i<nRoot && sCheck.mxErr; i++){
drh4ff6dfa2002-03-03 23:06:00 +00006954 if( aRoot[i]==0 ) continue;
danielk1977687566d2004-11-02 12:56:41 +00006955#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006956 if( pBt->autoVacuum && aRoot[i]>1 ){
6957 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6958 }
6959#endif
drh74161702006-02-24 02:53:49 +00006960 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
drh5eddca62001-06-30 21:53:53 +00006961 }
6962
6963 /* Make sure every page in the file is referenced
6964 */
drh1dcdbc02007-01-27 02:24:54 +00006965 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
danielk1977afcdd022004-10-31 16:25:42 +00006966#ifdef SQLITE_OMIT_AUTOVACUUM
drh5eddca62001-06-30 21:53:53 +00006967 if( sCheck.anRef[i]==0 ){
drh2e38c322004-09-03 18:38:44 +00006968 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
drh5eddca62001-06-30 21:53:53 +00006969 }
danielk1977afcdd022004-10-31 16:25:42 +00006970#else
6971 /* If the database supports auto-vacuum, make sure no tables contain
6972 ** references to pointer-map pages.
6973 */
6974 if( sCheck.anRef[i]==0 &&
danielk1977266664d2006-02-10 08:24:21 +00006975 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006976 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6977 }
6978 if( sCheck.anRef[i]!=0 &&
danielk1977266664d2006-02-10 08:24:21 +00006979 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00006980 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6981 }
6982#endif
drh5eddca62001-06-30 21:53:53 +00006983 }
6984
6985 /* Make sure this analysis did not leave any unref() pages
6986 */
drh5e00f6c2001-09-13 13:46:56 +00006987 unlockBtreeIfUnused(pBt);
danielk19773b8a05f2007-03-19 17:44:26 +00006988 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
drh2e38c322004-09-03 18:38:44 +00006989 checkAppendMsg(&sCheck, 0,
drh5eddca62001-06-30 21:53:53 +00006990 "Outstanding page count goes from %d to %d during this analysis",
danielk19773b8a05f2007-03-19 17:44:26 +00006991 nRef, sqlite3PagerRefcount(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00006992 );
drh5eddca62001-06-30 21:53:53 +00006993 }
6994
6995 /* Clean up and report errors.
6996 */
drhd677b3d2007-08-20 22:48:41 +00006997 sqlite3BtreeLeave(p);
drh17435752007-08-16 04:30:38 +00006998 sqlite3_free(sCheck.anRef);
drhc890fec2008-08-01 20:10:08 +00006999 if( sCheck.mallocFailed ){
7000 sqlite3StrAccumReset(&sCheck.errMsg);
7001 *pnErr = sCheck.nErr+1;
7002 return 0;
7003 }
drh1dcdbc02007-01-27 02:24:54 +00007004 *pnErr = sCheck.nErr;
drhf089aa42008-07-08 19:34:06 +00007005 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7006 return sqlite3StrAccumFinish(&sCheck.errMsg);
drh5eddca62001-06-30 21:53:53 +00007007}
drhb7f91642004-10-31 02:22:47 +00007008#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
paulb95a8862003-04-01 21:16:41 +00007009
drh73509ee2003-04-06 20:44:45 +00007010/*
7011** Return the full pathname of the underlying database file.
drhd0679ed2007-08-28 22:24:34 +00007012**
7013** The pager filename is invariant as long as the pager is
7014** open so it is safe to access without the BtShared mutex.
drh73509ee2003-04-06 20:44:45 +00007015*/
danielk1977aef0bf62005-12-30 16:28:01 +00007016const char *sqlite3BtreeGetFilename(Btree *p){
7017 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007018 return sqlite3PagerFilename(p->pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00007019}
7020
7021/*
danielk19775865e3d2004-06-14 06:03:57 +00007022** Return the pathname of the directory that contains the database file.
drhd0679ed2007-08-28 22:24:34 +00007023**
7024** The pager directory name is invariant as long as the pager is
7025** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007026*/
danielk1977aef0bf62005-12-30 16:28:01 +00007027const char *sqlite3BtreeGetDirname(Btree *p){
7028 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007029 return sqlite3PagerDirname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007030}
7031
7032/*
7033** Return the pathname of the journal file for this database. The return
7034** value of this routine is the same regardless of whether the journal file
7035** has been created or not.
drhd0679ed2007-08-28 22:24:34 +00007036**
7037** The pager journal filename is invariant as long as the pager is
7038** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007039*/
danielk1977aef0bf62005-12-30 16:28:01 +00007040const char *sqlite3BtreeGetJournalname(Btree *p){
7041 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007042 return sqlite3PagerJournalname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007043}
7044
drhb7f91642004-10-31 02:22:47 +00007045#ifndef SQLITE_OMIT_VACUUM
danielk19775865e3d2004-06-14 06:03:57 +00007046/*
drhf7c57532003-04-25 13:22:51 +00007047** Copy the complete content of pBtFrom into pBtTo. A transaction
7048** must be active for both files.
7049**
danielk1977f653d782008-03-20 11:04:21 +00007050** The size of file pTo may be reduced by this operation.
7051** If anything goes wrong, the transaction on pTo is rolled back.
7052**
7053** If successful, CommitPhaseOne() may be called on pTo before returning.
7054** The caller should finish committing the transaction on pTo by calling
7055** sqlite3BtreeCommit().
drh73509ee2003-04-06 20:44:45 +00007056*/
drhd677b3d2007-08-20 22:48:41 +00007057static int btreeCopyFile(Btree *pTo, Btree *pFrom){
drhf7c57532003-04-25 13:22:51 +00007058 int rc = SQLITE_OK;
danielk1977f653d782008-03-20 11:04:21 +00007059 Pgno i;
7060
7061 Pgno nFromPage; /* Number of pages in pFrom */
7062 Pgno nToPage; /* Number of pages in pTo */
7063 Pgno nNewPage; /* Number of pages in pTo after the copy */
7064
7065 Pgno iSkip; /* Pending byte page in pTo */
7066 int nToPageSize; /* Page size of pTo in bytes */
7067 int nFromPageSize; /* Page size of pFrom in bytes */
drhf7c57532003-04-25 13:22:51 +00007068
danielk1977aef0bf62005-12-30 16:28:01 +00007069 BtShared *pBtTo = pTo->pBt;
7070 BtShared *pBtFrom = pFrom->pBt;
drhe5fe6902007-12-07 18:55:28 +00007071 pBtTo->db = pTo->db;
7072 pBtFrom->db = pFrom->db;
danielk1977f653d782008-03-20 11:04:21 +00007073
7074 nToPageSize = pBtTo->pageSize;
7075 nFromPageSize = pBtFrom->pageSize;
danielk1977aef0bf62005-12-30 16:28:01 +00007076
7077 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
danielk1977ee5741e2004-05-31 10:01:34 +00007078 return SQLITE_ERROR;
7079 }
danielk1977f653d782008-03-20 11:04:21 +00007080 if( pBtTo->pCursor ){
7081 return SQLITE_BUSY;
drhf7c57532003-04-25 13:22:51 +00007082 }
drh538f5702007-04-13 02:14:30 +00007083
danielk1977ad0132d2008-06-07 08:58:22 +00007084 nToPage = pagerPagecount(pBtTo->pPager);
7085 nFromPage = pagerPagecount(pBtFrom->pPager);
danielk1977f653d782008-03-20 11:04:21 +00007086 iSkip = PENDING_BYTE_PAGE(pBtTo);
7087
7088 /* Variable nNewPage is the number of pages required to store the
7089 ** contents of pFrom using the current page-size of pTo.
drh538f5702007-04-13 02:14:30 +00007090 */
danielk1977f653d782008-03-20 11:04:21 +00007091 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
7092 (i64)nToPageSize;
7093
7094 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7095
7096 /* Journal the original page.
7097 **
7098 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7099 ** in database *pTo (before the copy). This page is never written
7100 ** into the journal file. Unless i==iSkip or the page was not
7101 ** present in pTo before the copy operation, journal page i from pTo.
7102 */
7103 if( i!=iSkip && i<=nToPage ){
danielk19774abd5442008-05-05 15:26:50 +00007104 DbPage *pDbPage = 0;
danielk1977f653d782008-03-20 11:04:21 +00007105 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
danielk19774abd5442008-05-05 15:26:50 +00007106 if( rc==SQLITE_OK ){
7107 rc = sqlite3PagerWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007108 if( rc==SQLITE_OK && i>nFromPage ){
7109 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7110 ** that is because the names of those procedures do not exactly
7111 ** represent what they do. Write() really means "put this page in the
7112 ** rollback journal and mark it as dirty so that it will be written
7113 ** to the database file later." DontWrite() undoes the second part of
7114 ** that and prevents the page from being written to the database. The
7115 ** page is still on the rollback journal, though. And that is the
7116 ** whole point of this block: to put pages on the rollback journal.
7117 */
danielk1977a1fa00d2008-08-27 15:16:33 +00007118 rc = sqlite3PagerDontWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007119 }
7120 sqlite3PagerUnref(pDbPage);
danielk1977f653d782008-03-20 11:04:21 +00007121 }
danielk1977f653d782008-03-20 11:04:21 +00007122 }
7123
7124 /* Overwrite the data in page i of the target database */
7125 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7126
7127 DbPage *pToPage = 0;
7128 sqlite3_int64 iOff;
7129
7130 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7131 if( rc==SQLITE_OK ){
7132 rc = sqlite3PagerWrite(pToPage);
7133 }
7134
7135 for(
7136 iOff=(i-1)*nToPageSize;
7137 rc==SQLITE_OK && iOff<i*nToPageSize;
7138 iOff += nFromPageSize
7139 ){
7140 DbPage *pFromPage = 0;
7141 Pgno iFrom = (iOff/nFromPageSize)+1;
7142
7143 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7144 continue;
7145 }
7146
7147 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7148 if( rc==SQLITE_OK ){
7149 char *zTo = sqlite3PagerGetData(pToPage);
7150 char *zFrom = sqlite3PagerGetData(pFromPage);
7151 int nCopy;
7152
7153 if( nFromPageSize>=nToPageSize ){
7154 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7155 nCopy = nToPageSize;
7156 }else{
7157 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7158 nCopy = nFromPageSize;
7159 }
7160
7161 memcpy(zTo, zFrom, nCopy);
danielk19772f78fc62008-09-30 09:31:45 +00007162 sqlite3PagerUnref(pFromPage);
danielk1977f653d782008-03-20 11:04:21 +00007163 }
7164 }
7165
danielk1977eaa06f62008-09-18 17:34:44 +00007166 if( pToPage ){
7167 MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
7168 p->isInit = 0;
7169 sqlite3PagerUnref(pToPage);
7170 }
danielk1977f653d782008-03-20 11:04:21 +00007171 }
drh2e6d11b2003-04-25 15:37:57 +00007172 }
danielk1977f653d782008-03-20 11:04:21 +00007173
7174 /* If things have worked so far, the database file may need to be
7175 ** truncated. The complex part is that it may need to be truncated to
7176 ** a size that is not an integer multiple of nToPageSize - the current
7177 ** page size used by the pager associated with B-Tree pTo.
7178 **
7179 ** For example, say the page-size of pTo is 2048 bytes and the original
7180 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7181 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7182 */
7183 if( rc==SQLITE_OK ){
7184 if( nFromPageSize!=nToPageSize ){
7185 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7186 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7187 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7188 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7189
7190 assert( iSize<=iNow );
7191
7192 /* Commit phase one syncs the journal file associated with pTo
7193 ** containing the original data. It does not sync the database file
7194 ** itself. After doing this it is safe to use OsTruncate() and other
7195 ** file APIs on the database file directly.
7196 */
7197 pBtTo->db = pTo->db;
7198 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7199 if( iSize<iNow && rc==SQLITE_OK ){
7200 rc = sqlite3OsTruncate(pFile, iSize);
7201 }
7202
7203 /* The loop that copied data from database pFrom to pTo did not
7204 ** populate the locking page of database pTo. If the page-size of
7205 ** pFrom is smaller than that of pTo, this means some data will
7206 ** not have been copied.
7207 **
7208 ** This block copies the missing data from database pFrom to pTo
7209 ** using file APIs. This is safe because at this point we know that
7210 ** all of the original data from pTo has been synced into the
7211 ** journal file. At this point it would be safe to do anything at
7212 ** all to the database file except truncate it to zero bytes.
7213 */
7214 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7215 i64 iOff;
7216 for(
7217 iOff=iPending;
7218 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7219 iOff += nFromPageSize
7220 ){
7221 DbPage *pFromPage = 0;
7222 Pgno iFrom = (iOff/nFromPageSize)+1;
7223
7224 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7225 continue;
7226 }
7227
7228 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7229 if( rc==SQLITE_OK ){
7230 char *zFrom = sqlite3PagerGetData(pFromPage);
danielk197706249db2008-08-23 16:17:55 +00007231 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
danielk1977f653d782008-03-20 11:04:21 +00007232 sqlite3PagerUnref(pFromPage);
7233 }
7234 }
7235 }
7236
7237 /* Sync the database file */
7238 if( rc==SQLITE_OK ){
7239 rc = sqlite3PagerSync(pBtTo->pPager);
7240 }
7241 }else{
7242 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7243 }
7244 if( rc==SQLITE_OK ){
7245 pBtTo->pageSizeFixed = 0;
7246 }
drh2e6d11b2003-04-25 15:37:57 +00007247 }
drh538f5702007-04-13 02:14:30 +00007248
drhf7c57532003-04-25 13:22:51 +00007249 if( rc ){
danielk1977aef0bf62005-12-30 16:28:01 +00007250 sqlite3BtreeRollback(pTo);
drhf7c57532003-04-25 13:22:51 +00007251 }
danielk1977f653d782008-03-20 11:04:21 +00007252
drhf7c57532003-04-25 13:22:51 +00007253 return rc;
drh73509ee2003-04-06 20:44:45 +00007254}
drhd677b3d2007-08-20 22:48:41 +00007255int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7256 int rc;
7257 sqlite3BtreeEnter(pTo);
7258 sqlite3BtreeEnter(pFrom);
7259 rc = btreeCopyFile(pTo, pFrom);
7260 sqlite3BtreeLeave(pFrom);
7261 sqlite3BtreeLeave(pTo);
7262 return rc;
7263}
7264
drhb7f91642004-10-31 02:22:47 +00007265#endif /* SQLITE_OMIT_VACUUM */
danielk19771d850a72004-05-31 08:26:49 +00007266
7267/*
7268** Return non-zero if a transaction is active.
7269*/
danielk1977aef0bf62005-12-30 16:28:01 +00007270int sqlite3BtreeIsInTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007271 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00007272 return (p && (p->inTrans==TRANS_WRITE));
danielk19771d850a72004-05-31 08:26:49 +00007273}
7274
7275/*
7276** Return non-zero if a statement transaction is active.
7277*/
danielk1977aef0bf62005-12-30 16:28:01 +00007278int sqlite3BtreeIsInStmt(Btree *p){
drh1fee73e2007-08-29 04:00:57 +00007279 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00007280 return (p->pBt && p->pBt->inStmt);
danielk19771d850a72004-05-31 08:26:49 +00007281}
danielk197713adf8a2004-06-03 16:08:41 +00007282
7283/*
danielk19772372c2b2006-06-27 16:34:56 +00007284** Return non-zero if a read (or write) transaction is active.
7285*/
7286int sqlite3BtreeIsInReadTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007287 assert( sqlite3_mutex_held(p->db->mutex) );
danielk19772372c2b2006-06-27 16:34:56 +00007288 return (p && (p->inTrans!=TRANS_NONE));
7289}
7290
7291/*
danielk1977da184232006-01-05 11:34:32 +00007292** This function returns a pointer to a blob of memory associated with
drh85b623f2007-12-13 21:54:09 +00007293** a single shared-btree. The memory is used by client code for its own
danielk1977da184232006-01-05 11:34:32 +00007294** purposes (for example, to store a high-level schema associated with
7295** the shared-btree). The btree layer manages reference counting issues.
7296**
7297** The first time this is called on a shared-btree, nBytes bytes of memory
7298** are allocated, zeroed, and returned to the caller. For each subsequent
7299** call the nBytes parameter is ignored and a pointer to the same blob
7300** of memory returned.
7301**
danielk1977171bfed2008-06-23 09:50:50 +00007302** If the nBytes parameter is 0 and the blob of memory has not yet been
7303** allocated, a null pointer is returned. If the blob has already been
7304** allocated, it is returned as normal.
7305**
danielk1977da184232006-01-05 11:34:32 +00007306** Just before the shared-btree is closed, the function passed as the
7307** xFree argument when the memory allocation was made is invoked on the
drh17435752007-08-16 04:30:38 +00007308** blob of allocated memory. This function should not call sqlite3_free()
danielk1977da184232006-01-05 11:34:32 +00007309** on the memory, the btree layer does that.
7310*/
7311void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7312 BtShared *pBt = p->pBt;
drh27641702007-08-22 02:56:42 +00007313 sqlite3BtreeEnter(p);
danielk1977171bfed2008-06-23 09:50:50 +00007314 if( !pBt->pSchema && nBytes ){
drh17435752007-08-16 04:30:38 +00007315 pBt->pSchema = sqlite3MallocZero(nBytes);
danielk1977da184232006-01-05 11:34:32 +00007316 pBt->xFreeSchema = xFree;
7317 }
drh27641702007-08-22 02:56:42 +00007318 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00007319 return pBt->pSchema;
7320}
7321
danielk1977c87d34d2006-01-06 13:00:28 +00007322/*
7323** Return true if another user of the same shared btree as the argument
7324** handle holds an exclusive lock on the sqlite_master table.
7325*/
7326int sqlite3BtreeSchemaLocked(Btree *p){
drh27641702007-08-22 02:56:42 +00007327 int rc;
drhe5fe6902007-12-07 18:55:28 +00007328 assert( sqlite3_mutex_held(p->db->mutex) );
drh27641702007-08-22 02:56:42 +00007329 sqlite3BtreeEnter(p);
7330 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7331 sqlite3BtreeLeave(p);
7332 return rc;
danielk1977c87d34d2006-01-06 13:00:28 +00007333}
7334
drha154dcd2006-03-22 22:10:07 +00007335
7336#ifndef SQLITE_OMIT_SHARED_CACHE
7337/*
7338** Obtain a lock on the table whose root page is iTab. The
7339** lock is a write lock if isWritelock is true or a read lock
7340** if it is false.
7341*/
danielk1977c00da102006-01-07 13:21:04 +00007342int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
danielk19772e94d4d2006-01-09 05:36:27 +00007343 int rc = SQLITE_OK;
drh6a9ad3d2008-04-02 16:29:30 +00007344 if( p->sharable ){
7345 u8 lockType = READ_LOCK + isWriteLock;
7346 assert( READ_LOCK+1==WRITE_LOCK );
7347 assert( isWriteLock==0 || isWriteLock==1 );
7348 sqlite3BtreeEnter(p);
7349 rc = queryTableLock(p, iTab, lockType);
7350 if( rc==SQLITE_OK ){
7351 rc = lockTable(p, iTab, lockType);
7352 }
7353 sqlite3BtreeLeave(p);
danielk1977c00da102006-01-07 13:21:04 +00007354 }
7355 return rc;
7356}
drha154dcd2006-03-22 22:10:07 +00007357#endif
danielk1977b82e7ed2006-01-11 14:09:31 +00007358
danielk1977b4e9af92007-05-01 17:49:49 +00007359#ifndef SQLITE_OMIT_INCRBLOB
7360/*
7361** Argument pCsr must be a cursor opened for writing on an
7362** INTKEY table currently pointing at a valid table entry.
7363** This function modifies the data stored as part of that entry.
7364** Only the data content may only be modified, it is not possible
7365** to change the length of the data stored.
7366*/
danielk1977dcbb5d32007-05-04 18:36:44 +00007367int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
drh1fee73e2007-08-29 04:00:57 +00007368 assert( cursorHoldsMutex(pCsr) );
drhe5fe6902007-12-07 18:55:28 +00007369 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007370 assert(pCsr->isIncrblobHandle);
danielk19773588ceb2008-06-10 17:30:26 +00007371
drha3460582008-07-11 21:02:53 +00007372 restoreCursorPosition(pCsr);
danielk19773588ceb2008-06-10 17:30:26 +00007373 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7374 if( pCsr->eState!=CURSOR_VALID ){
7375 return SQLITE_ABORT;
danielk1977dcbb5d32007-05-04 18:36:44 +00007376 }
7377
danielk1977d04417962007-05-02 13:16:30 +00007378 /* Check some preconditions:
danielk1977dcbb5d32007-05-04 18:36:44 +00007379 ** (a) the cursor is open for writing,
7380 ** (b) there is no read-lock on the table being modified and
7381 ** (c) the cursor points at a valid row of an intKey table.
danielk1977d04417962007-05-02 13:16:30 +00007382 */
danielk1977d04417962007-05-02 13:16:30 +00007383 if( !pCsr->wrFlag ){
danielk1977dcbb5d32007-05-04 18:36:44 +00007384 return SQLITE_READONLY;
danielk1977d04417962007-05-02 13:16:30 +00007385 }
drhd0679ed2007-08-28 22:24:34 +00007386 assert( !pCsr->pBt->readOnly
7387 && pCsr->pBt->inTransaction==TRANS_WRITE );
danielk19773588ceb2008-06-10 17:30:26 +00007388 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
danielk1977d04417962007-05-02 13:16:30 +00007389 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7390 }
danielk197771d5d2c2008-09-29 11:49:47 +00007391 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
danielk1977d04417962007-05-02 13:16:30 +00007392 return SQLITE_ERROR;
danielk1977b4e9af92007-05-01 17:49:49 +00007393 }
7394
danielk19779f8d6402007-05-02 17:48:45 +00007395 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
danielk1977b4e9af92007-05-01 17:49:49 +00007396}
danielk19772dec9702007-05-02 16:48:37 +00007397
7398/*
7399** Set a flag on this cursor to cache the locations of pages from the
danielk1977da107192007-05-04 08:32:13 +00007400** overflow list for the current row. This is used by cursors opened
7401** for incremental blob IO only.
7402**
7403** This function sets a flag only. The actual page location cache
7404** (stored in BtCursor.aOverflow[]) is allocated and used by function
7405** accessPayload() (the worker function for sqlite3BtreeData() and
7406** sqlite3BtreePutData()).
danielk19772dec9702007-05-02 16:48:37 +00007407*/
7408void sqlite3BtreeCacheOverflow(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00007409 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00007410 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007411 assert(!pCur->isIncrblobHandle);
danielk19772dec9702007-05-02 16:48:37 +00007412 assert(!pCur->aOverflow);
danielk1977dcbb5d32007-05-04 18:36:44 +00007413 pCur->isIncrblobHandle = 1;
danielk19772dec9702007-05-02 16:48:37 +00007414}
danielk1977b4e9af92007-05-01 17:49:49 +00007415#endif