blob: 34b4ffa442f1f80c6e840cd7a31e1a007f9f7599 [file] [log] [blame]
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.549 2008/12/17 17:30:26 danielk1977 Exp $
**
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of file format and an overview of operation.
*/
drha3152892007-05-05 11:48:52 +000018#include "btreeInt.h"
paulb95a8862003-04-01 21:16:41 +000019
drh8c42ca92001-06-22 19:15:00 +000020/*
drha3152892007-05-05 11:48:52 +000021** The header string that appears at the beginning of every
22** SQLite database.
drh556b2a22005-06-14 16:04:05 +000023*/
drh556b2a22005-06-14 16:04:05 +000024static const char zMagicHeader[] = SQLITE_FILE_HEADER;
drh08ed44e2001-04-29 23:32:55 +000025
/*
** Tracing support.  Tracing is compiled out by default; change the
** "#if 0" below to "#if 1" and set the global variable sqlite3BtreeTrace
** to 1 to enable tracing using the TRACE macro.
**
** TRACE takes one argument: a complete, parenthesized printf() argument
** list, for example TRACE(("page %d\n", pgno)).
**
** When enabled, the macro expands to a single do{...}while(0) statement
** so that it can safely be used as the body of an unbraced if/else.  A
** bare "if(...){...}" expansion followed by the caller's ";" would
** detach a following "else" from its "if".
*/
#if 0
int sqlite3BtreeTrace=0;  /* True to enable tracing */
# define TRACE(X) \
   do{ if(sqlite3BtreeTrace){ printf X; fflush(stdout); } }while(0)
#else
# define TRACE(X)
#endif
drh615ae552005-01-16 23:21:00 +000036
/*
** VVA_ONLY(X) wraps small pieces of setup code (a variable
** initialization, for instance) that exist only to support a later
** assert() statement.  "VVA" stands for "Verification, Validation, and
** Accreditation".
**
** When NDEBUG is defined (assert() disabled) the wrapped code vanishes;
** otherwise it is emitted verbatim.
*/
#ifdef NDEBUG
# define VVA_ONLY(X)
#else
# define VVA_ONLY(X)  X
#endif
50
drh86f8c192007-08-22 00:39:19 +000051
52
drhe53831d2007-08-17 01:14:38 +000053#ifndef SQLITE_OMIT_SHARED_CACHE
54/*
danielk1977502b4e02008-09-02 14:07:24 +000055** A list of BtShared objects that are eligible for participation
56** in shared cache. This variable has file scope during normal builds,
57** but the test harness needs to access it so we make it global for
58** test builds.
drhe53831d2007-08-17 01:14:38 +000059*/
60#ifdef SQLITE_TEST
drh78f82d12008-09-02 00:52:52 +000061BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000062#else
drh78f82d12008-09-02 00:52:52 +000063static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
drhe53831d2007-08-17 01:14:38 +000064#endif
drhe53831d2007-08-17 01:14:38 +000065#endif /* SQLITE_OMIT_SHARED_CACHE */
66
67#ifndef SQLITE_OMIT_SHARED_CACHE
68/*
69** Enable or disable the shared pager and schema features.
70**
71** This routine has no effect on existing database connections.
72** The shared cache setting effects only future calls to
73** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
74*/
75int sqlite3_enable_shared_cache(int enable){
danielk1977502b4e02008-09-02 14:07:24 +000076 sqlite3GlobalConfig.sharedCacheEnabled = enable;
drhe53831d2007-08-17 01:14:38 +000077 return SQLITE_OK;
78}
79#endif
80
drhd677b3d2007-08-20 22:48:41 +000081
drh615ae552005-01-16 23:21:00 +000082/*
drh66cbd152004-09-01 16:12:25 +000083** Forward declaration
84*/
danielk19773588ceb2008-06-10 17:30:26 +000085static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
drh66cbd152004-09-01 16:12:25 +000086
danielk1977aef0bf62005-12-30 16:28:01 +000087
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** queryTableLock(), lockTable() and unlockAllTables() maintain the
  ** BtShared.pLock linked list of shared-cache table-level locks.  When
  ** the library is built without the shared-cache feature, each BtShared
  ** structure has exactly one user, so no such locking is needed and the
  ** three routines collapse into no-ops: the two that yield a result
  ** evaluate to SQLITE_OK and unlockAllTables() expands to nothing.
  */
# define queryTableLock(a,b,c) SQLITE_OK
# define lockTable(a,b,c) SQLITE_OK
# define unlockAllTables(a)
#endif
danielk1977aef0bf62005-12-30 16:28:01 +0000101
drhe53831d2007-08-17 01:14:38 +0000102#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977da184232006-01-05 11:34:32 +0000103/*
danielk1977aef0bf62005-12-30 16:28:01 +0000104** Query to see if btree handle p may obtain a lock of type eLock
105** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
106** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
danielk1977c87d34d2006-01-06 13:00:28 +0000107** SQLITE_LOCKED if not.
danielk1977aef0bf62005-12-30 16:28:01 +0000108*/
109static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
110 BtShared *pBt = p->pBt;
111 BtLock *pIter;
112
drh1fee73e2007-08-29 04:00:57 +0000113 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000114 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
115 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000116
danielk1977da184232006-01-05 11:34:32 +0000117 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000118 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000119 return SQLITE_OK;
120 }
121
danielk1977641b0f42007-12-21 04:47:25 +0000122 /* If some other connection is holding an exclusive lock, the
123 ** requested lock may not be obtained.
124 */
125 if( pBt->pExclusive && pBt->pExclusive!=p ){
126 return SQLITE_LOCKED;
127 }
128
danielk1977da184232006-01-05 11:34:32 +0000129 /* This (along with lockTable()) is where the ReadUncommitted flag is
130 ** dealt with. If the caller is querying for a read-lock and the flag is
131 ** set, it is unconditionally granted - even if there are write-locks
132 ** on the table. If a write-lock is requested, the ReadUncommitted flag
133 ** is not considered.
134 **
135 ** In function lockTable(), if a read-lock is demanded and the
136 ** ReadUncommitted flag is set, no entry is added to the locks list
137 ** (BtShared.pLock).
138 **
139 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
140 ** not create or respect table locks. The locking procedure for a
141 ** write-cursor does not change.
142 */
143 if(
drhe5fe6902007-12-07 18:55:28 +0000144 0==(p->db->flags&SQLITE_ReadUncommitted) ||
danielk1977da184232006-01-05 11:34:32 +0000145 eLock==WRITE_LOCK ||
drh47ded162006-01-06 01:42:58 +0000146 iTab==MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000147 ){
148 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
149 if( pIter->pBtree!=p && pIter->iTable==iTab &&
150 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
danielk1977c87d34d2006-01-06 13:00:28 +0000151 return SQLITE_LOCKED;
danielk1977da184232006-01-05 11:34:32 +0000152 }
danielk1977aef0bf62005-12-30 16:28:01 +0000153 }
154 }
155 return SQLITE_OK;
156}
drhe53831d2007-08-17 01:14:38 +0000157#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000158
drhe53831d2007-08-17 01:14:38 +0000159#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000160/*
161** Add a lock on the table with root-page iTable to the shared-btree used
162** by Btree handle p. Parameter eLock must be either READ_LOCK or
163** WRITE_LOCK.
164**
165** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
166** SQLITE_NOMEM may also be returned.
167*/
168static int lockTable(Btree *p, Pgno iTable, u8 eLock){
169 BtShared *pBt = p->pBt;
170 BtLock *pLock = 0;
171 BtLock *pIter;
172
drh1fee73e2007-08-29 04:00:57 +0000173 assert( sqlite3BtreeHoldsMutex(p) );
drhfa67c3c2008-07-11 02:21:40 +0000174 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
175 assert( p->db!=0 );
drhd677b3d2007-08-20 22:48:41 +0000176
danielk1977da184232006-01-05 11:34:32 +0000177 /* This is a no-op if the shared-cache is not enabled */
drhe53831d2007-08-17 01:14:38 +0000178 if( !p->sharable ){
danielk1977da184232006-01-05 11:34:32 +0000179 return SQLITE_OK;
180 }
181
danielk1977aef0bf62005-12-30 16:28:01 +0000182 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
183
danielk1977da184232006-01-05 11:34:32 +0000184 /* If the read-uncommitted flag is set and a read-lock is requested,
185 ** return early without adding an entry to the BtShared.pLock list. See
186 ** comment in function queryTableLock() for more info on handling
187 ** the ReadUncommitted flag.
188 */
189 if(
drhe5fe6902007-12-07 18:55:28 +0000190 (p->db->flags&SQLITE_ReadUncommitted) &&
danielk1977da184232006-01-05 11:34:32 +0000191 (eLock==READ_LOCK) &&
drh47ded162006-01-06 01:42:58 +0000192 iTable!=MASTER_ROOT
danielk1977da184232006-01-05 11:34:32 +0000193 ){
194 return SQLITE_OK;
195 }
196
danielk1977aef0bf62005-12-30 16:28:01 +0000197 /* First search the list for an existing lock on this table. */
198 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
199 if( pIter->iTable==iTable && pIter->pBtree==p ){
200 pLock = pIter;
201 break;
202 }
203 }
204
205 /* If the above search did not find a BtLock struct associating Btree p
206 ** with table iTable, allocate one and link it into the list.
207 */
208 if( !pLock ){
drh17435752007-08-16 04:30:38 +0000209 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
danielk1977aef0bf62005-12-30 16:28:01 +0000210 if( !pLock ){
211 return SQLITE_NOMEM;
212 }
213 pLock->iTable = iTable;
214 pLock->pBtree = p;
215 pLock->pNext = pBt->pLock;
216 pBt->pLock = pLock;
217 }
218
219 /* Set the BtLock.eLock variable to the maximum of the current lock
220 ** and the requested lock. This means if a write-lock was already held
221 ** and a read-lock requested, we don't incorrectly downgrade the lock.
222 */
223 assert( WRITE_LOCK>READ_LOCK );
danielk19775118b912005-12-30 16:31:53 +0000224 if( eLock>pLock->eLock ){
225 pLock->eLock = eLock;
226 }
danielk1977aef0bf62005-12-30 16:28:01 +0000227
228 return SQLITE_OK;
229}
drhe53831d2007-08-17 01:14:38 +0000230#endif /* !SQLITE_OMIT_SHARED_CACHE */
danielk1977aef0bf62005-12-30 16:28:01 +0000231
drhe53831d2007-08-17 01:14:38 +0000232#ifndef SQLITE_OMIT_SHARED_CACHE
danielk1977aef0bf62005-12-30 16:28:01 +0000233/*
234** Release all the table locks (locks obtained via calls to the lockTable()
235** procedure) held by Btree handle p.
236*/
237static void unlockAllTables(Btree *p){
danielk1977641b0f42007-12-21 04:47:25 +0000238 BtShared *pBt = p->pBt;
239 BtLock **ppIter = &pBt->pLock;
danielk1977da184232006-01-05 11:34:32 +0000240
drh1fee73e2007-08-29 04:00:57 +0000241 assert( sqlite3BtreeHoldsMutex(p) );
drhe53831d2007-08-17 01:14:38 +0000242 assert( p->sharable || 0==*ppIter );
danielk1977da184232006-01-05 11:34:32 +0000243
danielk1977aef0bf62005-12-30 16:28:01 +0000244 while( *ppIter ){
245 BtLock *pLock = *ppIter;
danielk1977641b0f42007-12-21 04:47:25 +0000246 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
danielk1977aef0bf62005-12-30 16:28:01 +0000247 if( pLock->pBtree==p ){
248 *ppIter = pLock->pNext;
drh17435752007-08-16 04:30:38 +0000249 sqlite3_free(pLock);
danielk1977aef0bf62005-12-30 16:28:01 +0000250 }else{
251 ppIter = &pLock->pNext;
252 }
253 }
danielk1977641b0f42007-12-21 04:47:25 +0000254
255 if( pBt->pExclusive==p ){
256 pBt->pExclusive = 0;
257 }
danielk1977aef0bf62005-12-30 16:28:01 +0000258}
259#endif /* SQLITE_OMIT_SHARED_CACHE */
260
drh980b1a72006-08-16 16:42:48 +0000261static void releasePage(MemPage *pPage); /* Forward reference */
262
drh1fee73e2007-08-29 04:00:57 +0000263/*
264** Verify that the cursor holds a mutex on the BtShared
265*/
266#ifndef NDEBUG
267static int cursorHoldsMutex(BtCursor *p){
drhff0587c2007-08-29 17:43:19 +0000268 return sqlite3_mutex_held(p->pBt->mutex);
drh1fee73e2007-08-29 04:00:57 +0000269}
270#endif
271
272
danielk197792d4d7a2007-05-04 12:05:56 +0000273#ifndef SQLITE_OMIT_INCRBLOB
274/*
275** Invalidate the overflow page-list cache for cursor pCur, if any.
276*/
277static void invalidateOverflowCache(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000278 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000279 sqlite3_free(pCur->aOverflow);
danielk197792d4d7a2007-05-04 12:05:56 +0000280 pCur->aOverflow = 0;
281}
282
283/*
284** Invalidate the overflow page-list cache for all cursors opened
285** on the shared btree structure pBt.
286*/
287static void invalidateAllOverflowCache(BtShared *pBt){
288 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000289 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +0000290 for(p=pBt->pCursor; p; p=p->pNext){
291 invalidateOverflowCache(p);
292 }
293}
294#else
295 #define invalidateOverflowCache(x)
296 #define invalidateAllOverflowCache(x)
297#endif
298
drh980b1a72006-08-16 16:42:48 +0000299/*
300** Save the current cursor position in the variables BtCursor.nKey
301** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
302*/
303static int saveCursorPosition(BtCursor *pCur){
304 int rc;
305
306 assert( CURSOR_VALID==pCur->eState );
307 assert( 0==pCur->pKey );
drh1fee73e2007-08-29 04:00:57 +0000308 assert( cursorHoldsMutex(pCur) );
drh980b1a72006-08-16 16:42:48 +0000309
310 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
311
312 /* If this is an intKey table, then the above call to BtreeKeySize()
313 ** stores the integer key in pCur->nKey. In this case this value is
314 ** all that is required. Otherwise, if pCur is not open on an intKey
315 ** table, then malloc space for and store the pCur->nKey bytes of key
316 ** data.
317 */
danielk197771d5d2c2008-09-29 11:49:47 +0000318 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
drhf49661a2008-12-10 16:45:50 +0000319 void *pKey = sqlite3Malloc( (int)pCur->nKey );
drh980b1a72006-08-16 16:42:48 +0000320 if( pKey ){
drhf49661a2008-12-10 16:45:50 +0000321 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
drh980b1a72006-08-16 16:42:48 +0000322 if( rc==SQLITE_OK ){
323 pCur->pKey = pKey;
324 }else{
drh17435752007-08-16 04:30:38 +0000325 sqlite3_free(pKey);
drh980b1a72006-08-16 16:42:48 +0000326 }
327 }else{
328 rc = SQLITE_NOMEM;
329 }
330 }
danielk197771d5d2c2008-09-29 11:49:47 +0000331 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
drh980b1a72006-08-16 16:42:48 +0000332
333 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +0000334 int i;
335 for(i=0; i<=pCur->iPage; i++){
336 releasePage(pCur->apPage[i]);
337 pCur->apPage[i] = 0;
338 }
339 pCur->iPage = -1;
drh980b1a72006-08-16 16:42:48 +0000340 pCur->eState = CURSOR_REQUIRESEEK;
341 }
342
danielk197792d4d7a2007-05-04 12:05:56 +0000343 invalidateOverflowCache(pCur);
drh980b1a72006-08-16 16:42:48 +0000344 return rc;
345}
346
347/*
348** Save the positions of all cursors except pExcept open on the table
349** with root-page iRoot. Usually, this is called just before cursor
350** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
351*/
352static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
353 BtCursor *p;
drh1fee73e2007-08-29 04:00:57 +0000354 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +0000355 assert( pExcept==0 || pExcept->pBt==pBt );
drh980b1a72006-08-16 16:42:48 +0000356 for(p=pBt->pCursor; p; p=p->pNext){
357 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
358 p->eState==CURSOR_VALID ){
359 int rc = saveCursorPosition(p);
360 if( SQLITE_OK!=rc ){
361 return rc;
362 }
363 }
364 }
365 return SQLITE_OK;
366}
367
368/*
drhbf700f32007-03-31 02:36:44 +0000369** Clear the current cursor position.
370*/
danielk1977be51a652008-10-08 17:58:48 +0000371void sqlite3BtreeClearCursor(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +0000372 assert( cursorHoldsMutex(pCur) );
drh17435752007-08-16 04:30:38 +0000373 sqlite3_free(pCur->pKey);
drhbf700f32007-03-31 02:36:44 +0000374 pCur->pKey = 0;
375 pCur->eState = CURSOR_INVALID;
376}
377
378/*
drh980b1a72006-08-16 16:42:48 +0000379** Restore the cursor to the position it was in (or as close to as possible)
380** when saveCursorPosition() was called. Note that this call deletes the
381** saved position info stored by saveCursorPosition(), so there can be
drha3460582008-07-11 21:02:53 +0000382** at most one effective restoreCursorPosition() call after each
drh980b1a72006-08-16 16:42:48 +0000383** saveCursorPosition().
drh980b1a72006-08-16 16:42:48 +0000384*/
drha3460582008-07-11 21:02:53 +0000385int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
drhbf700f32007-03-31 02:36:44 +0000386 int rc;
drh1fee73e2007-08-29 04:00:57 +0000387 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +0000388 assert( pCur->eState>=CURSOR_REQUIRESEEK );
389 if( pCur->eState==CURSOR_FAULT ){
390 return pCur->skip;
391 }
drh980b1a72006-08-16 16:42:48 +0000392 pCur->eState = CURSOR_INVALID;
drhe63d9992008-08-13 19:11:48 +0000393 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
drh980b1a72006-08-16 16:42:48 +0000394 if( rc==SQLITE_OK ){
drh17435752007-08-16 04:30:38 +0000395 sqlite3_free(pCur->pKey);
drh980b1a72006-08-16 16:42:48 +0000396 pCur->pKey = 0;
drhbf700f32007-03-31 02:36:44 +0000397 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
drh980b1a72006-08-16 16:42:48 +0000398 }
399 return rc;
400}
401
/*
** Restore the saved position of cursor p, but only if one is pending.
** Evaluates to SQLITE_OK without doing anything when p->eState is below
** CURSOR_REQUIRESEEK; otherwise evaluates to the result of
** sqlite3BtreeRestoreCursorPosition(p).
**
** The parameter is parenthesized so that any pointer-valued expression
** (for example "pBase+1") can be passed.  Note that p can be evaluated
** twice, so the argument must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         sqlite3BtreeRestoreCursorPosition(p) : \
         SQLITE_OK)
drh980b1a72006-08-16 16:42:48 +0000406
drha3460582008-07-11 21:02:53 +0000407/*
408** Determine whether or not a cursor has moved from the position it
drhdfe88ec2008-11-03 20:55:06 +0000409** was last placed at. Cursors can move when the row they are pointing
drha3460582008-07-11 21:02:53 +0000410** at is deleted out from under them.
411**
412** This routine returns an error code if something goes wrong. The
413** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
414*/
415int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
416 int rc;
417
418 rc = restoreCursorPosition(pCur);
419 if( rc ){
420 *pHasMoved = 1;
421 return rc;
422 }
423 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
424 *pHasMoved = 1;
425 }else{
426 *pHasMoved = 0;
427 }
428 return SQLITE_OK;
429}
430
danielk1977599fcba2004-11-08 07:13:13 +0000431#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977afcdd022004-10-31 16:25:42 +0000432/*
drha3152892007-05-05 11:48:52 +0000433** Given a page number of a regular database page, return the page
434** number for the pointer-map page that contains the entry for the
435** input page number.
danielk1977afcdd022004-10-31 16:25:42 +0000436*/
danielk1977266664d2006-02-10 08:24:21 +0000437static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
danielk197789d40042008-11-17 14:20:56 +0000438 int nPagesPerMapPage;
439 Pgno iPtrMap, ret;
drh1fee73e2007-08-29 04:00:57 +0000440 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000441 nPagesPerMapPage = (pBt->usableSize/5)+1;
442 iPtrMap = (pgno-2)/nPagesPerMapPage;
443 ret = (iPtrMap*nPagesPerMapPage) + 2;
danielk1977266664d2006-02-10 08:24:21 +0000444 if( ret==PENDING_BYTE_PAGE(pBt) ){
445 ret++;
446 }
447 return ret;
448}
danielk1977a19df672004-11-03 11:37:07 +0000449
danielk1977afcdd022004-10-31 16:25:42 +0000450/*
danielk1977afcdd022004-10-31 16:25:42 +0000451** Write an entry into the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000452**
453** This routine updates the pointer map entry for page number 'key'
454** so that it maps to type 'eType' and parent page number 'pgno'.
455** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000456*/
danielk1977aef0bf62005-12-30 16:28:01 +0000457static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
danielk19773b8a05f2007-03-19 17:44:26 +0000458 DbPage *pDbPage; /* The pointer map page */
459 u8 *pPtrmap; /* The pointer map data */
460 Pgno iPtrmap; /* The pointer map page number */
461 int offset; /* Offset in pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000462 int rc;
463
drh1fee73e2007-08-29 04:00:57 +0000464 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977266664d2006-02-10 08:24:21 +0000465 /* The master-journal page number must never be used as a pointer map page */
466 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
467
danielk1977ac11ee62005-01-15 12:45:51 +0000468 assert( pBt->autoVacuum );
danielk1977fdb7cdb2005-01-17 02:12:18 +0000469 if( key==0 ){
drh49285702005-09-17 15:20:26 +0000470 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +0000471 }
danielk1977266664d2006-02-10 08:24:21 +0000472 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000473 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977687566d2004-11-02 12:56:41 +0000474 if( rc!=SQLITE_OK ){
danielk1977afcdd022004-10-31 16:25:42 +0000475 return rc;
476 }
danielk19778c666b12008-07-18 09:34:57 +0000477 offset = PTRMAP_PTROFFSET(iPtrmap, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000478 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000479
drh615ae552005-01-16 23:21:00 +0000480 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
481 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
danielk19773b8a05f2007-03-19 17:44:26 +0000482 rc = sqlite3PagerWrite(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000483 if( rc==SQLITE_OK ){
484 pPtrmap[offset] = eType;
485 put4byte(&pPtrmap[offset+1], parent);
danielk1977afcdd022004-10-31 16:25:42 +0000486 }
danielk1977afcdd022004-10-31 16:25:42 +0000487 }
488
danielk19773b8a05f2007-03-19 17:44:26 +0000489 sqlite3PagerUnref(pDbPage);
danielk19775558a8a2005-01-17 07:53:44 +0000490 return rc;
danielk1977afcdd022004-10-31 16:25:42 +0000491}
492
493/*
494** Read an entry from the pointer map.
danielk1977687566d2004-11-02 12:56:41 +0000495**
496** This routine retrieves the pointer map entry for page 'key', writing
497** the type and parent page number to *pEType and *pPgno respectively.
498** An error code is returned if something goes wrong, otherwise SQLITE_OK.
danielk1977afcdd022004-10-31 16:25:42 +0000499*/
danielk1977aef0bf62005-12-30 16:28:01 +0000500static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
danielk19773b8a05f2007-03-19 17:44:26 +0000501 DbPage *pDbPage; /* The pointer map page */
danielk1977afcdd022004-10-31 16:25:42 +0000502 int iPtrmap; /* Pointer map page index */
503 u8 *pPtrmap; /* Pointer map page data */
504 int offset; /* Offset of entry in pointer map */
505 int rc;
506
drh1fee73e2007-08-29 04:00:57 +0000507 assert( sqlite3_mutex_held(pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000508
danielk1977266664d2006-02-10 08:24:21 +0000509 iPtrmap = PTRMAP_PAGENO(pBt, key);
danielk19773b8a05f2007-03-19 17:44:26 +0000510 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000511 if( rc!=0 ){
512 return rc;
513 }
danielk19773b8a05f2007-03-19 17:44:26 +0000514 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
danielk1977afcdd022004-10-31 16:25:42 +0000515
danielk19778c666b12008-07-18 09:34:57 +0000516 offset = PTRMAP_PTROFFSET(iPtrmap, key);
drh43617e92006-03-06 20:55:46 +0000517 assert( pEType!=0 );
518 *pEType = pPtrmap[offset];
danielk1977687566d2004-11-02 12:56:41 +0000519 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
danielk1977afcdd022004-10-31 16:25:42 +0000520
danielk19773b8a05f2007-03-19 17:44:26 +0000521 sqlite3PagerUnref(pDbPage);
drh49285702005-09-17 15:20:26 +0000522 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
danielk1977afcdd022004-10-31 16:25:42 +0000523 return SQLITE_OK;
524}
525
danielk197785d90ca2008-07-19 14:25:15 +0000526#else /* if defined SQLITE_OMIT_AUTOVACUUM */
527 #define ptrmapPut(w,x,y,z) SQLITE_OK
528 #define ptrmapGet(w,x,y,z) SQLITE_OK
529 #define ptrmapPutOvfl(y,z) SQLITE_OK
530#endif
danielk1977afcdd022004-10-31 16:25:42 +0000531
/*
** Given a btree page P and a cell index I (0 means the first cell on
** the page, 1 means the second cell, and so forth) evaluate to a
** pointer to the cell content.
**
** The 2-byte entry number I of the cell pointer array, which begins at
** offset P->cellOffset of the page data, is read and masked with
** P->maskPage to give the offset of the cell within P->aData.
**
** This macro works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  (&(P)->aData[(P)->maskPage & get2byte((P)->aData + (P)->cellOffset + 2*(I))])
drh43605152004-05-29 21:46:49 +0000541
542/*
drh93a960a2008-07-10 00:32:42 +0000543** This a more complex version of findCell() that works for
drh43605152004-05-29 21:46:49 +0000544** pages that do contain overflow cells. See insert
545*/
546static u8 *findOverflowCell(MemPage *pPage, int iCell){
547 int i;
drh1fee73e2007-08-29 04:00:57 +0000548 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +0000549 for(i=pPage->nOverflow-1; i>=0; i--){
drh6d08b4d2004-07-20 12:45:22 +0000550 int k;
551 struct _OvflCell *pOvfl;
552 pOvfl = &pPage->aOvfl[i];
553 k = pOvfl->idx;
554 if( k<=iCell ){
555 if( k==iCell ){
556 return pOvfl->pCell;
drh43605152004-05-29 21:46:49 +0000557 }
558 iCell--;
559 }
560 }
danielk19771cc5ed82007-05-16 17:28:43 +0000561 return findCell(pPage, iCell);
drh43605152004-05-29 21:46:49 +0000562}
563
564/*
565** Parse a cell content block and fill in the CellInfo structure. There
drh16a9b832007-05-05 18:39:25 +0000566** are two versions of this function. sqlite3BtreeParseCell() takes a
567** cell index as the second argument and sqlite3BtreeParseCellPtr()
568** takes a pointer to the body of the cell as its second argument.
danielk19771cc5ed82007-05-16 17:28:43 +0000569**
570** Within this file, the parseCell() macro can be called instead of
571** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
drh43605152004-05-29 21:46:49 +0000572*/
drh16a9b832007-05-05 18:39:25 +0000573void sqlite3BtreeParseCellPtr(
drh3aac2dd2004-04-26 14:10:20 +0000574 MemPage *pPage, /* Page containing the cell */
drh43605152004-05-29 21:46:49 +0000575 u8 *pCell, /* Pointer to the cell text. */
drh6f11bef2004-05-13 01:12:56 +0000576 CellInfo *pInfo /* Fill in this structure */
drh3aac2dd2004-04-26 14:10:20 +0000577){
drhf49661a2008-12-10 16:45:50 +0000578 u16 n; /* Number bytes in cell content header */
drh271efa52004-05-30 19:19:05 +0000579 u32 nPayload; /* Number of bytes of cell payload */
drh43605152004-05-29 21:46:49 +0000580
drh1fee73e2007-08-29 04:00:57 +0000581 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +0000582
drh43605152004-05-29 21:46:49 +0000583 pInfo->pCell = pCell;
drhab01f612004-05-22 02:55:23 +0000584 assert( pPage->leaf==0 || pPage->leaf==1 );
drh271efa52004-05-30 19:19:05 +0000585 n = pPage->childPtrSize;
586 assert( n==4-4*pPage->leaf );
drh504b6982006-01-22 21:52:56 +0000587 if( pPage->intKey ){
drh79df1f42008-07-18 00:57:33 +0000588 if( pPage->hasData ){
589 n += getVarint32(&pCell[n], nPayload);
590 }else{
591 nPayload = 0;
592 }
drh1bd10f82008-12-10 21:19:56 +0000593 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
drh79df1f42008-07-18 00:57:33 +0000594 pInfo->nData = nPayload;
drh504b6982006-01-22 21:52:56 +0000595 }else{
drh79df1f42008-07-18 00:57:33 +0000596 pInfo->nData = 0;
597 n += getVarint32(&pCell[n], nPayload);
598 pInfo->nKey = nPayload;
drh6f11bef2004-05-13 01:12:56 +0000599 }
drh72365832007-03-06 15:53:44 +0000600 pInfo->nPayload = nPayload;
drh504b6982006-01-22 21:52:56 +0000601 pInfo->nHeader = n;
drh79df1f42008-07-18 00:57:33 +0000602 if( likely(nPayload<=pPage->maxLocal) ){
drh271efa52004-05-30 19:19:05 +0000603 /* This is the (easy) common case where the entire payload fits
604 ** on the local page. No overflow is required.
605 */
606 int nSize; /* Total size of cell content in bytes */
drh79df1f42008-07-18 00:57:33 +0000607 nSize = nPayload + n;
drhf49661a2008-12-10 16:45:50 +0000608 pInfo->nLocal = (u16)nPayload;
drh6f11bef2004-05-13 01:12:56 +0000609 pInfo->iOverflow = 0;
drh79df1f42008-07-18 00:57:33 +0000610 if( (nSize & ~3)==0 ){
drh271efa52004-05-30 19:19:05 +0000611 nSize = 4; /* Minimum cell size is 4 */
drh43605152004-05-29 21:46:49 +0000612 }
drh1bd10f82008-12-10 21:19:56 +0000613 pInfo->nSize = (u16)nSize;
drh6f11bef2004-05-13 01:12:56 +0000614 }else{
drh271efa52004-05-30 19:19:05 +0000615 /* If the payload will not fit completely on the local page, we have
616 ** to decide how much to store locally and how much to spill onto
617 ** overflow pages. The strategy is to minimize the amount of unused
618 ** space on overflow pages while keeping the amount of local storage
619 ** in between minLocal and maxLocal.
620 **
621 ** Warning: changing the way overflow payload is distributed in any
622 ** way will result in an incompatible file format.
623 */
624 int minLocal; /* Minimum amount of payload held locally */
625 int maxLocal; /* Maximum amount of payload held locally */
626 int surplus; /* Overflow payload available for local storage */
627
628 minLocal = pPage->minLocal;
629 maxLocal = pPage->maxLocal;
630 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
drh6f11bef2004-05-13 01:12:56 +0000631 if( surplus <= maxLocal ){
drhf49661a2008-12-10 16:45:50 +0000632 pInfo->nLocal = (u16)surplus;
drh6f11bef2004-05-13 01:12:56 +0000633 }else{
drhf49661a2008-12-10 16:45:50 +0000634 pInfo->nLocal = (u16)minLocal;
drh6f11bef2004-05-13 01:12:56 +0000635 }
drhf49661a2008-12-10 16:45:50 +0000636 pInfo->iOverflow = (u16)(pInfo->nLocal + n);
drh6f11bef2004-05-13 01:12:56 +0000637 pInfo->nSize = pInfo->iOverflow + 4;
638 }
drh3aac2dd2004-04-26 14:10:20 +0000639}
danielk19771cc5ed82007-05-16 17:28:43 +0000640#define parseCell(pPage, iCell, pInfo) \
641 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
drh16a9b832007-05-05 18:39:25 +0000642void sqlite3BtreeParseCell(
drh43605152004-05-29 21:46:49 +0000643 MemPage *pPage, /* Page containing the cell */
644 int iCell, /* The cell index. First cell is 0 */
645 CellInfo *pInfo /* Fill in this structure */
646){
danielk19771cc5ed82007-05-16 17:28:43 +0000647 parseCell(pPage, iCell, pInfo);
drh43605152004-05-29 21:46:49 +0000648}
drh3aac2dd2004-04-26 14:10:20 +0000649
650/*
drh43605152004-05-29 21:46:49 +0000651** Compute the total number of bytes that a Cell needs in the cell
652** data area of the btree-page. The return number includes the cell
653** data header and the local payload, but not any overflow page or
654** the space used by the cell pointer.
drh3b7511c2001-05-26 13:15:44 +0000655*/
danielk1977bc6ada42004-06-30 08:20:16 +0000656#ifndef NDEBUG
drha9121e42008-02-19 14:59:35 +0000657static u16 cellSize(MemPage *pPage, int iCell){
drh6f11bef2004-05-13 01:12:56 +0000658 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000659 sqlite3BtreeParseCell(pPage, iCell, &info);
drh43605152004-05-29 21:46:49 +0000660 return info.nSize;
661}
danielk1977bc6ada42004-06-30 08:20:16 +0000662#endif
drha9121e42008-02-19 14:59:35 +0000663static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
drh43605152004-05-29 21:46:49 +0000664 CellInfo info;
drh16a9b832007-05-05 18:39:25 +0000665 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +0000666 return info.nSize;
drh3b7511c2001-05-26 13:15:44 +0000667}
668
danielk197779a40da2005-01-16 08:00:01 +0000669#ifndef SQLITE_OMIT_AUTOVACUUM
drh3b7511c2001-05-26 13:15:44 +0000670/*
danielk197726836652005-01-17 01:33:13 +0000671** If the cell pCell, part of page pPage contains a pointer
danielk197779a40da2005-01-16 08:00:01 +0000672** to an overflow page, insert an entry into the pointer-map
673** for the overflow page.
danielk1977ac11ee62005-01-15 12:45:51 +0000674*/
danielk197726836652005-01-17 01:33:13 +0000675static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
drhfa67c3c2008-07-11 02:21:40 +0000676 CellInfo info;
677 assert( pCell!=0 );
678 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
679 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
680 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
681 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
682 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977ac11ee62005-01-15 12:45:51 +0000683 }
danielk197779a40da2005-01-16 08:00:01 +0000684 return SQLITE_OK;
danielk1977ac11ee62005-01-15 12:45:51 +0000685}
danielk197726836652005-01-17 01:33:13 +0000686/*
687** If the cell with index iCell on page pPage contains a pointer
688** to an overflow page, insert an entry into the pointer-map
689** for the overflow page.
690*/
691static int ptrmapPutOvfl(MemPage *pPage, int iCell){
692 u8 *pCell;
drh1fee73e2007-08-29 04:00:57 +0000693 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197726836652005-01-17 01:33:13 +0000694 pCell = findOverflowCell(pPage, iCell);
695 return ptrmapPutOvflPtr(pPage, pCell);
696}
danielk197779a40da2005-01-16 08:00:01 +0000697#endif
698
danielk1977ac11ee62005-01-15 12:45:51 +0000699
drhda200cc2004-05-09 11:51:38 +0000700/*
drh72f82862001-05-24 21:06:34 +0000701** Defragment the page given. All Cells are moved to the
drh3a4a2d42005-11-24 14:24:28 +0000702** end of the page and all free space is collected into one
703** big FreeBlk that occurs in between the header and cell
drh31beae92005-11-24 14:34:36 +0000704** pointer array and the cell content area.
drh365d68f2001-05-11 11:02:46 +0000705*/
shane0af3f892008-11-12 04:55:34 +0000706static int defragmentPage(MemPage *pPage){
drh43605152004-05-29 21:46:49 +0000707 int i; /* Loop counter */
708 int pc; /* Address of a i-th cell */
709 int addr; /* Offset of first byte after cell pointer array */
710 int hdr; /* Offset to the page header */
711 int size; /* Size of a cell */
712 int usableSize; /* Number of usable bytes on a page */
713 int cellOffset; /* Offset to the cell pointer array */
drh281b21d2008-08-22 12:57:08 +0000714 int cbrk; /* Offset to the cell content area */
drh43605152004-05-29 21:46:49 +0000715 int nCell; /* Number of cells on the page */
drh2e38c322004-09-03 18:38:44 +0000716 unsigned char *data; /* The page data */
717 unsigned char *temp; /* Temp area for cell content */
drh2af926b2001-05-15 00:39:25 +0000718
danielk19773b8a05f2007-03-19 17:44:26 +0000719 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000720 assert( pPage->pBt!=0 );
drh90f5ecb2004-07-22 01:19:35 +0000721 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
drh43605152004-05-29 21:46:49 +0000722 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +0000723 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh26b79942007-11-28 16:19:56 +0000724 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
drh43605152004-05-29 21:46:49 +0000725 data = pPage->aData;
drh9e572e62004-04-23 23:43:10 +0000726 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000727 cellOffset = pPage->cellOffset;
728 nCell = pPage->nCell;
729 assert( nCell==get2byte(&data[hdr+3]) );
730 usableSize = pPage->pBt->usableSize;
drh281b21d2008-08-22 12:57:08 +0000731 cbrk = get2byte(&data[hdr+5]);
732 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
733 cbrk = usableSize;
drh43605152004-05-29 21:46:49 +0000734 for(i=0; i<nCell; i++){
735 u8 *pAddr; /* The i-th cell pointer */
736 pAddr = &data[cellOffset + i*2];
737 pc = get2byte(pAddr);
shanedcc50b72008-11-13 18:29:50 +0000738 if( pc>=usableSize ){
shane0af3f892008-11-12 04:55:34 +0000739 return SQLITE_CORRUPT_BKPT;
740 }
drh43605152004-05-29 21:46:49 +0000741 size = cellSizePtr(pPage, &temp[pc]);
drh281b21d2008-08-22 12:57:08 +0000742 cbrk -= size;
danielk19770d065412008-11-12 18:21:36 +0000743 if( cbrk<cellOffset+2*nCell || pc+size>usableSize ){
shane0af3f892008-11-12 04:55:34 +0000744 return SQLITE_CORRUPT_BKPT;
745 }
danielk19770d065412008-11-12 18:21:36 +0000746 assert( cbrk+size<=usableSize && cbrk>=0 );
drh281b21d2008-08-22 12:57:08 +0000747 memcpy(&data[cbrk], &temp[pc], size);
748 put2byte(pAddr, cbrk);
drh2af926b2001-05-15 00:39:25 +0000749 }
drh281b21d2008-08-22 12:57:08 +0000750 assert( cbrk>=cellOffset+2*nCell );
751 put2byte(&data[hdr+5], cbrk);
drh43605152004-05-29 21:46:49 +0000752 data[hdr+1] = 0;
753 data[hdr+2] = 0;
754 data[hdr+7] = 0;
755 addr = cellOffset+2*nCell;
drh281b21d2008-08-22 12:57:08 +0000756 memset(&data[addr], 0, cbrk-addr);
drhc5053fb2008-11-27 02:22:10 +0000757 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk1977360e6342008-11-12 08:49:51 +0000758 if( cbrk-addr!=pPage->nFree ){
759 return SQLITE_CORRUPT_BKPT;
760 }
shane0af3f892008-11-12 04:55:34 +0000761 return SQLITE_OK;
drh365d68f2001-05-11 11:02:46 +0000762}
763
drha059ad02001-04-17 20:09:11 +0000764/*
drh43605152004-05-29 21:46:49 +0000765** Allocate nByte bytes of space on a page.
drhbd03cae2001-06-02 02:40:57 +0000766**
drh9e572e62004-04-23 23:43:10 +0000767** Return the index into pPage->aData[] of the first byte of
drhfa67c3c2008-07-11 02:21:40 +0000768** the new allocation. The caller guarantees that there is enough
769** space. This routine will never fail.
drh2af926b2001-05-15 00:39:25 +0000770**
drh72f82862001-05-24 21:06:34 +0000771** If the page contains nBytes of free space but does not contain
drh8b2f49b2001-06-08 00:21:52 +0000772** nBytes of contiguous free space, then this routine automatically
773** calls defragementPage() to consolidate all free space before
774** allocating the new chunk.
drh7e3b0a02001-04-28 16:52:40 +0000775*/
drh9e572e62004-04-23 23:43:10 +0000776static int allocateSpace(MemPage *pPage, int nByte){
drh3aac2dd2004-04-26 14:10:20 +0000777 int addr, pc, hdr;
drh9e572e62004-04-23 23:43:10 +0000778 int size;
drh24cd67e2004-05-10 16:18:47 +0000779 int nFrag;
drh43605152004-05-29 21:46:49 +0000780 int top;
781 int nCell;
782 int cellOffset;
drh9e572e62004-04-23 23:43:10 +0000783 unsigned char *data;
drh43605152004-05-29 21:46:49 +0000784
drh9e572e62004-04-23 23:43:10 +0000785 data = pPage->aData;
danielk19773b8a05f2007-03-19 17:44:26 +0000786 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000787 assert( pPage->pBt );
drh1fee73e2007-08-29 04:00:57 +0000788 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhfa67c3c2008-07-11 02:21:40 +0000789 assert( nByte>=0 ); /* Minimum cell size is 4 */
790 assert( pPage->nFree>=nByte );
791 assert( pPage->nOverflow==0 );
drhf49661a2008-12-10 16:45:50 +0000792 pPage->nFree -= (u16)nByte;
drh9e572e62004-04-23 23:43:10 +0000793 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +0000794
795 nFrag = data[hdr+7];
796 if( nFrag<60 ){
797 /* Search the freelist looking for a slot big enough to satisfy the
798 ** space request. */
799 addr = hdr+1;
800 while( (pc = get2byte(&data[addr]))>0 ){
801 size = get2byte(&data[pc+2]);
802 if( size>=nByte ){
drhf49661a2008-12-10 16:45:50 +0000803 int x = size - nByte;
drh43605152004-05-29 21:46:49 +0000804 if( size<nByte+4 ){
805 memcpy(&data[addr], &data[pc], 2);
drhf49661a2008-12-10 16:45:50 +0000806 data[hdr+7] = (u8)(nFrag + x);
drh43605152004-05-29 21:46:49 +0000807 return pc;
808 }else{
drhf49661a2008-12-10 16:45:50 +0000809 put2byte(&data[pc+2], x);
810 return pc + x;
drh43605152004-05-29 21:46:49 +0000811 }
812 }
813 addr = pc;
drh9e572e62004-04-23 23:43:10 +0000814 }
815 }
drh43605152004-05-29 21:46:49 +0000816
817 /* Allocate memory from the gap in between the cell pointer array
818 ** and the cell content area.
819 */
820 top = get2byte(&data[hdr+5]);
821 nCell = get2byte(&data[hdr+3]);
822 cellOffset = pPage->cellOffset;
823 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
danielk1977474b7cc2008-07-09 11:49:46 +0000824 defragmentPage(pPage);
drh43605152004-05-29 21:46:49 +0000825 top = get2byte(&data[hdr+5]);
drh2af926b2001-05-15 00:39:25 +0000826 }
drh43605152004-05-29 21:46:49 +0000827 top -= nByte;
828 assert( cellOffset + 2*nCell <= top );
829 put2byte(&data[hdr+5], top);
drhc5053fb2008-11-27 02:22:10 +0000830 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +0000831 return top;
drh7e3b0a02001-04-28 16:52:40 +0000832}
833
834/*
drh9e572e62004-04-23 23:43:10 +0000835** Return a section of the pPage->aData to the freelist.
836** The first byte of the new free block is pPage->aDisk[start]
837** and the size of the block is "size" bytes.
drh306dc212001-05-21 13:45:10 +0000838**
839** Most of the effort here is involved in coalesing adjacent
840** free blocks into a single big free block.
drh7e3b0a02001-04-28 16:52:40 +0000841*/
shanedcc50b72008-11-13 18:29:50 +0000842static int freeSpace(MemPage *pPage, int start, int size){
drh43605152004-05-29 21:46:49 +0000843 int addr, pbegin, hdr;
drh9e572e62004-04-23 23:43:10 +0000844 unsigned char *data = pPage->aData;
drh2af926b2001-05-15 00:39:25 +0000845
drh9e572e62004-04-23 23:43:10 +0000846 assert( pPage->pBt!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +0000847 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh9e572e62004-04-23 23:43:10 +0000848 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
danielk1977bc6ada42004-06-30 08:20:16 +0000849 assert( (start + size)<=pPage->pBt->usableSize );
drh1fee73e2007-08-29 04:00:57 +0000850 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh34004ce2008-07-11 16:15:17 +0000851 assert( size>=0 ); /* Minimum cell size is 4 */
drh9e572e62004-04-23 23:43:10 +0000852
drhfcce93f2006-02-22 03:08:32 +0000853#ifdef SQLITE_SECURE_DELETE
854 /* Overwrite deleted information with zeros when the SECURE_DELETE
855 ** option is enabled at compile-time */
856 memset(&data[start], 0, size);
857#endif
858
drh9e572e62004-04-23 23:43:10 +0000859 /* Add the space back into the linked list of freeblocks */
drh43605152004-05-29 21:46:49 +0000860 hdr = pPage->hdrOffset;
861 addr = hdr + 1;
drh3aac2dd2004-04-26 14:10:20 +0000862 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
drhb6f41482004-05-14 01:58:11 +0000863 assert( pbegin<=pPage->pBt->usableSize-4 );
shanedcc50b72008-11-13 18:29:50 +0000864 if( pbegin<=addr ) {
865 return SQLITE_CORRUPT_BKPT;
866 }
drh3aac2dd2004-04-26 14:10:20 +0000867 addr = pbegin;
drh2af926b2001-05-15 00:39:25 +0000868 }
shanedcc50b72008-11-13 18:29:50 +0000869 if ( pbegin>pPage->pBt->usableSize-4 ) {
870 return SQLITE_CORRUPT_BKPT;
871 }
drh3aac2dd2004-04-26 14:10:20 +0000872 assert( pbegin>addr || pbegin==0 );
drha34b6762004-05-07 13:30:42 +0000873 put2byte(&data[addr], start);
874 put2byte(&data[start], pbegin);
875 put2byte(&data[start+2], size);
drhf49661a2008-12-10 16:45:50 +0000876 pPage->nFree += (u16)size;
drh9e572e62004-04-23 23:43:10 +0000877
878 /* Coalesce adjacent free blocks */
drh3aac2dd2004-04-26 14:10:20 +0000879 addr = pPage->hdrOffset + 1;
880 while( (pbegin = get2byte(&data[addr]))>0 ){
drhf49661a2008-12-10 16:45:50 +0000881 int pnext, psize, x;
drh3aac2dd2004-04-26 14:10:20 +0000882 assert( pbegin>addr );
drh43605152004-05-29 21:46:49 +0000883 assert( pbegin<=pPage->pBt->usableSize-4 );
drh9e572e62004-04-23 23:43:10 +0000884 pnext = get2byte(&data[pbegin]);
885 psize = get2byte(&data[pbegin+2]);
886 if( pbegin + psize + 3 >= pnext && pnext>0 ){
887 int frag = pnext - (pbegin+psize);
drhf49661a2008-12-10 16:45:50 +0000888 if( (frag<0) || (frag>(int)data[pPage->hdrOffset+7]) ){
shanedcc50b72008-11-13 18:29:50 +0000889 return SQLITE_CORRUPT_BKPT;
890 }
drhf49661a2008-12-10 16:45:50 +0000891 data[pPage->hdrOffset+7] -= (u8)frag;
892 x = get2byte(&data[pnext]);
893 put2byte(&data[pbegin], x);
894 x = pnext + get2byte(&data[pnext+2]) - pbegin;
895 put2byte(&data[pbegin+2], x);
drh9e572e62004-04-23 23:43:10 +0000896 }else{
drh3aac2dd2004-04-26 14:10:20 +0000897 addr = pbegin;
drh9e572e62004-04-23 23:43:10 +0000898 }
899 }
drh7e3b0a02001-04-28 16:52:40 +0000900
drh43605152004-05-29 21:46:49 +0000901 /* If the cell content area begins with a freeblock, remove it. */
902 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
903 int top;
904 pbegin = get2byte(&data[hdr+1]);
905 memcpy(&data[hdr+1], &data[pbegin], 2);
drhf49661a2008-12-10 16:45:50 +0000906 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
907 put2byte(&data[hdr+5], top);
drh4b70f112004-05-02 21:12:19 +0000908 }
drhc5053fb2008-11-27 02:22:10 +0000909 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
shanedcc50b72008-11-13 18:29:50 +0000910 return SQLITE_OK;
drh4b70f112004-05-02 21:12:19 +0000911}
912
913/*
drh271efa52004-05-30 19:19:05 +0000914** Decode the flags byte (the first byte of the header) for a page
915** and initialize fields of the MemPage structure accordingly.
drh44845222008-07-17 18:39:57 +0000916**
917** Only the following combinations are supported. Anything different
918** indicates a corrupt database files:
919**
920** PTF_ZERODATA
921** PTF_ZERODATA | PTF_LEAF
922** PTF_LEAFDATA | PTF_INTKEY
923** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
drh271efa52004-05-30 19:19:05 +0000924*/
drh44845222008-07-17 18:39:57 +0000925static int decodeFlags(MemPage *pPage, int flagByte){
danielk1977aef0bf62005-12-30 16:28:01 +0000926 BtShared *pBt; /* A copy of pPage->pBt */
drh271efa52004-05-30 19:19:05 +0000927
928 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
drh1fee73e2007-08-29 04:00:57 +0000929 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf49661a2008-12-10 16:45:50 +0000930 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );
drh44845222008-07-17 18:39:57 +0000931 flagByte &= ~PTF_LEAF;
932 pPage->childPtrSize = 4-4*pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000933 pBt = pPage->pBt;
drh44845222008-07-17 18:39:57 +0000934 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
935 pPage->intKey = 1;
936 pPage->hasData = pPage->leaf;
drh271efa52004-05-30 19:19:05 +0000937 pPage->maxLocal = pBt->maxLeaf;
938 pPage->minLocal = pBt->minLeaf;
drh44845222008-07-17 18:39:57 +0000939 }else if( flagByte==PTF_ZERODATA ){
940 pPage->intKey = 0;
941 pPage->hasData = 0;
drh271efa52004-05-30 19:19:05 +0000942 pPage->maxLocal = pBt->maxLocal;
943 pPage->minLocal = pBt->minLocal;
drh44845222008-07-17 18:39:57 +0000944 }else{
945 return SQLITE_CORRUPT_BKPT;
drh271efa52004-05-30 19:19:05 +0000946 }
drh44845222008-07-17 18:39:57 +0000947 return SQLITE_OK;
drh271efa52004-05-30 19:19:05 +0000948}
949
950/*
drh7e3b0a02001-04-28 16:52:40 +0000951** Initialize the auxiliary information for a disk block.
drh72f82862001-05-24 21:06:34 +0000952**
953** Return SQLITE_OK on success. If we see that the page does
drhda47d772002-12-02 04:25:19 +0000954** not contain a well-formed database page, then return
drh72f82862001-05-24 21:06:34 +0000955** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
956** guarantee that the page is well-formed. It only shows that
957** we failed to detect any corruption.
drh7e3b0a02001-04-28 16:52:40 +0000958*/
danielk197771d5d2c2008-09-29 11:49:47 +0000959int sqlite3BtreeInitPage(MemPage *pPage){
drh2af926b2001-05-15 00:39:25 +0000960
danielk197771d5d2c2008-09-29 11:49:47 +0000961 assert( pPage->pBt!=0 );
962 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +0000963 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
drhbf4bca52007-09-06 22:19:14 +0000964 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
965 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +0000966
967 if( !pPage->isInit ){
drhf49661a2008-12-10 16:45:50 +0000968 u16 pc; /* Address of a freeblock within pPage->aData[] */
969 u8 hdr; /* Offset to beginning of page header */
danielk197771d5d2c2008-09-29 11:49:47 +0000970 u8 *data; /* Equal to pPage->aData */
971 BtShared *pBt; /* The main btree structure */
drhf49661a2008-12-10 16:45:50 +0000972 u16 usableSize; /* Amount of usable space on each page */
973 u16 cellOffset; /* Offset from start of page to first cell pointer */
974 u16 nFree; /* Number of unused bytes on the page */
975 u16 top; /* First byte of the cell content area */
danielk197771d5d2c2008-09-29 11:49:47 +0000976
977 pBt = pPage->pBt;
978
danielk1977eaa06f62008-09-18 17:34:44 +0000979 hdr = pPage->hdrOffset;
980 data = pPage->aData;
981 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
982 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
983 pPage->maskPage = pBt->pageSize - 1;
984 pPage->nOverflow = 0;
danielk1977eaa06f62008-09-18 17:34:44 +0000985 usableSize = pBt->usableSize;
986 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
987 top = get2byte(&data[hdr+5]);
988 pPage->nCell = get2byte(&data[hdr+3]);
989 if( pPage->nCell>MX_CELL(pBt) ){
990 /* To many cells for a single page. The page must be corrupt */
991 return SQLITE_CORRUPT_BKPT;
992 }
danielk1977eaa06f62008-09-18 17:34:44 +0000993
994 /* Compute the total free space on the page */
995 pc = get2byte(&data[hdr+1]);
996 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
997 while( pc>0 ){
drh1bd10f82008-12-10 21:19:56 +0000998 u16 next, size;
danielk1977eaa06f62008-09-18 17:34:44 +0000999 if( pc>usableSize-4 ){
1000 /* Free block is off the page */
1001 return SQLITE_CORRUPT_BKPT;
1002 }
1003 next = get2byte(&data[pc]);
1004 size = get2byte(&data[pc+2]);
1005 if( next>0 && next<=pc+size+3 ){
1006 /* Free blocks must be in accending order */
1007 return SQLITE_CORRUPT_BKPT;
1008 }
1009 nFree += size;
1010 pc = next;
1011 }
drhf49661a2008-12-10 16:45:50 +00001012 pPage->nFree = (u16)nFree;
danielk1977eaa06f62008-09-18 17:34:44 +00001013 if( nFree>=usableSize ){
1014 /* Free space cannot exceed total page size */
drh49285702005-09-17 15:20:26 +00001015 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001016 }
drh9e572e62004-04-23 23:43:10 +00001017
drh1688c862008-07-18 02:44:17 +00001018#if 0
1019 /* Check that all the offsets in the cell offset array are within range.
1020 **
1021 ** Omitting this consistency check and using the pPage->maskPage mask
1022 ** to prevent overrunning the page buffer in findCell() results in a
1023 ** 2.5% performance gain.
1024 */
1025 {
1026 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1027 u8 *pEnd; /* Pointer to end of cell offset array */
1028 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1029 mask = ~(((u8)(pBt->pageSize>>8))-1);
1030 pEnd = &data[cellOffset + pPage->nCell*2];
1031 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1032 if( pOff!=pEnd ){
1033 return SQLITE_CORRUPT_BKPT;
1034 }
danielk1977e16535f2008-06-11 18:15:29 +00001035 }
drh1688c862008-07-18 02:44:17 +00001036#endif
danielk1977e16535f2008-06-11 18:15:29 +00001037
danielk197771d5d2c2008-09-29 11:49:47 +00001038 pPage->isInit = 1;
1039 }
drh9e572e62004-04-23 23:43:10 +00001040 return SQLITE_OK;
drh7e3b0a02001-04-28 16:52:40 +00001041}
1042
1043/*
drh8b2f49b2001-06-08 00:21:52 +00001044** Set up a raw page so that it looks like a database page holding
1045** no entries.
drhbd03cae2001-06-02 02:40:57 +00001046*/
drh9e572e62004-04-23 23:43:10 +00001047static void zeroPage(MemPage *pPage, int flags){
1048 unsigned char *data = pPage->aData;
danielk1977aef0bf62005-12-30 16:28:01 +00001049 BtShared *pBt = pPage->pBt;
drhf49661a2008-12-10 16:45:50 +00001050 u8 hdr = pPage->hdrOffset;
1051 u16 first;
drh9e572e62004-04-23 23:43:10 +00001052
danielk19773b8a05f2007-03-19 17:44:26 +00001053 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
drhbf4bca52007-09-06 22:19:14 +00001054 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1055 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
danielk19773b8a05f2007-03-19 17:44:26 +00001056 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00001057 assert( sqlite3_mutex_held(pBt->mutex) );
drh1af4a6e2008-07-18 03:32:51 +00001058 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
drh1bd10f82008-12-10 21:19:56 +00001059 data[hdr] = (char)flags;
1060 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
drh43605152004-05-29 21:46:49 +00001061 memset(&data[hdr+1], 0, 4);
1062 data[hdr+7] = 0;
1063 put2byte(&data[hdr+5], pBt->usableSize);
drhb6f41482004-05-14 01:58:11 +00001064 pPage->nFree = pBt->usableSize - first;
drh271efa52004-05-30 19:19:05 +00001065 decodeFlags(pPage, flags);
drh9e572e62004-04-23 23:43:10 +00001066 pPage->hdrOffset = hdr;
drh43605152004-05-29 21:46:49 +00001067 pPage->cellOffset = first;
1068 pPage->nOverflow = 0;
drh1688c862008-07-18 02:44:17 +00001069 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1070 pPage->maskPage = pBt->pageSize - 1;
drh43605152004-05-29 21:46:49 +00001071 pPage->nCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001072 pPage->isInit = 1;
drhbd03cae2001-06-02 02:40:57 +00001073}
1074
drh897a8202008-09-18 01:08:15 +00001075
1076/*
1077** Convert a DbPage obtained from the pager into a MemPage used by
1078** the btree layer.
1079*/
1080static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1081 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1082 pPage->aData = sqlite3PagerGetData(pDbPage);
1083 pPage->pDbPage = pDbPage;
1084 pPage->pBt = pBt;
1085 pPage->pgno = pgno;
1086 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1087 return pPage;
1088}
1089
drhbd03cae2001-06-02 02:40:57 +00001090/*
drh3aac2dd2004-04-26 14:10:20 +00001091** Get a page from the pager. Initialize the MemPage.pBt and
1092** MemPage.aData elements if needed.
drh538f5702007-04-13 02:14:30 +00001093**
1094** If the noContent flag is set, it means that we do not care about
1095** the content of the page at this time. So do not go to the disk
1096** to fetch the content. Just fill in the content with zeros for now.
1097** If in the future we call sqlite3PagerWrite() on this page, that
1098** means we have started to be concerned about content and the disk
1099** read should occur at that point.
drh3aac2dd2004-04-26 14:10:20 +00001100*/
drh16a9b832007-05-05 18:39:25 +00001101int sqlite3BtreeGetPage(
1102 BtShared *pBt, /* The btree */
1103 Pgno pgno, /* Number of the page to fetch */
1104 MemPage **ppPage, /* Return the page in this parameter */
1105 int noContent /* Do not load page content if true */
1106){
drh3aac2dd2004-04-26 14:10:20 +00001107 int rc;
danielk19773b8a05f2007-03-19 17:44:26 +00001108 DbPage *pDbPage;
1109
drh1fee73e2007-08-29 04:00:57 +00001110 assert( sqlite3_mutex_held(pBt->mutex) );
drh538f5702007-04-13 02:14:30 +00001111 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
drh3aac2dd2004-04-26 14:10:20 +00001112 if( rc ) return rc;
drh897a8202008-09-18 01:08:15 +00001113 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
drh3aac2dd2004-04-26 14:10:20 +00001114 return SQLITE_OK;
1115}
1116
1117/*
danielk197789d40042008-11-17 14:20:56 +00001118** Return the size of the database file in pages. If there is any kind of
1119** error, return ((unsigned int)-1).
danielk197767fd7a92008-09-10 17:53:35 +00001120*/
danielk197789d40042008-11-17 14:20:56 +00001121static Pgno pagerPagecount(BtShared *pBt){
1122 int nPage = -1;
danielk197767fd7a92008-09-10 17:53:35 +00001123 int rc;
danielk197789d40042008-11-17 14:20:56 +00001124 assert( pBt->pPage1 );
1125 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1126 assert( rc==SQLITE_OK || nPage==-1 );
1127 return (Pgno)nPage;
danielk197767fd7a92008-09-10 17:53:35 +00001128}
1129
1130/*
drhde647132004-05-07 17:57:49 +00001131** Get a page from the pager and initialize it. This routine
1132** is just a convenience wrapper around separate calls to
drh16a9b832007-05-05 18:39:25 +00001133** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
drhde647132004-05-07 17:57:49 +00001134*/
1135static int getAndInitPage(
danielk1977aef0bf62005-12-30 16:28:01 +00001136 BtShared *pBt, /* The database file */
drhde647132004-05-07 17:57:49 +00001137 Pgno pgno, /* Number of the page to get */
danielk197771d5d2c2008-09-29 11:49:47 +00001138 MemPage **ppPage /* Write the page pointer here */
drhde647132004-05-07 17:57:49 +00001139){
1140 int rc;
drh897a8202008-09-18 01:08:15 +00001141 DbPage *pDbPage;
1142 MemPage *pPage;
1143
drh1fee73e2007-08-29 04:00:57 +00001144 assert( sqlite3_mutex_held(pBt->mutex) );
drh897a8202008-09-18 01:08:15 +00001145 if( pgno==0 ){
drh49285702005-09-17 15:20:26 +00001146 return SQLITE_CORRUPT_BKPT;
drhee696e22004-08-30 16:52:17 +00001147 }
danielk19779f580ad2008-09-10 14:45:57 +00001148
drh897a8202008-09-18 01:08:15 +00001149 /* It is often the case that the page we want is already in cache.
1150 ** If so, get it directly. This saves us from having to call
1151 ** pagerPagecount() to make sure pgno is within limits, which results
1152 ** in a measureable performance improvements.
1153 */
1154 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1155 if( pDbPage ){
1156 /* Page is already in cache */
1157 *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1158 rc = SQLITE_OK;
1159 }else{
1160 /* Page not in cache. Acquire it. */
danielk197789d40042008-11-17 14:20:56 +00001161 if( pgno>pagerPagecount(pBt) ){
drh897a8202008-09-18 01:08:15 +00001162 return SQLITE_CORRUPT_BKPT;
1163 }
1164 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1165 if( rc ) return rc;
1166 pPage = *ppPage;
1167 }
danielk197771d5d2c2008-09-29 11:49:47 +00001168 if( !pPage->isInit ){
1169 rc = sqlite3BtreeInitPage(pPage);
drh897a8202008-09-18 01:08:15 +00001170 }
1171 if( rc!=SQLITE_OK ){
1172 releasePage(pPage);
1173 *ppPage = 0;
1174 }
drhde647132004-05-07 17:57:49 +00001175 return rc;
1176}
1177
1178/*
drh3aac2dd2004-04-26 14:10:20 +00001179** Release a MemPage. This should be called once for each prior
drh16a9b832007-05-05 18:39:25 +00001180** call to sqlite3BtreeGetPage.
drh3aac2dd2004-04-26 14:10:20 +00001181*/
drh4b70f112004-05-02 21:12:19 +00001182static void releasePage(MemPage *pPage){
drh3aac2dd2004-04-26 14:10:20 +00001183 if( pPage ){
1184 assert( pPage->aData );
1185 assert( pPage->pBt );
drhbf4bca52007-09-06 22:19:14 +00001186 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1187 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
drh1fee73e2007-08-29 04:00:57 +00001188 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk19773b8a05f2007-03-19 17:44:26 +00001189 sqlite3PagerUnref(pPage->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00001190 }
1191}
1192
1193/*
drha6abd042004-06-09 17:37:22 +00001194** During a rollback, when the pager reloads information into the cache
1195** so that the cache is restored to its original state at the start of
1196** the transaction, for each page restored this routine is called.
1197**
1198** This routine needs to reset the extra data section at the end of the
1199** page to agree with the restored data.
1200*/
danielk1977eaa06f62008-09-18 17:34:44 +00001201static void pageReinit(DbPage *pData){
drh07d183d2005-05-01 22:52:42 +00001202 MemPage *pPage;
danielk19773b8a05f2007-03-19 17:44:26 +00001203 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
danielk197771d5d2c2008-09-29 11:49:47 +00001204 if( pPage->isInit ){
drh1fee73e2007-08-29 04:00:57 +00001205 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drha6abd042004-06-09 17:37:22 +00001206 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00001207 if( sqlite3PagerPageRefcount(pData)>0 ){
1208 sqlite3BtreeInitPage(pPage);
1209 }
drha6abd042004-06-09 17:37:22 +00001210 }
1211}
1212
1213/*
drhe5fe6902007-12-07 18:55:28 +00001214** Invoke the busy handler for a btree.
1215*/
danielk19771ceedd32008-11-19 10:22:33 +00001216static int btreeInvokeBusyHandler(void *pArg){
drhe5fe6902007-12-07 18:55:28 +00001217 BtShared *pBt = (BtShared*)pArg;
1218 assert( pBt->db );
1219 assert( sqlite3_mutex_held(pBt->db->mutex) );
1220 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1221}
1222
1223/*
drhad3e0102004-09-03 23:32:18 +00001224** Open a database file.
1225**
drh382c0242001-10-06 16:33:02 +00001226** zFilename is the name of the database file. If zFilename is NULL
drh1bee3d72001-10-15 00:44:35 +00001227** a new database with a random name is created. This randomly named
drh23e11ca2004-05-04 17:27:28 +00001228** database file will be deleted when sqlite3BtreeClose() is called.
drhe53831d2007-08-17 01:14:38 +00001229** If zFilename is ":memory:" then an in-memory database is created
1230** that is automatically destroyed when it is closed.
drha059ad02001-04-17 20:09:11 +00001231*/
drh23e11ca2004-05-04 17:27:28 +00001232int sqlite3BtreeOpen(
drh3aac2dd2004-04-26 14:10:20 +00001233 const char *zFilename, /* Name of the file containing the BTree database */
drhe5fe6902007-12-07 18:55:28 +00001234 sqlite3 *db, /* Associated database handle */
drh3aac2dd2004-04-26 14:10:20 +00001235 Btree **ppBtree, /* Pointer to new Btree object written here */
drh33f4e022007-09-03 15:19:34 +00001236 int flags, /* Options */
1237 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
drh6019e162001-07-02 17:51:45 +00001238){
drhd677b3d2007-08-20 22:48:41 +00001239 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
drhe53831d2007-08-17 01:14:38 +00001240 BtShared *pBt = 0; /* Shared part of btree structure */
danielk1977aef0bf62005-12-30 16:28:01 +00001241 Btree *p; /* Handle to return */
danielk1977dddbcdc2007-04-26 14:42:34 +00001242 int rc = SQLITE_OK;
drhf49661a2008-12-10 16:45:50 +00001243 u8 nReserve;
drh90f5ecb2004-07-22 01:19:35 +00001244 unsigned char zDbHeader[100];
danielk1977aef0bf62005-12-30 16:28:01 +00001245
1246 /* Set the variable isMemdb to true for an in-memory database, or
1247 ** false for a file-based database. This symbol is only required if
1248 ** either of the shared-data or autovacuum features are compiled
1249 ** into the library.
1250 */
1251#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1252 #ifdef SQLITE_OMIT_MEMORYDB
drh980b1a72006-08-16 16:42:48 +00001253 const int isMemdb = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00001254 #else
drh980b1a72006-08-16 16:42:48 +00001255 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
danielk1977aef0bf62005-12-30 16:28:01 +00001256 #endif
1257#endif
1258
drhe5fe6902007-12-07 18:55:28 +00001259 assert( db!=0 );
1260 assert( sqlite3_mutex_held(db->mutex) );
drh153c62c2007-08-24 03:51:33 +00001261
drhe5fe6902007-12-07 18:55:28 +00001262 pVfs = db->pVfs;
drh17435752007-08-16 04:30:38 +00001263 p = sqlite3MallocZero(sizeof(Btree));
danielk1977aef0bf62005-12-30 16:28:01 +00001264 if( !p ){
1265 return SQLITE_NOMEM;
1266 }
1267 p->inTrans = TRANS_NONE;
drhe5fe6902007-12-07 18:55:28 +00001268 p->db = db;
danielk1977aef0bf62005-12-30 16:28:01 +00001269
drh198bf392006-01-06 21:52:49 +00001270#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001271 /*
1272 ** If this Btree is a candidate for shared cache, try to find an
1273 ** existing BtShared object that we can share with
1274 */
drh34004ce2008-07-11 16:15:17 +00001275 if( isMemdb==0
drhe5fe6902007-12-07 18:55:28 +00001276 && (db->flags & SQLITE_Vtab)==0
drhe53831d2007-08-17 01:14:38 +00001277 && zFilename && zFilename[0]
drhe53831d2007-08-17 01:14:38 +00001278 ){
danielk1977502b4e02008-09-02 14:07:24 +00001279 if( sqlite3GlobalConfig.sharedCacheEnabled ){
danielk1977adfb9b02007-09-17 07:02:56 +00001280 int nFullPathname = pVfs->mxPathname+1;
drhe5ae5732008-06-15 02:51:47 +00001281 char *zFullPathname = sqlite3Malloc(nFullPathname);
drhff0587c2007-08-29 17:43:19 +00001282 sqlite3_mutex *mutexShared;
1283 p->sharable = 1;
drh34004ce2008-07-11 16:15:17 +00001284 db->flags |= SQLITE_SharedCache;
drhff0587c2007-08-29 17:43:19 +00001285 if( !zFullPathname ){
1286 sqlite3_free(p);
1287 return SQLITE_NOMEM;
1288 }
danielk1977adfb9b02007-09-17 07:02:56 +00001289 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
danielk197759f8c082008-06-18 17:09:10 +00001290 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhff0587c2007-08-29 17:43:19 +00001291 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001292 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
drhff0587c2007-08-29 17:43:19 +00001293 assert( pBt->nRef>0 );
1294 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1295 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1296 p->pBt = pBt;
1297 pBt->nRef++;
1298 break;
1299 }
1300 }
1301 sqlite3_mutex_leave(mutexShared);
1302 sqlite3_free(zFullPathname);
danielk1977aef0bf62005-12-30 16:28:01 +00001303 }
drhff0587c2007-08-29 17:43:19 +00001304#ifdef SQLITE_DEBUG
1305 else{
1306 /* In debug mode, we mark all persistent databases as sharable
1307 ** even when they are not. This exercises the locking code and
1308 ** gives more opportunity for asserts(sqlite3_mutex_held())
1309 ** statements to find locking problems.
1310 */
1311 p->sharable = 1;
1312 }
1313#endif
danielk1977aef0bf62005-12-30 16:28:01 +00001314 }
1315#endif
drha059ad02001-04-17 20:09:11 +00001316 if( pBt==0 ){
drhe53831d2007-08-17 01:14:38 +00001317 /*
1318 ** The following asserts make sure that structures used by the btree are
1319 ** the right size. This is to guard against size changes that result
1320 ** when compiling on a different architecture.
danielk197703aded42004-11-22 05:26:27 +00001321 */
drhe53831d2007-08-17 01:14:38 +00001322 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1323 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1324 assert( sizeof(u32)==4 );
1325 assert( sizeof(u16)==2 );
1326 assert( sizeof(Pgno)==4 );
1327
1328 pBt = sqlite3MallocZero( sizeof(*pBt) );
1329 if( pBt==0 ){
1330 rc = SQLITE_NOMEM;
1331 goto btree_open_out;
1332 }
danielk197771d5d2c2008-09-29 11:49:47 +00001333 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
drh33f4e022007-09-03 15:19:34 +00001334 EXTRA_SIZE, flags, vfsFlags);
drhe53831d2007-08-17 01:14:38 +00001335 if( rc==SQLITE_OK ){
1336 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1337 }
1338 if( rc!=SQLITE_OK ){
1339 goto btree_open_out;
1340 }
danielk19771ceedd32008-11-19 10:22:33 +00001341 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
drhe53831d2007-08-17 01:14:38 +00001342 p->pBt = pBt;
1343
drhe53831d2007-08-17 01:14:38 +00001344 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1345 pBt->pCursor = 0;
1346 pBt->pPage1 = 0;
1347 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1348 pBt->pageSize = get2byte(&zDbHeader[16]);
1349 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1350 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
danielk1977a1644fd2007-08-29 12:31:25 +00001351 pBt->pageSize = 0;
1352 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001353#ifndef SQLITE_OMIT_AUTOVACUUM
1354 /* If the magic name ":memory:" will create an in-memory database, then
1355 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1356 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1357 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1358 ** regular file-name. In this case the auto-vacuum applies as per normal.
1359 */
1360 if( zFilename && !isMemdb ){
1361 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1362 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1363 }
1364#endif
1365 nReserve = 0;
1366 }else{
1367 nReserve = zDbHeader[20];
drhe53831d2007-08-17 01:14:38 +00001368 pBt->pageSizeFixed = 1;
1369#ifndef SQLITE_OMIT_AUTOVACUUM
1370 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1371 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1372#endif
1373 }
1374 pBt->usableSize = pBt->pageSize - nReserve;
1375 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
danielk1977a1644fd2007-08-29 12:31:25 +00001376 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drhe53831d2007-08-17 01:14:38 +00001377
1378#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1379 /* Add the new BtShared object to the linked list sharable BtShareds.
1380 */
1381 if( p->sharable ){
1382 sqlite3_mutex *mutexShared;
1383 pBt->nRef = 1;
danielk197759f8c082008-06-18 17:09:10 +00001384 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
danielk1977075c23a2008-09-01 18:34:20 +00001385 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
danielk197759f8c082008-06-18 17:09:10 +00001386 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
drh3285db22007-09-03 22:00:39 +00001387 if( pBt->mutex==0 ){
1388 rc = SQLITE_NOMEM;
drhe5fe6902007-12-07 18:55:28 +00001389 db->mallocFailed = 0;
drh3285db22007-09-03 22:00:39 +00001390 goto btree_open_out;
1391 }
drhff0587c2007-08-29 17:43:19 +00001392 }
drhe53831d2007-08-17 01:14:38 +00001393 sqlite3_mutex_enter(mutexShared);
drh78f82d12008-09-02 00:52:52 +00001394 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1395 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
drhe53831d2007-08-17 01:14:38 +00001396 sqlite3_mutex_leave(mutexShared);
danielk1977951af802004-11-05 15:45:09 +00001397 }
drheee46cf2004-11-06 00:02:48 +00001398#endif
drh90f5ecb2004-07-22 01:19:35 +00001399 }
danielk1977aef0bf62005-12-30 16:28:01 +00001400
drhcfed7bc2006-03-13 14:28:05 +00001401#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
drhe53831d2007-08-17 01:14:38 +00001402 /* If the new Btree uses a sharable pBtShared, then link the new
1403 ** Btree into the list of all sharable Btrees for the same connection.
drhabddb0c2007-08-20 13:14:28 +00001404 ** The list is kept in ascending order by pBt address.
danielk197754f01982006-01-18 15:25:17 +00001405 */
drhe53831d2007-08-17 01:14:38 +00001406 if( p->sharable ){
1407 int i;
1408 Btree *pSib;
drhe5fe6902007-12-07 18:55:28 +00001409 for(i=0; i<db->nDb; i++){
1410 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
drhe53831d2007-08-17 01:14:38 +00001411 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1412 if( p->pBt<pSib->pBt ){
1413 p->pNext = pSib;
1414 p->pPrev = 0;
1415 pSib->pPrev = p;
1416 }else{
drhabddb0c2007-08-20 13:14:28 +00001417 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
drhe53831d2007-08-17 01:14:38 +00001418 pSib = pSib->pNext;
1419 }
1420 p->pNext = pSib->pNext;
1421 p->pPrev = pSib;
1422 if( p->pNext ){
1423 p->pNext->pPrev = p;
1424 }
1425 pSib->pNext = p;
1426 }
1427 break;
1428 }
1429 }
danielk1977aef0bf62005-12-30 16:28:01 +00001430 }
danielk1977aef0bf62005-12-30 16:28:01 +00001431#endif
1432 *ppBtree = p;
danielk1977dddbcdc2007-04-26 14:42:34 +00001433
1434btree_open_out:
1435 if( rc!=SQLITE_OK ){
1436 if( pBt && pBt->pPager ){
1437 sqlite3PagerClose(pBt->pPager);
1438 }
drh17435752007-08-16 04:30:38 +00001439 sqlite3_free(pBt);
1440 sqlite3_free(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001441 *ppBtree = 0;
1442 }
1443 return rc;
drha059ad02001-04-17 20:09:11 +00001444}
1445
1446/*
drhe53831d2007-08-17 01:14:38 +00001447** Decrement the BtShared.nRef counter. When it reaches zero,
1448** remove the BtShared structure from the sharing list. Return
1449** true if the BtShared.nRef counter reaches zero and return
1450** false if it is still positive.
1451*/
1452static int removeFromSharingList(BtShared *pBt){
1453#ifndef SQLITE_OMIT_SHARED_CACHE
1454 sqlite3_mutex *pMaster;
1455 BtShared *pList;
1456 int removed = 0;
1457
drhd677b3d2007-08-20 22:48:41 +00001458 assert( sqlite3_mutex_notheld(pBt->mutex) );
danielk197759f8c082008-06-18 17:09:10 +00001459 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
drhe53831d2007-08-17 01:14:38 +00001460 sqlite3_mutex_enter(pMaster);
1461 pBt->nRef--;
1462 if( pBt->nRef<=0 ){
drh78f82d12008-09-02 00:52:52 +00001463 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1464 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
drhe53831d2007-08-17 01:14:38 +00001465 }else{
drh78f82d12008-09-02 00:52:52 +00001466 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
drh34004ce2008-07-11 16:15:17 +00001467 while( ALWAYS(pList) && pList->pNext!=pBt ){
drhe53831d2007-08-17 01:14:38 +00001468 pList=pList->pNext;
1469 }
drh34004ce2008-07-11 16:15:17 +00001470 if( ALWAYS(pList) ){
drhe53831d2007-08-17 01:14:38 +00001471 pList->pNext = pBt->pNext;
1472 }
1473 }
drh3285db22007-09-03 22:00:39 +00001474 if( SQLITE_THREADSAFE ){
1475 sqlite3_mutex_free(pBt->mutex);
1476 }
drhe53831d2007-08-17 01:14:38 +00001477 removed = 1;
1478 }
1479 sqlite3_mutex_leave(pMaster);
1480 return removed;
1481#else
1482 return 1;
1483#endif
1484}
1485
1486/*
drhf7141992008-06-19 00:16:08 +00001487** Make sure pBt->pTmpSpace points to an allocation of
1488** MX_CELL_SIZE(pBt) bytes.
1489*/
1490static void allocateTempSpace(BtShared *pBt){
1491 if( !pBt->pTmpSpace ){
1492 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1493 }
1494}
1495
1496/*
1497** Free the pBt->pTmpSpace allocation
1498*/
1499static void freeTempSpace(BtShared *pBt){
1500 sqlite3PageFree( pBt->pTmpSpace);
1501 pBt->pTmpSpace = 0;
1502}
1503
1504/*
drha059ad02001-04-17 20:09:11 +00001505** Close an open database and invalidate all cursors.
1506*/
danielk1977aef0bf62005-12-30 16:28:01 +00001507int sqlite3BtreeClose(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00001508 BtShared *pBt = p->pBt;
1509 BtCursor *pCur;
1510
danielk1977aef0bf62005-12-30 16:28:01 +00001511 /* Close all cursors opened via this handle. */
drhe5fe6902007-12-07 18:55:28 +00001512 assert( sqlite3_mutex_held(p->db->mutex) );
drhe53831d2007-08-17 01:14:38 +00001513 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001514 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001515 pCur = pBt->pCursor;
1516 while( pCur ){
1517 BtCursor *pTmp = pCur;
1518 pCur = pCur->pNext;
1519 if( pTmp->pBtree==p ){
1520 sqlite3BtreeCloseCursor(pTmp);
1521 }
drha059ad02001-04-17 20:09:11 +00001522 }
danielk1977aef0bf62005-12-30 16:28:01 +00001523
danielk19778d34dfd2006-01-24 16:37:57 +00001524 /* Rollback any active transaction and free the handle structure.
1525 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1526 ** this handle.
1527 */
danielk1977b597f742006-01-15 11:39:18 +00001528 sqlite3BtreeRollback(p);
drhe53831d2007-08-17 01:14:38 +00001529 sqlite3BtreeLeave(p);
danielk1977aef0bf62005-12-30 16:28:01 +00001530
danielk1977aef0bf62005-12-30 16:28:01 +00001531 /* If there are still other outstanding references to the shared-btree
1532 ** structure, return now. The remainder of this procedure cleans
1533 ** up the shared-btree.
1534 */
drhe53831d2007-08-17 01:14:38 +00001535 assert( p->wantToLock==0 && p->locked==0 );
1536 if( !p->sharable || removeFromSharingList(pBt) ){
1537 /* The pBt is no longer on the sharing list, so we can access
1538 ** it without having to hold the mutex.
1539 **
1540 ** Clean out and delete the BtShared object.
1541 */
1542 assert( !pBt->pCursor );
drhe53831d2007-08-17 01:14:38 +00001543 sqlite3PagerClose(pBt->pPager);
1544 if( pBt->xFreeSchema && pBt->pSchema ){
1545 pBt->xFreeSchema(pBt->pSchema);
1546 }
1547 sqlite3_free(pBt->pSchema);
drhf7141992008-06-19 00:16:08 +00001548 freeTempSpace(pBt);
drh65bbf292008-06-19 01:03:17 +00001549 sqlite3_free(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00001550 }
1551
drhe53831d2007-08-17 01:14:38 +00001552#ifndef SQLITE_OMIT_SHARED_CACHE
drhcab5ed72007-08-22 11:41:18 +00001553 assert( p->wantToLock==0 );
1554 assert( p->locked==0 );
1555 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1556 if( p->pNext ) p->pNext->pPrev = p->pPrev;
danielk1977aef0bf62005-12-30 16:28:01 +00001557#endif
1558
drhe53831d2007-08-17 01:14:38 +00001559 sqlite3_free(p);
drha059ad02001-04-17 20:09:11 +00001560 return SQLITE_OK;
1561}
1562
1563/*
drhda47d772002-12-02 04:25:19 +00001564** Change the limit on the number of pages allowed in the cache.
drhcd61c282002-03-06 22:01:34 +00001565**
1566** The maximum number of cache pages is set to the absolute
1567** value of mxPage. If mxPage is negative, the pager will
1568** operate asynchronously - it will not stop to do fsync()s
1569** to insure data is written to the disk surface before
1570** continuing. Transactions still work if synchronous is off,
1571** and the database cannot be corrupted if this program
1572** crashes. But if the operating system crashes or there is
1573** an abrupt power failure when synchronous is off, the database
1574** could be left in an inconsistent and unrecoverable state.
1575** Synchronous is on by default so database corruption is not
1576** normally a worry.
drhf57b14a2001-09-14 18:54:08 +00001577*/
danielk1977aef0bf62005-12-30 16:28:01 +00001578int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1579 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001580 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001581 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001582 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
drhd677b3d2007-08-20 22:48:41 +00001583 sqlite3BtreeLeave(p);
drhf57b14a2001-09-14 18:54:08 +00001584 return SQLITE_OK;
1585}
1586
1587/*
drh973b6e32003-02-12 14:09:42 +00001588** Change the way data is synced to disk in order to increase or decrease
1589** how well the database resists damage due to OS crashes and power
** failures.  Level 1 is the same as asynchronous (no syncs() occur and
** there is a high probability of damage).  Level 2 is the default.  There
1592** is a very low but non-zero probability of damage. Level 3 reduces the
1593** probability of damage to near zero but with a write performance reduction.
1594*/
danielk197793758c82005-01-21 08:13:14 +00001595#ifndef SQLITE_OMIT_PAGER_PRAGMAS
drhac530b12006-02-11 01:25:50 +00001596int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
danielk1977aef0bf62005-12-30 16:28:01 +00001597 BtShared *pBt = p->pBt;
drhe5fe6902007-12-07 18:55:28 +00001598 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001599 sqlite3BtreeEnter(p);
danielk19773b8a05f2007-03-19 17:44:26 +00001600 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
drhd677b3d2007-08-20 22:48:41 +00001601 sqlite3BtreeLeave(p);
drh973b6e32003-02-12 14:09:42 +00001602 return SQLITE_OK;
1603}
danielk197793758c82005-01-21 08:13:14 +00001604#endif
drh973b6e32003-02-12 14:09:42 +00001605
drh2c8997b2005-08-27 16:36:48 +00001606/*
1607** Return TRUE if the given btree is set to safety level 1. In other
1608** words, return TRUE if no sync() occurs on the disk files.
1609*/
danielk1977aef0bf62005-12-30 16:28:01 +00001610int sqlite3BtreeSyncDisabled(Btree *p){
1611 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001612 int rc;
drhe5fe6902007-12-07 18:55:28 +00001613 assert( sqlite3_mutex_held(p->db->mutex) );
drhd677b3d2007-08-20 22:48:41 +00001614 sqlite3BtreeEnter(p);
drhd0679ed2007-08-28 22:24:34 +00001615 assert( pBt && pBt->pPager );
drhd677b3d2007-08-20 22:48:41 +00001616 rc = sqlite3PagerNosync(pBt->pPager);
1617 sqlite3BtreeLeave(p);
1618 return rc;
drh2c8997b2005-08-27 16:36:48 +00001619}
1620
danielk1977576ec6b2005-01-21 11:55:25 +00001621#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
drh973b6e32003-02-12 14:09:42 +00001622/*
drh90f5ecb2004-07-22 01:19:35 +00001623** Change the default pages size and the number of reserved bytes per page.
drh06f50212004-11-02 14:24:33 +00001624**
1625** The page size must be a power of 2 between 512 and 65536. If the page
1626** size supplied does not meet this constraint then the page size is not
1627** changed.
1628**
1629** Page sizes are constrained to be a power of two so that the region
1630** of the database file used for locking (beginning at PENDING_BYTE,
1631** the first byte past the 1GB boundary, 0x40000000) needs to occur
1632** at the beginning of a page.
danielk197728129562005-01-11 10:25:06 +00001633**
1634** If parameter nReserve is less than zero, then the number of reserved
1635** bytes per page is left unchanged.
drh90f5ecb2004-07-22 01:19:35 +00001636*/
danielk1977aef0bf62005-12-30 16:28:01 +00001637int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
danielk1977a1644fd2007-08-29 12:31:25 +00001638 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00001639 BtShared *pBt = p->pBt;
drhf49661a2008-12-10 16:45:50 +00001640 assert( nReserve>=-1 && nReserve<=255 );
drhd677b3d2007-08-20 22:48:41 +00001641 sqlite3BtreeEnter(p);
drh90f5ecb2004-07-22 01:19:35 +00001642 if( pBt->pageSizeFixed ){
drhd677b3d2007-08-20 22:48:41 +00001643 sqlite3BtreeLeave(p);
drh90f5ecb2004-07-22 01:19:35 +00001644 return SQLITE_READONLY;
1645 }
1646 if( nReserve<0 ){
1647 nReserve = pBt->pageSize - pBt->usableSize;
1648 }
drhf49661a2008-12-10 16:45:50 +00001649 assert( nReserve>=0 && nReserve<=255 );
drh06f50212004-11-02 14:24:33 +00001650 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1651 ((pageSize-1)&pageSize)==0 ){
drh07d183d2005-05-01 22:52:42 +00001652 assert( (pageSize & 7)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00001653 assert( !pBt->pPage1 && !pBt->pCursor );
drh1bd10f82008-12-10 21:19:56 +00001654 pBt->pageSize = (u16)pageSize;
drhf7141992008-06-19 00:16:08 +00001655 freeTempSpace(pBt);
danielk1977a1644fd2007-08-29 12:31:25 +00001656 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
drh90f5ecb2004-07-22 01:19:35 +00001657 }
drhf49661a2008-12-10 16:45:50 +00001658 pBt->usableSize = pBt->pageSize - (u16)nReserve;
drhd677b3d2007-08-20 22:48:41 +00001659 sqlite3BtreeLeave(p);
danielk1977a1644fd2007-08-29 12:31:25 +00001660 return rc;
drh90f5ecb2004-07-22 01:19:35 +00001661}
1662
1663/*
1664** Return the currently defined page size
1665*/
danielk1977aef0bf62005-12-30 16:28:01 +00001666int sqlite3BtreeGetPageSize(Btree *p){
1667 return p->pBt->pageSize;
drh90f5ecb2004-07-22 01:19:35 +00001668}
danielk1977aef0bf62005-12-30 16:28:01 +00001669int sqlite3BtreeGetReserve(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00001670 int n;
1671 sqlite3BtreeEnter(p);
1672 n = p->pBt->pageSize - p->pBt->usableSize;
1673 sqlite3BtreeLeave(p);
1674 return n;
drh2011d5f2004-07-22 02:40:37 +00001675}
drhf8e632b2007-05-08 14:51:36 +00001676
1677/*
1678** Set the maximum page count for a database if mxPage is positive.
1679** No changes are made if mxPage is 0 or negative.
1680** Regardless of the value of mxPage, return the maximum page count.
1681*/
1682int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
drhd677b3d2007-08-20 22:48:41 +00001683 int n;
1684 sqlite3BtreeEnter(p);
1685 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1686 sqlite3BtreeLeave(p);
1687 return n;
drhf8e632b2007-05-08 14:51:36 +00001688}
danielk1977576ec6b2005-01-21 11:55:25 +00001689#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
drh90f5ecb2004-07-22 01:19:35 +00001690
1691/*
danielk1977951af802004-11-05 15:45:09 +00001692** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1693** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1694** is disabled. The default value for the auto-vacuum property is
1695** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1696*/
danielk1977aef0bf62005-12-30 16:28:01 +00001697int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
danielk1977951af802004-11-05 15:45:09 +00001698#ifdef SQLITE_OMIT_AUTOVACUUM
drheee46cf2004-11-06 00:02:48 +00001699 return SQLITE_READONLY;
danielk1977951af802004-11-05 15:45:09 +00001700#else
danielk1977dddbcdc2007-04-26 14:42:34 +00001701 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00001702 int rc = SQLITE_OK;
drhf49661a2008-12-10 16:45:50 +00001703 u8 av = autoVacuum ?1:0;
drhd677b3d2007-08-20 22:48:41 +00001704
1705 sqlite3BtreeEnter(p);
danielk1977dddbcdc2007-04-26 14:42:34 +00001706 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00001707 rc = SQLITE_READONLY;
1708 }else{
1709 pBt->autoVacuum = av;
danielk1977951af802004-11-05 15:45:09 +00001710 }
drhd677b3d2007-08-20 22:48:41 +00001711 sqlite3BtreeLeave(p);
1712 return rc;
danielk1977951af802004-11-05 15:45:09 +00001713#endif
1714}
1715
1716/*
1717** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1718** enabled 1 is returned. Otherwise 0.
1719*/
danielk1977aef0bf62005-12-30 16:28:01 +00001720int sqlite3BtreeGetAutoVacuum(Btree *p){
danielk1977951af802004-11-05 15:45:09 +00001721#ifdef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001722 return BTREE_AUTOVACUUM_NONE;
danielk1977951af802004-11-05 15:45:09 +00001723#else
drhd677b3d2007-08-20 22:48:41 +00001724 int rc;
1725 sqlite3BtreeEnter(p);
1726 rc = (
danielk1977dddbcdc2007-04-26 14:42:34 +00001727 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1728 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1729 BTREE_AUTOVACUUM_INCR
1730 );
drhd677b3d2007-08-20 22:48:41 +00001731 sqlite3BtreeLeave(p);
1732 return rc;
danielk1977951af802004-11-05 15:45:09 +00001733#endif
1734}
1735
1736
1737/*
drha34b6762004-05-07 13:30:42 +00001738** Get a reference to pPage1 of the database file. This will
drh306dc212001-05-21 13:45:10 +00001739** also acquire a readlock on that file.
1740**
1741** SQLITE_OK is returned on success. If the file is not a
1742** well-formed database file, then SQLITE_CORRUPT is returned.
1743** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
drh4f0ee682007-03-30 20:43:40 +00001744** is returned if we run out of memory.
drh306dc212001-05-21 13:45:10 +00001745*/
danielk1977aef0bf62005-12-30 16:28:01 +00001746static int lockBtree(BtShared *pBt){
danielk1977f653d782008-03-20 11:04:21 +00001747 int rc;
drh3aac2dd2004-04-26 14:10:20 +00001748 MemPage *pPage1;
danielk197793f7af92008-05-09 16:57:50 +00001749 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001750
drh1fee73e2007-08-29 04:00:57 +00001751 assert( sqlite3_mutex_held(pBt->mutex) );
drha34b6762004-05-07 13:30:42 +00001752 if( pBt->pPage1 ) return SQLITE_OK;
drh16a9b832007-05-05 18:39:25 +00001753 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
drh306dc212001-05-21 13:45:10 +00001754 if( rc!=SQLITE_OK ) return rc;
drh306dc212001-05-21 13:45:10 +00001755
1756 /* Do some checking to help insure the file we opened really is
1757 ** a valid database file.
1758 */
danielk1977ad0132d2008-06-07 08:58:22 +00001759 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1760 if( rc!=SQLITE_OK ){
danielk197793f7af92008-05-09 16:57:50 +00001761 goto page1_init_failed;
1762 }else if( nPage>0 ){
danielk1977f653d782008-03-20 11:04:21 +00001763 int pageSize;
1764 int usableSize;
drhb6f41482004-05-14 01:58:11 +00001765 u8 *page1 = pPage1->aData;
danielk1977ad0132d2008-06-07 08:58:22 +00001766 rc = SQLITE_NOTADB;
drhb6f41482004-05-14 01:58:11 +00001767 if( memcmp(page1, zMagicHeader, 16)!=0 ){
drh72f82862001-05-24 21:06:34 +00001768 goto page1_init_failed;
drh306dc212001-05-21 13:45:10 +00001769 }
drh309169a2007-04-24 17:27:51 +00001770 if( page1[18]>1 ){
1771 pBt->readOnly = 1;
1772 }
1773 if( page1[19]>1 ){
drhb6f41482004-05-14 01:58:11 +00001774 goto page1_init_failed;
1775 }
drhe5ae5732008-06-15 02:51:47 +00001776
1777 /* The maximum embedded fraction must be exactly 25%. And the minimum
1778 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1779 ** The original design allowed these amounts to vary, but as of
1780 ** version 3.6.0, we require them to be fixed.
1781 */
1782 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1783 goto page1_init_failed;
1784 }
drh07d183d2005-05-01 22:52:42 +00001785 pageSize = get2byte(&page1[16]);
drh7dc385e2007-09-06 23:39:36 +00001786 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1787 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1788 ){
drh07d183d2005-05-01 22:52:42 +00001789 goto page1_init_failed;
1790 }
1791 assert( (pageSize & 7)==0 );
danielk1977f653d782008-03-20 11:04:21 +00001792 usableSize = pageSize - page1[20];
1793 if( pageSize!=pBt->pageSize ){
1794 /* After reading the first page of the database assuming a page size
1795 ** of BtShared.pageSize, we have discovered that the page-size is
1796 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1797 ** zero and return SQLITE_OK. The caller will call this function
1798 ** again with the correct page-size.
1799 */
1800 releasePage(pPage1);
drhf49661a2008-12-10 16:45:50 +00001801 pBt->usableSize = (u16)usableSize;
1802 pBt->pageSize = (u16)pageSize;
drhf7141992008-06-19 00:16:08 +00001803 freeTempSpace(pBt);
danielk1977f653d782008-03-20 11:04:21 +00001804 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1805 return SQLITE_OK;
1806 }
1807 if( usableSize<500 ){
drhb6f41482004-05-14 01:58:11 +00001808 goto page1_init_failed;
1809 }
drh1bd10f82008-12-10 21:19:56 +00001810 pBt->pageSize = (u16)pageSize;
1811 pBt->usableSize = (u16)usableSize;
drh057cd3a2005-02-15 16:23:02 +00001812#ifndef SQLITE_OMIT_AUTOVACUUM
1813 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
danielk197727b1f952007-06-25 08:16:58 +00001814 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
drh057cd3a2005-02-15 16:23:02 +00001815#endif
drh306dc212001-05-21 13:45:10 +00001816 }
drhb6f41482004-05-14 01:58:11 +00001817
1818 /* maxLocal is the maximum amount of payload to store locally for
1819 ** a cell. Make sure it is small enough so that at least minFanout
1820 ** cells can will fit on one page. We assume a 10-byte page header.
1821 ** Besides the payload, the cell must store:
drh43605152004-05-29 21:46:49 +00001822 ** 2-byte pointer to the cell
drhb6f41482004-05-14 01:58:11 +00001823 ** 4-byte child pointer
1824 ** 9-byte nKey value
1825 ** 4-byte nData value
1826 ** 4-byte overflow page pointer
drh43605152004-05-29 21:46:49 +00001827 ** So a cell consists of a 2-byte poiner, a header which is as much as
1828 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1829 ** page pointer.
drhb6f41482004-05-14 01:58:11 +00001830 */
drhe5ae5732008-06-15 02:51:47 +00001831 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1832 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
drh43605152004-05-29 21:46:49 +00001833 pBt->maxLeaf = pBt->usableSize - 35;
drhe5ae5732008-06-15 02:51:47 +00001834 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
drh2e38c322004-09-03 18:38:44 +00001835 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
drh3aac2dd2004-04-26 14:10:20 +00001836 pBt->pPage1 = pPage1;
drhb6f41482004-05-14 01:58:11 +00001837 return SQLITE_OK;
drh306dc212001-05-21 13:45:10 +00001838
drh72f82862001-05-24 21:06:34 +00001839page1_init_failed:
drh3aac2dd2004-04-26 14:10:20 +00001840 releasePage(pPage1);
1841 pBt->pPage1 = 0;
drh72f82862001-05-24 21:06:34 +00001842 return rc;
drh306dc212001-05-21 13:45:10 +00001843}
1844
1845/*
drhb8ef32c2005-03-14 02:01:49 +00001846** This routine works like lockBtree() except that it also invokes the
1847** busy callback if there is lock contention.
1848*/
danielk1977aef0bf62005-12-30 16:28:01 +00001849static int lockBtreeWithRetry(Btree *pRef){
drhb8ef32c2005-03-14 02:01:49 +00001850 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00001851
drh1fee73e2007-08-29 04:00:57 +00001852 assert( sqlite3BtreeHoldsMutex(pRef) );
danielk1977aef0bf62005-12-30 16:28:01 +00001853 if( pRef->inTrans==TRANS_NONE ){
1854 u8 inTransaction = pRef->pBt->inTransaction;
1855 btreeIntegrity(pRef);
1856 rc = sqlite3BtreeBeginTrans(pRef, 0);
1857 pRef->pBt->inTransaction = inTransaction;
1858 pRef->inTrans = TRANS_NONE;
1859 if( rc==SQLITE_OK ){
1860 pRef->pBt->nTransaction--;
1861 }
1862 btreeIntegrity(pRef);
drhb8ef32c2005-03-14 02:01:49 +00001863 }
1864 return rc;
1865}
1866
1867
1868/*
drhb8ca3072001-12-05 00:21:20 +00001869** If there are no outstanding cursors and we are not in the middle
1870** of a transaction but there is a read lock on the database, then
1871** this routine unrefs the first page of the database file which
1872** has the effect of releasing the read lock.
1873**
1874** If there are any outstanding cursors, this routine is a no-op.
1875**
1876** If there is a transaction in progress, this routine is a no-op.
1877*/
danielk1977aef0bf62005-12-30 16:28:01 +00001878static void unlockBtreeIfUnused(BtShared *pBt){
drh1fee73e2007-08-29 04:00:57 +00001879 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00001880 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00001881 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
drhde4fcfd2008-01-19 23:50:26 +00001882 assert( pBt->pPage1->aData );
1883#if 0
drh24c9a2e2007-01-05 02:00:47 +00001884 if( pBt->pPage1->aData==0 ){
1885 MemPage *pPage = pBt->pPage1;
drhbf4bca52007-09-06 22:19:14 +00001886 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
drh24c9a2e2007-01-05 02:00:47 +00001887 pPage->pBt = pBt;
1888 pPage->pgno = 1;
1889 }
drhde4fcfd2008-01-19 23:50:26 +00001890#endif
drh24c9a2e2007-01-05 02:00:47 +00001891 releasePage(pBt->pPage1);
drh51c6d962004-06-06 00:42:25 +00001892 }
drh3aac2dd2004-04-26 14:10:20 +00001893 pBt->pPage1 = 0;
drh3aac2dd2004-04-26 14:10:20 +00001894 pBt->inStmt = 0;
drhb8ca3072001-12-05 00:21:20 +00001895 }
1896}
1897
1898/*
drh9e572e62004-04-23 23:43:10 +00001899** Create a new database by initializing the first page of the
drh8c42ca92001-06-22 19:15:00 +00001900** file.
drh8b2f49b2001-06-08 00:21:52 +00001901*/
danielk1977aef0bf62005-12-30 16:28:01 +00001902static int newDatabase(BtShared *pBt){
drh9e572e62004-04-23 23:43:10 +00001903 MemPage *pP1;
1904 unsigned char *data;
drh8c42ca92001-06-22 19:15:00 +00001905 int rc;
danielk1977ad0132d2008-06-07 08:58:22 +00001906 int nPage;
drhd677b3d2007-08-20 22:48:41 +00001907
drh1fee73e2007-08-29 04:00:57 +00001908 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977ad0132d2008-06-07 08:58:22 +00001909 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1910 if( rc!=SQLITE_OK || nPage>0 ){
1911 return rc;
1912 }
drh3aac2dd2004-04-26 14:10:20 +00001913 pP1 = pBt->pPage1;
drh9e572e62004-04-23 23:43:10 +00001914 assert( pP1!=0 );
1915 data = pP1->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00001916 rc = sqlite3PagerWrite(pP1->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00001917 if( rc ) return rc;
drh9e572e62004-04-23 23:43:10 +00001918 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1919 assert( sizeof(zMagicHeader)==16 );
drhb6f41482004-05-14 01:58:11 +00001920 put2byte(&data[16], pBt->pageSize);
drh9e572e62004-04-23 23:43:10 +00001921 data[18] = 1;
1922 data[19] = 1;
drhf49661a2008-12-10 16:45:50 +00001923 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
1924 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
drhe5ae5732008-06-15 02:51:47 +00001925 data[21] = 64;
1926 data[22] = 32;
1927 data[23] = 32;
drhb6f41482004-05-14 01:58:11 +00001928 memset(&data[24], 0, 100-24);
drhe6c43812004-05-14 12:17:46 +00001929 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
drhf2a611c2004-09-05 00:33:43 +00001930 pBt->pageSizeFixed = 1;
danielk1977003ba062004-11-04 02:57:33 +00001931#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00001932 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
danielk1977418899a2007-06-24 10:14:00 +00001933 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
danielk1977dddbcdc2007-04-26 14:42:34 +00001934 put4byte(&data[36 + 4*4], pBt->autoVacuum);
danielk1977418899a2007-06-24 10:14:00 +00001935 put4byte(&data[36 + 7*4], pBt->incrVacuum);
danielk1977003ba062004-11-04 02:57:33 +00001936#endif
drh8b2f49b2001-06-08 00:21:52 +00001937 return SQLITE_OK;
1938}
1939
1940/*
danielk1977ee5741e2004-05-31 10:01:34 +00001941** Attempt to start a new transaction. A write-transaction
drh684917c2004-10-05 02:41:42 +00001942** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more an exclusive
1944** transaction is started, meaning that no other process is allowed
1945** to access the database. A preexisting transaction may not be
drhb8ef32c2005-03-14 02:01:49 +00001946** upgraded to exclusive by calling this routine a second time - the
drh684917c2004-10-05 02:41:42 +00001947** exclusivity flag only works for a new transaction.
drh8b2f49b2001-06-08 00:21:52 +00001948**
danielk1977ee5741e2004-05-31 10:01:34 +00001949** A write-transaction must be started before attempting any
1950** changes to the database. None of the following routines
1951** will work unless a transaction is started first:
drh8b2f49b2001-06-08 00:21:52 +00001952**
drh23e11ca2004-05-04 17:27:28 +00001953** sqlite3BtreeCreateTable()
1954** sqlite3BtreeCreateIndex()
1955** sqlite3BtreeClearTable()
1956** sqlite3BtreeDropTable()
1957** sqlite3BtreeInsert()
1958** sqlite3BtreeDelete()
1959** sqlite3BtreeUpdateMeta()
danielk197713adf8a2004-06-03 16:08:41 +00001960**
drhb8ef32c2005-03-14 02:01:49 +00001961** If an initial attempt to acquire the lock fails because of lock contention
1962** and the database was previously unlocked, then invoke the busy handler
1963** if there is one. But if there was previously a read-lock, do not
1964** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1965** returned when there is already a read-lock in order to avoid a deadlock.
1966**
1967** Suppose there are two processes A and B. A has a read lock and B has
1968** a reserved lock. B tries to promote to exclusive but is blocked because
1969** of A's read lock. A tries to promote to reserved but is blocked by B.
1970** One or the other of the two processes must give way or there can be
1971** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1972** when A already has a read lock, we encourage A to give up and let B
1973** proceed.
drha059ad02001-04-17 20:09:11 +00001974*/
danielk1977aef0bf62005-12-30 16:28:01 +00001975int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1976 BtShared *pBt = p->pBt;
danielk1977ee5741e2004-05-31 10:01:34 +00001977 int rc = SQLITE_OK;
1978
drhd677b3d2007-08-20 22:48:41 +00001979 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00001980 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00001981 btreeIntegrity(p);
1982
danielk1977ee5741e2004-05-31 10:01:34 +00001983 /* If the btree is already in a write-transaction, or it
1984 ** is already in a read-transaction and a read-transaction
1985 ** is requested, this is a no-op.
1986 */
danielk1977aef0bf62005-12-30 16:28:01 +00001987 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
drhd677b3d2007-08-20 22:48:41 +00001988 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001989 }
drhb8ef32c2005-03-14 02:01:49 +00001990
1991 /* Write transactions are not possible on a read-only database */
danielk1977ee5741e2004-05-31 10:01:34 +00001992 if( pBt->readOnly && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00001993 rc = SQLITE_READONLY;
1994 goto trans_begun;
danielk1977ee5741e2004-05-31 10:01:34 +00001995 }
1996
danielk1977aef0bf62005-12-30 16:28:01 +00001997 /* If another database handle has already opened a write transaction
1998 ** on this shared-btree structure and a second write transaction is
1999 ** requested, return SQLITE_BUSY.
2000 */
2001 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
drhd677b3d2007-08-20 22:48:41 +00002002 rc = SQLITE_BUSY;
2003 goto trans_begun;
danielk1977aef0bf62005-12-30 16:28:01 +00002004 }
2005
danielk1977641b0f42007-12-21 04:47:25 +00002006#ifndef SQLITE_OMIT_SHARED_CACHE
2007 if( wrflag>1 ){
2008 BtLock *pIter;
2009 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2010 if( pIter->pBtree!=p ){
2011 rc = SQLITE_BUSY;
2012 goto trans_begun;
2013 }
2014 }
2015 }
2016#endif
2017
drhb8ef32c2005-03-14 02:01:49 +00002018 do {
drh8a9c17f2008-05-02 14:23:54 +00002019 if( pBt->pPage1==0 ){
2020 do{
2021 rc = lockBtree(pBt);
2022 }while( pBt->pPage1==0 && rc==SQLITE_OK );
drh8c42ca92001-06-22 19:15:00 +00002023 }
drh309169a2007-04-24 17:27:51 +00002024
drhb8ef32c2005-03-14 02:01:49 +00002025 if( rc==SQLITE_OK && wrflag ){
drh309169a2007-04-24 17:27:51 +00002026 if( pBt->readOnly ){
2027 rc = SQLITE_READONLY;
2028 }else{
2029 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2030 if( rc==SQLITE_OK ){
2031 rc = newDatabase(pBt);
2032 }
drhb8ef32c2005-03-14 02:01:49 +00002033 }
2034 }
2035
2036 if( rc==SQLITE_OK ){
drhb8ef32c2005-03-14 02:01:49 +00002037 if( wrflag ) pBt->inStmt = 0;
2038 }else{
2039 unlockBtreeIfUnused(pBt);
2040 }
danielk1977aef0bf62005-12-30 16:28:01 +00002041 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
danielk19771ceedd32008-11-19 10:22:33 +00002042 btreeInvokeBusyHandler(pBt) );
danielk1977aef0bf62005-12-30 16:28:01 +00002043
2044 if( rc==SQLITE_OK ){
2045 if( p->inTrans==TRANS_NONE ){
2046 pBt->nTransaction++;
2047 }
2048 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2049 if( p->inTrans>pBt->inTransaction ){
2050 pBt->inTransaction = p->inTrans;
2051 }
danielk1977641b0f42007-12-21 04:47:25 +00002052#ifndef SQLITE_OMIT_SHARED_CACHE
2053 if( wrflag>1 ){
2054 assert( !pBt->pExclusive );
2055 pBt->pExclusive = p;
2056 }
2057#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002058 }
2059
drhd677b3d2007-08-20 22:48:41 +00002060
2061trans_begun:
danielk1977fd7f0452008-12-17 17:30:26 +00002062 if( rc==SQLITE_OK && wrflag ){
2063 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2064 }
danielk1977aef0bf62005-12-30 16:28:01 +00002065 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002066 sqlite3BtreeLeave(p);
drhb8ca3072001-12-05 00:21:20 +00002067 return rc;
drha059ad02001-04-17 20:09:11 +00002068}
2069
danielk1977687566d2004-11-02 12:56:41 +00002070#ifndef SQLITE_OMIT_AUTOVACUUM
2071
2072/*
2073** Set the pointer-map entries for all children of page pPage. Also, if
2074** pPage contains cells that point to overflow pages, set the pointer
2075** map entries for the overflow pages as well.
2076*/
2077static int setChildPtrmaps(MemPage *pPage){
2078 int i; /* Counter variable */
2079 int nCell; /* Number of cells in page pPage */
danielk19772df71c72007-05-24 07:22:42 +00002080 int rc; /* Return code */
danielk1977aef0bf62005-12-30 16:28:01 +00002081 BtShared *pBt = pPage->pBt;
drhf49661a2008-12-10 16:45:50 +00002082 u8 isInitOrig = pPage->isInit;
danielk1977687566d2004-11-02 12:56:41 +00002083 Pgno pgno = pPage->pgno;
2084
drh1fee73e2007-08-29 04:00:57 +00002085 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00002086 rc = sqlite3BtreeInitPage(pPage);
danielk19772df71c72007-05-24 07:22:42 +00002087 if( rc!=SQLITE_OK ){
2088 goto set_child_ptrmaps_out;
2089 }
danielk1977687566d2004-11-02 12:56:41 +00002090 nCell = pPage->nCell;
2091
2092 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002093 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002094
danielk197726836652005-01-17 01:33:13 +00002095 rc = ptrmapPutOvflPtr(pPage, pCell);
2096 if( rc!=SQLITE_OK ){
2097 goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002098 }
danielk197726836652005-01-17 01:33:13 +00002099
danielk1977687566d2004-11-02 12:56:41 +00002100 if( !pPage->leaf ){
2101 Pgno childPgno = get4byte(pCell);
2102 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
danielk197700a696d2008-09-29 16:41:31 +00002103 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
danielk1977687566d2004-11-02 12:56:41 +00002104 }
2105 }
2106
2107 if( !pPage->leaf ){
2108 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2109 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2110 }
2111
2112set_child_ptrmaps_out:
2113 pPage->isInit = isInitOrig;
2114 return rc;
2115}
2116
2117/*
2118** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2119** page, is a pointer to page iFrom. Modify this pointer so that it points to
2120** iTo. Parameter eType describes the type of pointer to be modified, as
2121** follows:
2122**
2123** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2124** page of pPage.
2125**
2126** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2127** page pointed to by one of the cells on pPage.
2128**
2129** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2130** overflow page in the list.
2131*/
danielk1977fdb7cdb2005-01-17 02:12:18 +00002132static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
drh1fee73e2007-08-29 04:00:57 +00002133 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhc5053fb2008-11-27 02:22:10 +00002134 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk1977687566d2004-11-02 12:56:41 +00002135 if( eType==PTRMAP_OVERFLOW2 ){
danielk1977f78fc082004-11-02 14:40:32 +00002136 /* The pointer is always the first 4 bytes of the page in this case. */
danielk1977fdb7cdb2005-01-17 02:12:18 +00002137 if( get4byte(pPage->aData)!=iFrom ){
drh49285702005-09-17 15:20:26 +00002138 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002139 }
danielk1977f78fc082004-11-02 14:40:32 +00002140 put4byte(pPage->aData, iTo);
danielk1977687566d2004-11-02 12:56:41 +00002141 }else{
drhf49661a2008-12-10 16:45:50 +00002142 u8 isInitOrig = pPage->isInit;
danielk1977687566d2004-11-02 12:56:41 +00002143 int i;
2144 int nCell;
2145
danielk197771d5d2c2008-09-29 11:49:47 +00002146 sqlite3BtreeInitPage(pPage);
danielk1977687566d2004-11-02 12:56:41 +00002147 nCell = pPage->nCell;
2148
danielk1977687566d2004-11-02 12:56:41 +00002149 for(i=0; i<nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00002150 u8 *pCell = findCell(pPage, i);
danielk1977687566d2004-11-02 12:56:41 +00002151 if( eType==PTRMAP_OVERFLOW1 ){
2152 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00002153 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
danielk1977687566d2004-11-02 12:56:41 +00002154 if( info.iOverflow ){
2155 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2156 put4byte(&pCell[info.iOverflow], iTo);
2157 break;
2158 }
2159 }
2160 }else{
2161 if( get4byte(pCell)==iFrom ){
2162 put4byte(pCell, iTo);
2163 break;
2164 }
2165 }
2166 }
2167
2168 if( i==nCell ){
danielk1977fdb7cdb2005-01-17 02:12:18 +00002169 if( eType!=PTRMAP_BTREE ||
2170 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
drh49285702005-09-17 15:20:26 +00002171 return SQLITE_CORRUPT_BKPT;
danielk1977fdb7cdb2005-01-17 02:12:18 +00002172 }
danielk1977687566d2004-11-02 12:56:41 +00002173 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2174 }
2175
2176 pPage->isInit = isInitOrig;
2177 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002178 return SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002179}
2180
danielk1977003ba062004-11-04 02:57:33 +00002181
danielk19777701e812005-01-10 12:59:51 +00002182/*
2183** Move the open database page pDbPage to location iFreePage in the
2184** database. The pDbPage reference remains valid.
2185*/
danielk1977003ba062004-11-04 02:57:33 +00002186static int relocatePage(
danielk1977aef0bf62005-12-30 16:28:01 +00002187 BtShared *pBt, /* Btree */
danielk19777701e812005-01-10 12:59:51 +00002188 MemPage *pDbPage, /* Open page to move */
2189 u8 eType, /* Pointer map 'type' entry for pDbPage */
2190 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
danielk19774c999992008-07-16 18:17:55 +00002191 Pgno iFreePage, /* The location to move pDbPage to */
2192 int isCommit
danielk1977003ba062004-11-04 02:57:33 +00002193){
2194 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2195 Pgno iDbPage = pDbPage->pgno;
2196 Pager *pPager = pBt->pPager;
2197 int rc;
2198
danielk1977a0bf2652004-11-04 14:30:04 +00002199 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2200 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
drh1fee73e2007-08-29 04:00:57 +00002201 assert( sqlite3_mutex_held(pBt->mutex) );
drhd0679ed2007-08-28 22:24:34 +00002202 assert( pDbPage->pBt==pBt );
danielk1977003ba062004-11-04 02:57:33 +00002203
drh85b623f2007-12-13 21:54:09 +00002204 /* Move page iDbPage from its current location to page number iFreePage */
danielk1977003ba062004-11-04 02:57:33 +00002205 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2206 iDbPage, iFreePage, iPtrPage, eType));
danielk19774c999992008-07-16 18:17:55 +00002207 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
danielk1977003ba062004-11-04 02:57:33 +00002208 if( rc!=SQLITE_OK ){
2209 return rc;
2210 }
2211 pDbPage->pgno = iFreePage;
2212
2213 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2214 ** that point to overflow pages. The pointer map entries for all these
2215 ** pages need to be changed.
2216 **
2217 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2218 ** pointer to a subsequent overflow page. If this is the case, then
2219 ** the pointer map needs to be updated for the subsequent overflow page.
2220 */
danielk1977a0bf2652004-11-04 14:30:04 +00002221 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00002222 rc = setChildPtrmaps(pDbPage);
2223 if( rc!=SQLITE_OK ){
2224 return rc;
2225 }
2226 }else{
2227 Pgno nextOvfl = get4byte(pDbPage->aData);
2228 if( nextOvfl!=0 ){
danielk1977003ba062004-11-04 02:57:33 +00002229 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2230 if( rc!=SQLITE_OK ){
2231 return rc;
2232 }
2233 }
2234 }
2235
2236 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2237 ** that it points at iFreePage. Also fix the pointer map entry for
2238 ** iPtrPage.
2239 */
danielk1977a0bf2652004-11-04 14:30:04 +00002240 if( eType!=PTRMAP_ROOTPAGE ){
drh16a9b832007-05-05 18:39:25 +00002241 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00002242 if( rc!=SQLITE_OK ){
2243 return rc;
2244 }
danielk19773b8a05f2007-03-19 17:44:26 +00002245 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
danielk1977a0bf2652004-11-04 14:30:04 +00002246 if( rc!=SQLITE_OK ){
2247 releasePage(pPtrPage);
2248 return rc;
2249 }
danielk1977fdb7cdb2005-01-17 02:12:18 +00002250 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
danielk1977003ba062004-11-04 02:57:33 +00002251 releasePage(pPtrPage);
danielk1977fdb7cdb2005-01-17 02:12:18 +00002252 if( rc==SQLITE_OK ){
2253 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2254 }
danielk1977003ba062004-11-04 02:57:33 +00002255 }
danielk1977003ba062004-11-04 02:57:33 +00002256 return rc;
2257}
2258
danielk1977dddbcdc2007-04-26 14:42:34 +00002259/* Forward declaration required by incrVacuumStep(). */
drh4f0c5872007-03-26 22:05:01 +00002260static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
danielk1977687566d2004-11-02 12:56:41 +00002261
2262/*
danielk1977dddbcdc2007-04-26 14:42:34 +00002263** Perform a single step of an incremental-vacuum. If successful,
2264** return SQLITE_OK. If there is no work to do (and therefore no
2265** point in calling this function again), return SQLITE_DONE.
2266**
2267** More specificly, this function attempts to re-organize the
2268** database so that the last page of the file currently in use
2269** is no longer in use.
2270**
2271** If the nFin parameter is non-zero, the implementation assumes
2272** that the caller will keep calling incrVacuumStep() until
2273** it returns SQLITE_DONE or an error, and that nFin is the
2274** number of pages the database file will contain after this
2275** process is complete.
2276*/
2277static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2278 Pgno iLastPg; /* Last page in the database */
2279 Pgno nFreeList; /* Number of pages still on the free-list */
2280
drh1fee73e2007-08-29 04:00:57 +00002281 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977dddbcdc2007-04-26 14:42:34 +00002282 iLastPg = pBt->nTrunc;
2283 if( iLastPg==0 ){
danielk197789d40042008-11-17 14:20:56 +00002284 iLastPg = pagerPagecount(pBt);
danielk1977dddbcdc2007-04-26 14:42:34 +00002285 }
2286
2287 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2288 int rc;
2289 u8 eType;
2290 Pgno iPtrPage;
2291
2292 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2293 if( nFreeList==0 || nFin==iLastPg ){
2294 return SQLITE_DONE;
2295 }
2296
2297 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2298 if( rc!=SQLITE_OK ){
2299 return rc;
2300 }
2301 if( eType==PTRMAP_ROOTPAGE ){
2302 return SQLITE_CORRUPT_BKPT;
2303 }
2304
2305 if( eType==PTRMAP_FREEPAGE ){
2306 if( nFin==0 ){
2307 /* Remove the page from the files free-list. This is not required
danielk19774ef24492007-05-23 09:52:41 +00002308 ** if nFin is non-zero. In that case, the free-list will be
danielk1977dddbcdc2007-04-26 14:42:34 +00002309 ** truncated to zero after this function returns, so it doesn't
2310 ** matter if it still contains some garbage entries.
2311 */
2312 Pgno iFreePg;
2313 MemPage *pFreePg;
2314 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2315 if( rc!=SQLITE_OK ){
2316 return rc;
2317 }
2318 assert( iFreePg==iLastPg );
2319 releasePage(pFreePg);
2320 }
2321 } else {
2322 Pgno iFreePg; /* Index of free page to move pLastPg to */
2323 MemPage *pLastPg;
2324
drh16a9b832007-05-05 18:39:25 +00002325 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002326 if( rc!=SQLITE_OK ){
2327 return rc;
2328 }
2329
danielk1977b4626a32007-04-28 15:47:43 +00002330 /* If nFin is zero, this loop runs exactly once and page pLastPg
2331 ** is swapped with the first free page pulled off the free list.
2332 **
2333 ** On the other hand, if nFin is greater than zero, then keep
2334 ** looping until a free-page located within the first nFin pages
2335 ** of the file is found.
2336 */
danielk1977dddbcdc2007-04-26 14:42:34 +00002337 do {
2338 MemPage *pFreePg;
2339 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2340 if( rc!=SQLITE_OK ){
2341 releasePage(pLastPg);
2342 return rc;
2343 }
2344 releasePage(pFreePg);
2345 }while( nFin!=0 && iFreePg>nFin );
2346 assert( iFreePg<iLastPg );
danielk1977b4626a32007-04-28 15:47:43 +00002347
2348 rc = sqlite3PagerWrite(pLastPg->pDbPage);
danielk1977662278e2007-11-05 15:30:12 +00002349 if( rc==SQLITE_OK ){
danielk19774c999992008-07-16 18:17:55 +00002350 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
danielk1977662278e2007-11-05 15:30:12 +00002351 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002352 releasePage(pLastPg);
2353 if( rc!=SQLITE_OK ){
2354 return rc;
danielk1977662278e2007-11-05 15:30:12 +00002355 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002356 }
2357 }
2358
2359 pBt->nTrunc = iLastPg - 1;
2360 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2361 pBt->nTrunc--;
2362 }
2363 return SQLITE_OK;
2364}
2365
2366/*
2367** A write-transaction must be opened before calling this function.
2368** It performs a single unit of work towards an incremental vacuum.
2369**
2370** If the incremental vacuum is finished after this function has run,
2371** SQLITE_DONE is returned. If it is not finished, but no error occured,
2372** SQLITE_OK is returned. Otherwise an SQLite error code.
2373*/
2374int sqlite3BtreeIncrVacuum(Btree *p){
drhd677b3d2007-08-20 22:48:41 +00002375 int rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002376 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002377
2378 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002379 pBt->db = p->db;
danielk1977dddbcdc2007-04-26 14:42:34 +00002380 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2381 if( !pBt->autoVacuum ){
drhd677b3d2007-08-20 22:48:41 +00002382 rc = SQLITE_DONE;
2383 }else{
2384 invalidateAllOverflowCache(pBt);
2385 rc = incrVacuumStep(pBt, 0);
danielk1977dddbcdc2007-04-26 14:42:34 +00002386 }
drhd677b3d2007-08-20 22:48:41 +00002387 sqlite3BtreeLeave(p);
2388 return rc;
danielk1977dddbcdc2007-04-26 14:42:34 +00002389}
2390
2391/*
danielk19773b8a05f2007-03-19 17:44:26 +00002392** This routine is called prior to sqlite3PagerCommit when a transaction
danielk1977687566d2004-11-02 12:56:41 +00002393** is commited for an auto-vacuum database.
danielk197724168722007-04-02 05:07:47 +00002394**
2395** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2396** the database file should be truncated to during the commit process.
2397** i.e. the database has been reorganized so that only the first *pnTrunc
2398** pages are in use.
danielk1977687566d2004-11-02 12:56:41 +00002399*/
danielk197724168722007-04-02 05:07:47 +00002400static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
danielk1977dddbcdc2007-04-26 14:42:34 +00002401 int rc = SQLITE_OK;
danielk1977687566d2004-11-02 12:56:41 +00002402 Pager *pPager = pBt->pPager;
drhf94a1732008-09-30 17:18:17 +00002403 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002404
drh1fee73e2007-08-29 04:00:57 +00002405 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197792d4d7a2007-05-04 12:05:56 +00002406 invalidateAllOverflowCache(pBt);
danielk1977dddbcdc2007-04-26 14:42:34 +00002407 assert(pBt->autoVacuum);
2408 if( !pBt->incrVacuum ){
2409 Pgno nFin = 0;
danielk1977687566d2004-11-02 12:56:41 +00002410
danielk1977dddbcdc2007-04-26 14:42:34 +00002411 if( pBt->nTrunc==0 ){
2412 Pgno nFree;
2413 Pgno nPtrmap;
2414 const int pgsz = pBt->pageSize;
danielk197789d40042008-11-17 14:20:56 +00002415 Pgno nOrig = pagerPagecount(pBt);
danielk1977e5321f02007-04-27 07:05:44 +00002416
2417 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2418 return SQLITE_CORRUPT_BKPT;
2419 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002420 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2421 nOrig--;
danielk1977687566d2004-11-02 12:56:41 +00002422 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002423 nFree = get4byte(&pBt->pPage1->aData[36]);
2424 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2425 nFin = nOrig - nFree - nPtrmap;
2426 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2427 nFin--;
danielk1977ac11ee62005-01-15 12:45:51 +00002428 }
danielk1977dddbcdc2007-04-26 14:42:34 +00002429 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2430 nFin--;
2431 }
2432 }
danielk1977687566d2004-11-02 12:56:41 +00002433
danielk1977dddbcdc2007-04-26 14:42:34 +00002434 while( rc==SQLITE_OK ){
2435 rc = incrVacuumStep(pBt, nFin);
2436 }
2437 if( rc==SQLITE_DONE ){
2438 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2439 rc = SQLITE_OK;
danielk19770ba32df2008-05-07 07:13:16 +00002440 if( pBt->nTrunc && nFin ){
drh67f80b62007-07-23 19:26:17 +00002441 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
danielk1977dddbcdc2007-04-26 14:42:34 +00002442 put4byte(&pBt->pPage1->aData[32], 0);
2443 put4byte(&pBt->pPage1->aData[36], 0);
2444 pBt->nTrunc = nFin;
2445 }
2446 }
2447 if( rc!=SQLITE_OK ){
2448 sqlite3PagerRollback(pPager);
2449 }
danielk1977687566d2004-11-02 12:56:41 +00002450 }
2451
danielk1977dddbcdc2007-04-26 14:42:34 +00002452 if( rc==SQLITE_OK ){
2453 *pnTrunc = pBt->nTrunc;
2454 pBt->nTrunc = 0;
2455 }
danielk19773b8a05f2007-03-19 17:44:26 +00002456 assert( nRef==sqlite3PagerRefcount(pPager) );
danielk1977687566d2004-11-02 12:56:41 +00002457 return rc;
2458}
danielk1977dddbcdc2007-04-26 14:42:34 +00002459
shane831c3292008-11-10 17:14:58 +00002460#endif /* ifndef SQLITE_OMIT_AUTOVACUUM */
danielk1977687566d2004-11-02 12:56:41 +00002461
2462/*
drh80e35f42007-03-30 14:06:34 +00002463** This routine does the first phase of a two-phase commit. This routine
2464** causes a rollback journal to be created (if it does not already exist)
2465** and populated with enough information so that if a power loss occurs
2466** the database can be restored to its original state by playing back
2467** the journal. Then the contents of the journal are flushed out to
2468** the disk. After the journal is safely on oxide, the changes to the
2469** database are written into the database file and flushed to oxide.
2470** At the end of this call, the rollback journal still exists on the
2471** disk and we are still holding all locks, so the transaction has not
2472** committed. See sqlite3BtreeCommit() for the second phase of the
2473** commit process.
2474**
2475** This call is a no-op if no write-transaction is currently active on pBt.
2476**
2477** Otherwise, sync the database file for the btree pBt. zMaster points to
2478** the name of a master journal file that should be written into the
2479** individual journal file, or is NULL, indicating no master journal file
2480** (single database transaction).
2481**
2482** When this is called, the master journal should already have been
2483** created, populated with this journal pointer and synced to disk.
2484**
** Once this routine has returned, the only thing required to commit
2486** the write-transaction for this database file is to delete the journal.
2487*/
2488int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2489 int rc = SQLITE_OK;
2490 if( p->inTrans==TRANS_WRITE ){
2491 BtShared *pBt = p->pBt;
2492 Pgno nTrunc = 0;
drhd677b3d2007-08-20 22:48:41 +00002493 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002494 pBt->db = p->db;
drh80e35f42007-03-30 14:06:34 +00002495#ifndef SQLITE_OMIT_AUTOVACUUM
2496 if( pBt->autoVacuum ){
2497 rc = autoVacuumCommit(pBt, &nTrunc);
2498 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002499 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002500 return rc;
2501 }
2502 }
2503#endif
danielk1977f653d782008-03-20 11:04:21 +00002504 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
drhd677b3d2007-08-20 22:48:41 +00002505 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002506 }
2507 return rc;
2508}
2509
2510/*
drh2aa679f2001-06-25 02:11:07 +00002511** Commit the transaction currently in progress.
drh5e00f6c2001-09-13 13:46:56 +00002512**
drh6e345992007-03-30 11:12:08 +00002513** This routine implements the second phase of a 2-phase commit. The
** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
** be invoked prior to calling this routine.  That routine did
2516** all the work of writing information out to disk and flushing the
2517** contents so that they are written onto the disk platter. All this
2518** routine has to do is delete or truncate the rollback journal
2519** (which causes the transaction to commit) and drop locks.
2520**
drh5e00f6c2001-09-13 13:46:56 +00002521** This will release the write lock on the database file. If there
2522** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002523*/
drh80e35f42007-03-30 14:06:34 +00002524int sqlite3BtreeCommitPhaseTwo(Btree *p){
danielk1977aef0bf62005-12-30 16:28:01 +00002525 BtShared *pBt = p->pBt;
2526
drhd677b3d2007-08-20 22:48:41 +00002527 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002528 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002529 btreeIntegrity(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002530
2531 /* If the handle has a write-transaction open, commit the shared-btrees
2532 ** transaction and set the shared state to TRANS_READ.
2533 */
2534 if( p->inTrans==TRANS_WRITE ){
danielk19777f7bc662006-01-23 13:47:47 +00002535 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002536 assert( pBt->inTransaction==TRANS_WRITE );
2537 assert( pBt->nTransaction>0 );
drh80e35f42007-03-30 14:06:34 +00002538 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
danielk19777f7bc662006-01-23 13:47:47 +00002539 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00002540 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002541 return rc;
2542 }
danielk1977aef0bf62005-12-30 16:28:01 +00002543 pBt->inTransaction = TRANS_READ;
2544 pBt->inStmt = 0;
danielk1977ee5741e2004-05-31 10:01:34 +00002545 }
danielk19777f7bc662006-01-23 13:47:47 +00002546 unlockAllTables(p);
danielk1977aef0bf62005-12-30 16:28:01 +00002547
2548 /* If the handle has any kind of transaction open, decrement the transaction
2549 ** count of the shared btree. If the transaction count reaches 0, set
2550 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2551 ** will unlock the pager.
2552 */
2553 if( p->inTrans!=TRANS_NONE ){
2554 pBt->nTransaction--;
2555 if( 0==pBt->nTransaction ){
2556 pBt->inTransaction = TRANS_NONE;
2557 }
2558 }
2559
2560 /* Set the handles current transaction state to TRANS_NONE and unlock
2561 ** the pager if this call closed the only read or write transaction.
2562 */
2563 p->inTrans = TRANS_NONE;
drh5e00f6c2001-09-13 13:46:56 +00002564 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002565
2566 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002567 sqlite3BtreeLeave(p);
danielk19777f7bc662006-01-23 13:47:47 +00002568 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002569}
2570
drh80e35f42007-03-30 14:06:34 +00002571/*
2572** Do both phases of a commit.
2573*/
2574int sqlite3BtreeCommit(Btree *p){
2575 int rc;
drhd677b3d2007-08-20 22:48:41 +00002576 sqlite3BtreeEnter(p);
drh80e35f42007-03-30 14:06:34 +00002577 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2578 if( rc==SQLITE_OK ){
2579 rc = sqlite3BtreeCommitPhaseTwo(p);
2580 }
drhd677b3d2007-08-20 22:48:41 +00002581 sqlite3BtreeLeave(p);
drh80e35f42007-03-30 14:06:34 +00002582 return rc;
2583}
2584
danielk1977fbcd5852004-06-15 02:44:18 +00002585#ifndef NDEBUG
2586/*
2587** Return the number of write-cursors open on this handle. This is for use
2588** in assert() expressions, so it is only compiled if NDEBUG is not
2589** defined.
drhfb982642007-08-30 01:19:59 +00002590**
2591** For the purposes of this routine, a write-cursor is any cursor that
2592** is capable of writing to the databse. That means the cursor was
2593** originally opened for writing and the cursor has not be disabled
2594** by having its state changed to CURSOR_FAULT.
danielk1977fbcd5852004-06-15 02:44:18 +00002595*/
danielk1977aef0bf62005-12-30 16:28:01 +00002596static int countWriteCursors(BtShared *pBt){
danielk1977fbcd5852004-06-15 02:44:18 +00002597 BtCursor *pCur;
2598 int r = 0;
2599 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
drhfb982642007-08-30 01:19:59 +00002600 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
danielk1977fbcd5852004-06-15 02:44:18 +00002601 }
2602 return r;
2603}
2604#endif
2605
drhc39e0002004-05-07 23:50:57 +00002606/*
drhfb982642007-08-30 01:19:59 +00002607** This routine sets the state to CURSOR_FAULT and the error
2608** code to errCode for every cursor on BtShared that pBtree
2609** references.
2610**
2611** Every cursor is tripped, including cursors that belong
2612** to other database connections that happen to be sharing
2613** the cache with pBtree.
2614**
2615** This routine gets called when a rollback occurs.
2616** All cursors using the same cache must be tripped
2617** to prevent them from trying to use the btree after
2618** the rollback. The rollback may have deleted tables
2619** or moved root pages, so it is not sufficient to
2620** save the state of the cursor. The cursor must be
2621** invalidated.
2622*/
2623void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2624 BtCursor *p;
2625 sqlite3BtreeEnter(pBtree);
2626 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
danielk1977bc2ca9e2008-11-13 14:28:28 +00002627 int i;
danielk1977be51a652008-10-08 17:58:48 +00002628 sqlite3BtreeClearCursor(p);
drhfb982642007-08-30 01:19:59 +00002629 p->eState = CURSOR_FAULT;
2630 p->skip = errCode;
danielk1977bc2ca9e2008-11-13 14:28:28 +00002631 for(i=0; i<=p->iPage; i++){
2632 releasePage(p->apPage[i]);
2633 p->apPage[i] = 0;
2634 }
drhfb982642007-08-30 01:19:59 +00002635 }
2636 sqlite3BtreeLeave(pBtree);
2637}
2638
2639/*
drhecdc7532001-09-23 02:35:53 +00002640** Rollback the transaction in progress. All cursors will be
** invalidated by this operation.  Any attempt to use a cursor
2642** that was open at the beginning of this operation will result
2643** in an error.
drh5e00f6c2001-09-13 13:46:56 +00002644**
2645** This will release the write lock on the database file. If there
2646** are no active cursors, it also releases the read lock.
drha059ad02001-04-17 20:09:11 +00002647*/
danielk1977aef0bf62005-12-30 16:28:01 +00002648int sqlite3BtreeRollback(Btree *p){
danielk19778d34dfd2006-01-24 16:37:57 +00002649 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002650 BtShared *pBt = p->pBt;
drh24cd67e2004-05-10 16:18:47 +00002651 MemPage *pPage1;
danielk1977aef0bf62005-12-30 16:28:01 +00002652
drhd677b3d2007-08-20 22:48:41 +00002653 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002654 pBt->db = p->db;
danielk19772b8c13e2006-01-24 14:21:24 +00002655 rc = saveAllCursors(pBt, 0, 0);
danielk19778d34dfd2006-01-24 16:37:57 +00002656#ifndef SQLITE_OMIT_SHARED_CACHE
danielk19772b8c13e2006-01-24 14:21:24 +00002657 if( rc!=SQLITE_OK ){
danielk19778d34dfd2006-01-24 16:37:57 +00002658 /* This is a horrible situation. An IO or malloc() error occured whilst
2659 ** trying to save cursor positions. If this is an automatic rollback (as
2660 ** the result of a constraint, malloc() failure or IO error) then
2661 ** the cache may be internally inconsistent (not contain valid trees) so
2662 ** we cannot simply return the error to the caller. Instead, abort
2663 ** all queries that may be using any of the cursors that failed to save.
2664 */
drhfb982642007-08-30 01:19:59 +00002665 sqlite3BtreeTripAllCursors(p, rc);
danielk19772b8c13e2006-01-24 14:21:24 +00002666 }
danielk19778d34dfd2006-01-24 16:37:57 +00002667#endif
danielk1977aef0bf62005-12-30 16:28:01 +00002668 btreeIntegrity(p);
2669 unlockAllTables(p);
2670
2671 if( p->inTrans==TRANS_WRITE ){
danielk19778d34dfd2006-01-24 16:37:57 +00002672 int rc2;
danielk1977aef0bf62005-12-30 16:28:01 +00002673
danielk1977dddbcdc2007-04-26 14:42:34 +00002674#ifndef SQLITE_OMIT_AUTOVACUUM
2675 pBt->nTrunc = 0;
2676#endif
2677
danielk19778d34dfd2006-01-24 16:37:57 +00002678 assert( TRANS_WRITE==pBt->inTransaction );
danielk19773b8a05f2007-03-19 17:44:26 +00002679 rc2 = sqlite3PagerRollback(pBt->pPager);
danielk19778d34dfd2006-01-24 16:37:57 +00002680 if( rc2!=SQLITE_OK ){
2681 rc = rc2;
2682 }
2683
drh24cd67e2004-05-10 16:18:47 +00002684 /* The rollback may have destroyed the pPage1->aData value. So
drh16a9b832007-05-05 18:39:25 +00002685 ** call sqlite3BtreeGetPage() on page 1 again to make
2686 ** sure pPage1->aData is set correctly. */
2687 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
drh24cd67e2004-05-10 16:18:47 +00002688 releasePage(pPage1);
2689 }
danielk1977fbcd5852004-06-15 02:44:18 +00002690 assert( countWriteCursors(pBt)==0 );
danielk1977aef0bf62005-12-30 16:28:01 +00002691 pBt->inTransaction = TRANS_READ;
drh24cd67e2004-05-10 16:18:47 +00002692 }
danielk1977aef0bf62005-12-30 16:28:01 +00002693
2694 if( p->inTrans!=TRANS_NONE ){
2695 assert( pBt->nTransaction>0 );
2696 pBt->nTransaction--;
2697 if( 0==pBt->nTransaction ){
2698 pBt->inTransaction = TRANS_NONE;
2699 }
2700 }
2701
2702 p->inTrans = TRANS_NONE;
danielk1977ee5741e2004-05-31 10:01:34 +00002703 pBt->inStmt = 0;
drh5e00f6c2001-09-13 13:46:56 +00002704 unlockBtreeIfUnused(pBt);
danielk1977aef0bf62005-12-30 16:28:01 +00002705
2706 btreeIntegrity(p);
drhd677b3d2007-08-20 22:48:41 +00002707 sqlite3BtreeLeave(p);
drha059ad02001-04-17 20:09:11 +00002708 return rc;
2709}
2710
2711/*
drhab01f612004-05-22 02:55:23 +00002712** Start a statement subtransaction. The subtransaction can
2713** can be rolled back independently of the main transaction.
2714** You must start a transaction before starting a subtransaction.
2715** The subtransaction is ended automatically if the main transaction
drh663fc632002-02-02 18:49:19 +00002716** commits or rolls back.
2717**
drhab01f612004-05-22 02:55:23 +00002718** Only one subtransaction may be active at a time. It is an error to try
2719** to start a new subtransaction if another subtransaction is already active.
2720**
2721** Statement subtransactions are used around individual SQL statements
2722** that are contained within a BEGIN...COMMIT block. If a constraint
2723** error occurs within the statement, the effect of that one statement
2724** can be rolled back without having to rollback the entire transaction.
drh663fc632002-02-02 18:49:19 +00002725*/
danielk1977aef0bf62005-12-30 16:28:01 +00002726int sqlite3BtreeBeginStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002727 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002728 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002729 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002730 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00002731 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
drhd677b3d2007-08-20 22:48:41 +00002732 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2733 }else{
2734 assert( pBt->inTransaction==TRANS_WRITE );
danielk1977fd7f0452008-12-17 17:30:26 +00002735 if( pBt->readOnly ){
2736 rc = SQLITE_OK;
2737 }else{
2738 /* At the pager level, a statement transaction is a savepoint with
2739 ** an index greater than all savepoints created explicitly using
2740 ** SQL statements. It is illegal to open, release or rollback any
2741 ** such savepoints while the statement transaction savepoint is active.
2742 */
2743 int iStmtpoint = p->db->nSavepoint + 1;
2744 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStmtpoint);
2745 }
drhd677b3d2007-08-20 22:48:41 +00002746 pBt->inStmt = 1;
drh0d65dc02002-02-03 00:56:09 +00002747 }
drhd677b3d2007-08-20 22:48:41 +00002748 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002749 return rc;
2750}
2751
drh663fc632002-02-02 18:49:19 +00002752/*
drhab01f612004-05-22 02:55:23 +00002753** Commit the statment subtransaction currently in progress. If no
2754** subtransaction is active, this is a no-op.
drh663fc632002-02-02 18:49:19 +00002755*/
danielk1977aef0bf62005-12-30 16:28:01 +00002756int sqlite3BtreeCommitStmt(Btree *p){
drh663fc632002-02-02 18:49:19 +00002757 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00002758 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002759 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002760 pBt->db = p->db;
drh3aac2dd2004-04-26 14:10:20 +00002761 if( pBt->inStmt && !pBt->readOnly ){
danielk1977fd7f0452008-12-17 17:30:26 +00002762 int iStmtpoint = p->db->nSavepoint;
2763 rc = sqlite3PagerSavepoint(pBt->pPager, SAVEPOINT_RELEASE, iStmtpoint);
drh663fc632002-02-02 18:49:19 +00002764 }else{
2765 rc = SQLITE_OK;
2766 }
drh3aac2dd2004-04-26 14:10:20 +00002767 pBt->inStmt = 0;
drhd677b3d2007-08-20 22:48:41 +00002768 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002769 return rc;
2770}
2771
2772/*
drhab01f612004-05-22 02:55:23 +00002773** Rollback the active statement subtransaction. If no subtransaction
2774** is active this routine is a no-op.
drh663fc632002-02-02 18:49:19 +00002775**
drhab01f612004-05-22 02:55:23 +00002776** All cursors will be invalidated by this operation. Any attempt
drh663fc632002-02-02 18:49:19 +00002777** to use a cursor that was open at the beginning of this operation
2778** will result in an error.
2779*/
danielk1977aef0bf62005-12-30 16:28:01 +00002780int sqlite3BtreeRollbackStmt(Btree *p){
danielk197797a227c2006-01-20 16:32:04 +00002781 int rc = SQLITE_OK;
danielk1977aef0bf62005-12-30 16:28:01 +00002782 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00002783 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002784 pBt->db = p->db;
danielk197797a227c2006-01-20 16:32:04 +00002785 if( pBt->inStmt && !pBt->readOnly ){
danielk1977fd7f0452008-12-17 17:30:26 +00002786 int iStmtpoint = p->db->nSavepoint;
2787 rc = sqlite3PagerSavepoint(pBt->pPager, SAVEPOINT_ROLLBACK, iStmtpoint);
2788 if( rc==SQLITE_OK ){
2789 rc = sqlite3PagerSavepoint(pBt->pPager, SAVEPOINT_RELEASE, iStmtpoint);
2790 }
danielk197797a227c2006-01-20 16:32:04 +00002791 pBt->inStmt = 0;
2792 }
drhd677b3d2007-08-20 22:48:41 +00002793 sqlite3BtreeLeave(p);
drh663fc632002-02-02 18:49:19 +00002794 return rc;
2795}
2796
2797/*
danielk1977fd7f0452008-12-17 17:30:26 +00002798** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
2799** or SAVEPOINT_RELEASE. This function either releases or rolls back the
2800** savepoint identified by parameter iSavepoint, depending on the value of
2801** op.
2802*/
2803int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
2804 int rc = SQLITE_OK;
2805 if( p && p->inTrans==TRANS_WRITE ){
2806 BtShared *pBt = p->pBt;
2807 assert( pBt->inStmt==0 );
2808 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
2809 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
2810 sqlite3BtreeEnter(p);
2811 pBt->db = p->db;
2812 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
2813 sqlite3BtreeLeave(p);
2814 }
2815 return rc;
2816}
2817
2818/*
drh8b2f49b2001-06-08 00:21:52 +00002819** Create a new cursor for the BTree whose root is on the page
2820** iTable. The act of acquiring a cursor gets a read lock on
2821** the database file.
drh1bee3d72001-10-15 00:44:35 +00002822**
2823** If wrFlag==0, then the cursor can only be used for reading.
drhf74b8d92002-09-01 23:20:45 +00002824** If wrFlag==1, then the cursor can be used for reading or for
2825** writing if other conditions for writing are also met. These
2826** are the conditions that must be met in order for writing to
2827** be allowed:
drh6446c4d2001-12-15 14:22:18 +00002828**
drhf74b8d92002-09-01 23:20:45 +00002829** 1: The cursor must have been opened with wrFlag==1
2830**
drhfe5d71d2007-03-19 11:54:10 +00002831** 2: Other database connections that share the same pager cache
2832** but which are not in the READ_UNCOMMITTED state may not have
2833** cursors open with wrFlag==0 on the same table. Otherwise
2834** the changes made by this write cursor would be visible to
2835** the read cursors in the other database connection.
drhf74b8d92002-09-01 23:20:45 +00002836**
2837** 3: The database must be writable (not on read-only media)
2838**
2839** 4: There must be an active transaction.
2840**
drh6446c4d2001-12-15 14:22:18 +00002841** No checking is done to make sure that page iTable really is the
2842** root page of a b-tree. If it is not, then the cursor acquired
2843** will not work correctly.
danielk197771d5d2c2008-09-29 11:49:47 +00002844**
2845** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
2846** pointed to by pCur have been zeroed by the caller.
drha059ad02001-04-17 20:09:11 +00002847*/
drhd677b3d2007-08-20 22:48:41 +00002848static int btreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002849 Btree *p, /* The btree */
2850 int iTable, /* Root page of table to open */
2851 int wrFlag, /* 1 to write. 0 read-only */
2852 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2853 BtCursor *pCur /* Space for new cursor */
drh3aac2dd2004-04-26 14:10:20 +00002854){
drha059ad02001-04-17 20:09:11 +00002855 int rc;
danielk197789d40042008-11-17 14:20:56 +00002856 Pgno nPage;
danielk1977aef0bf62005-12-30 16:28:01 +00002857 BtShared *pBt = p->pBt;
drhecdc7532001-09-23 02:35:53 +00002858
drh1fee73e2007-08-29 04:00:57 +00002859 assert( sqlite3BtreeHoldsMutex(p) );
drhf49661a2008-12-10 16:45:50 +00002860 assert( wrFlag==0 || wrFlag==1 );
drh8dcd7ca2004-08-08 19:43:29 +00002861 if( wrFlag ){
drh8dcd7ca2004-08-08 19:43:29 +00002862 if( pBt->readOnly ){
2863 return SQLITE_READONLY;
2864 }
danielk19773588ceb2008-06-10 17:30:26 +00002865 if( checkReadLocks(p, iTable, 0, 0) ){
drh8dcd7ca2004-08-08 19:43:29 +00002866 return SQLITE_LOCKED;
2867 }
drha0c9a112004-03-10 13:42:37 +00002868 }
danielk1977aef0bf62005-12-30 16:28:01 +00002869
drh4b70f112004-05-02 21:12:19 +00002870 if( pBt->pPage1==0 ){
danielk1977aef0bf62005-12-30 16:28:01 +00002871 rc = lockBtreeWithRetry(p);
drha059ad02001-04-17 20:09:11 +00002872 if( rc!=SQLITE_OK ){
drha059ad02001-04-17 20:09:11 +00002873 return rc;
2874 }
drh1831f182007-04-24 17:35:59 +00002875 if( pBt->readOnly && wrFlag ){
2876 return SQLITE_READONLY;
2877 }
drha059ad02001-04-17 20:09:11 +00002878 }
drh8b2f49b2001-06-08 00:21:52 +00002879 pCur->pgnoRoot = (Pgno)iTable;
danielk197789d40042008-11-17 14:20:56 +00002880 rc = sqlite3PagerPagecount(pBt->pPager, (int *)&nPage);
2881 if( rc!=SQLITE_OK ){
2882 return rc;
2883 }
2884 if( iTable==1 && nPage==0 ){
drh24cd67e2004-05-10 16:18:47 +00002885 rc = SQLITE_EMPTY;
2886 goto create_cursor_exception;
2887 }
danielk197771d5d2c2008-09-29 11:49:47 +00002888 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
drhbd03cae2001-06-02 02:40:57 +00002889 if( rc!=SQLITE_OK ){
2890 goto create_cursor_exception;
drha059ad02001-04-17 20:09:11 +00002891 }
danielk1977aef0bf62005-12-30 16:28:01 +00002892
danielk1977aef0bf62005-12-30 16:28:01 +00002893 /* Now that no other errors can occur, finish filling in the BtCursor
2894 ** variables, link the cursor into the BtShared list and set *ppCur (the
2895 ** output argument to this function).
2896 */
drh1e968a02008-03-25 00:22:21 +00002897 pCur->pKeyInfo = pKeyInfo;
danielk1977aef0bf62005-12-30 16:28:01 +00002898 pCur->pBtree = p;
drhd0679ed2007-08-28 22:24:34 +00002899 pCur->pBt = pBt;
drhf49661a2008-12-10 16:45:50 +00002900 pCur->wrFlag = (u8)wrFlag;
drha059ad02001-04-17 20:09:11 +00002901 pCur->pNext = pBt->pCursor;
2902 if( pCur->pNext ){
2903 pCur->pNext->pPrev = pCur;
2904 }
2905 pBt->pCursor = pCur;
danielk1977da184232006-01-05 11:34:32 +00002906 pCur->eState = CURSOR_INVALID;
drhbd03cae2001-06-02 02:40:57 +00002907
danielk1977aef0bf62005-12-30 16:28:01 +00002908 return SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00002909
drhbd03cae2001-06-02 02:40:57 +00002910create_cursor_exception:
danielk197771d5d2c2008-09-29 11:49:47 +00002911 releasePage(pCur->apPage[0]);
drh5e00f6c2001-09-13 13:46:56 +00002912 unlockBtreeIfUnused(pBt);
drhbd03cae2001-06-02 02:40:57 +00002913 return rc;
drha059ad02001-04-17 20:09:11 +00002914}
drhd677b3d2007-08-20 22:48:41 +00002915int sqlite3BtreeCursor(
danielk1977cd3e8f72008-03-25 09:47:35 +00002916 Btree *p, /* The btree */
2917 int iTable, /* Root page of table to open */
2918 int wrFlag, /* 1 to write. 0 read-only */
2919 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2920 BtCursor *pCur /* Write new cursor here */
drhd677b3d2007-08-20 22:48:41 +00002921){
2922 int rc;
2923 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00002924 p->pBt->db = p->db;
danielk1977cd3e8f72008-03-25 09:47:35 +00002925 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
drhd677b3d2007-08-20 22:48:41 +00002926 sqlite3BtreeLeave(p);
2927 return rc;
2928}
danielk1977cd3e8f72008-03-25 09:47:35 +00002929int sqlite3BtreeCursorSize(){
2930 return sizeof(BtCursor);
2931}
2932
drhd677b3d2007-08-20 22:48:41 +00002933
drha059ad02001-04-17 20:09:11 +00002934
2935/*
drh5e00f6c2001-09-13 13:46:56 +00002936** Close a cursor. The read lock on the database file is released
drhbd03cae2001-06-02 02:40:57 +00002937** when the last cursor is closed.
drha059ad02001-04-17 20:09:11 +00002938*/
drh3aac2dd2004-04-26 14:10:20 +00002939int sqlite3BtreeCloseCursor(BtCursor *pCur){
drhff0587c2007-08-29 17:43:19 +00002940 Btree *pBtree = pCur->pBtree;
danielk1977cd3e8f72008-03-25 09:47:35 +00002941 if( pBtree ){
danielk197771d5d2c2008-09-29 11:49:47 +00002942 int i;
danielk1977cd3e8f72008-03-25 09:47:35 +00002943 BtShared *pBt = pCur->pBt;
2944 sqlite3BtreeEnter(pBtree);
2945 pBt->db = pBtree->db;
danielk1977be51a652008-10-08 17:58:48 +00002946 sqlite3BtreeClearCursor(pCur);
danielk1977cd3e8f72008-03-25 09:47:35 +00002947 if( pCur->pPrev ){
2948 pCur->pPrev->pNext = pCur->pNext;
2949 }else{
2950 pBt->pCursor = pCur->pNext;
2951 }
2952 if( pCur->pNext ){
2953 pCur->pNext->pPrev = pCur->pPrev;
2954 }
danielk197771d5d2c2008-09-29 11:49:47 +00002955 for(i=0; i<=pCur->iPage; i++){
2956 releasePage(pCur->apPage[i]);
2957 }
danielk1977cd3e8f72008-03-25 09:47:35 +00002958 unlockBtreeIfUnused(pBt);
2959 invalidateOverflowCache(pCur);
2960 /* sqlite3_free(pCur); */
2961 sqlite3BtreeLeave(pBtree);
drha059ad02001-04-17 20:09:11 +00002962 }
drh8c42ca92001-06-22 19:15:00 +00002963 return SQLITE_OK;
drha059ad02001-04-17 20:09:11 +00002964}
2965
drh7e3b0a02001-04-28 16:52:40 +00002966/*
drh5e2f8b92001-05-28 00:41:15 +00002967** Make a temporary cursor by filling in the fields of pTempCur.
2968** The temporary cursor is not on the cursor list for the Btree.
2969*/
drh16a9b832007-05-05 18:39:25 +00002970void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002971 int i;
drh1fee73e2007-08-29 04:00:57 +00002972 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002973 memcpy(pTempCur, pCur, sizeof(BtCursor));
drh5e2f8b92001-05-28 00:41:15 +00002974 pTempCur->pNext = 0;
2975 pTempCur->pPrev = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00002976 for(i=0; i<=pTempCur->iPage; i++){
2977 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002978 }
danielk197736e20932008-11-26 07:40:30 +00002979 assert( pTempCur->pKey==0 );
drh5e2f8b92001-05-28 00:41:15 +00002980}
2981
2982/*
drhbd03cae2001-06-02 02:40:57 +00002983** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
drh5e2f8b92001-05-28 00:41:15 +00002984** function above.
2985*/
drh16a9b832007-05-05 18:39:25 +00002986void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00002987 int i;
drh1fee73e2007-08-29 04:00:57 +00002988 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00002989 for(i=0; i<=pCur->iPage; i++){
2990 sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
drhecdc7532001-09-23 02:35:53 +00002991 }
danielk197736e20932008-11-26 07:40:30 +00002992 sqlite3_free(pCur->pKey);
drh5e2f8b92001-05-28 00:41:15 +00002993}
2994
2995/*
drh86057612007-06-26 01:04:48 +00002996** Make sure the BtCursor* given in the argument has a valid
2997** BtCursor.info structure. If it is not already valid, call
danielk19771cc5ed82007-05-16 17:28:43 +00002998** sqlite3BtreeParseCell() to fill it in.
drhab01f612004-05-22 02:55:23 +00002999**
3000** BtCursor.info is a cache of the information in the current cell.
drh16a9b832007-05-05 18:39:25 +00003001** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
drh86057612007-06-26 01:04:48 +00003002**
3003** 2007-06-25: There is a bug in some versions of MSVC that cause the
3004** compiler to crash when getCellInfo() is implemented as a macro.
3005** But there is a measureable speed advantage to using the macro on gcc
3006** (when less compiler optimizations like -Os or -O0 are used and the
3007** compiler is not doing agressive inlining.) So we use a real function
3008** for MSVC and a macro for everything else. Ticket #2457.
drh9188b382004-05-14 21:12:22 +00003009*/
drh9188b382004-05-14 21:12:22 +00003010#ifndef NDEBUG
danielk19771cc5ed82007-05-16 17:28:43 +00003011 static void assertCellInfo(BtCursor *pCur){
drh9188b382004-05-14 21:12:22 +00003012 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00003013 int iPage = pCur->iPage;
drh51c6d962004-06-06 00:42:25 +00003014 memset(&info, 0, sizeof(info));
danielk197771d5d2c2008-09-29 11:49:47 +00003015 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
drh9188b382004-05-14 21:12:22 +00003016 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
drh9188b382004-05-14 21:12:22 +00003017 }
danielk19771cc5ed82007-05-16 17:28:43 +00003018#else
3019 #define assertCellInfo(x)
3020#endif
drh86057612007-06-26 01:04:48 +00003021#ifdef _MSC_VER
3022 /* Use a real function in MSVC to work around bugs in that compiler. */
3023 static void getCellInfo(BtCursor *pCur){
3024 if( pCur->info.nSize==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003025 int iPage = pCur->iPage;
3026 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
drha2c20e42008-03-29 16:01:04 +00003027 pCur->validNKey = 1;
drh86057612007-06-26 01:04:48 +00003028 }else{
3029 assertCellInfo(pCur);
3030 }
3031 }
3032#else /* if not _MSC_VER */
3033 /* Use a macro in all other compilers so that the function is inlined */
danielk197771d5d2c2008-09-29 11:49:47 +00003034#define getCellInfo(pCur) \
3035 if( pCur->info.nSize==0 ){ \
3036 int iPage = pCur->iPage; \
3037 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3038 pCur->validNKey = 1; \
3039 }else{ \
3040 assertCellInfo(pCur); \
drh86057612007-06-26 01:04:48 +00003041 }
3042#endif /* _MSC_VER */
drh9188b382004-05-14 21:12:22 +00003043
3044/*
drh3aac2dd2004-04-26 14:10:20 +00003045** Set *pSize to the size of the buffer needed to hold the value of
3046** the key for the current entry. If the cursor is not pointing
3047** to a valid entry, *pSize is set to 0.
3048**
drh4b70f112004-05-02 21:12:19 +00003049** For a table with the INTKEY flag set, this routine returns the key
drh3aac2dd2004-04-26 14:10:20 +00003050** itself, not the number of bytes in the key.
drh7e3b0a02001-04-28 16:52:40 +00003051*/
drh4a1c3802004-05-12 15:15:47 +00003052int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
drhd677b3d2007-08-20 22:48:41 +00003053 int rc;
3054
drh1fee73e2007-08-29 04:00:57 +00003055 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003056 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003057 if( rc==SQLITE_OK ){
3058 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3059 if( pCur->eState==CURSOR_INVALID ){
3060 *pSize = 0;
3061 }else{
drh86057612007-06-26 01:04:48 +00003062 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00003063 *pSize = pCur->info.nKey;
3064 }
drh72f82862001-05-24 21:06:34 +00003065 }
danielk1977da184232006-01-05 11:34:32 +00003066 return rc;
drha059ad02001-04-17 20:09:11 +00003067}
drh2af926b2001-05-15 00:39:25 +00003068
drh72f82862001-05-24 21:06:34 +00003069/*
drh0e1c19e2004-05-11 00:58:56 +00003070** Set *pSize to the number of bytes of data in the entry the
3071** cursor currently points to. Always return SQLITE_OK.
3072** Failure is not possible. If the cursor is not currently
3073** pointing to an entry (which can happen, for example, if
3074** the database is empty) then *pSize is set to 0.
3075*/
3076int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
drhd677b3d2007-08-20 22:48:41 +00003077 int rc;
3078
drh1fee73e2007-08-29 04:00:57 +00003079 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003080 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003081 if( rc==SQLITE_OK ){
3082 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3083 if( pCur->eState==CURSOR_INVALID ){
3084 /* Not pointing at a valid entry - set *pSize to 0. */
3085 *pSize = 0;
3086 }else{
drh86057612007-06-26 01:04:48 +00003087 getCellInfo(pCur);
danielk1977da184232006-01-05 11:34:32 +00003088 *pSize = pCur->info.nData;
3089 }
drh0e1c19e2004-05-11 00:58:56 +00003090 }
danielk1977da184232006-01-05 11:34:32 +00003091 return rc;
drh0e1c19e2004-05-11 00:58:56 +00003092}
3093
3094/*
danielk1977d04417962007-05-02 13:16:30 +00003095** Given the page number of an overflow page in the database (parameter
3096** ovfl), this function finds the page number of the next page in the
3097** linked list of overflow pages. If possible, it uses the auto-vacuum
3098** pointer-map data instead of reading the content of page ovfl to do so.
3099**
3100** If an error occurs an SQLite error code is returned. Otherwise:
3101**
3102** Unless pPgnoNext is NULL, the page number of the next overflow
3103** page in the linked list is written to *pPgnoNext. If page ovfl
drh85b623f2007-12-13 21:54:09 +00003104** is the last page in its linked list, *pPgnoNext is set to zero.
danielk1977d04417962007-05-02 13:16:30 +00003105**
3106** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3107** for page ovfl. The underlying pager page may have been requested
3108** with the noContent flag set, so the page data accessable via
3109** this handle may not be trusted.
3110*/
3111static int getOverflowPage(
3112 BtShared *pBt,
3113 Pgno ovfl, /* Overflow page */
3114 MemPage **ppPage, /* OUT: MemPage handle */
3115 Pgno *pPgnoNext /* OUT: Next overflow page number */
3116){
3117 Pgno next = 0;
drh1bd10f82008-12-10 21:19:56 +00003118 int rc = SQLITE_OK;
danielk1977d04417962007-05-02 13:16:30 +00003119
drh1fee73e2007-08-29 04:00:57 +00003120 assert( sqlite3_mutex_held(pBt->mutex) );
danielk1977d04417962007-05-02 13:16:30 +00003121 /* One of these must not be NULL. Otherwise, why call this function? */
3122 assert(ppPage || pPgnoNext);
3123
3124 /* If pPgnoNext is NULL, then this function is being called to obtain
3125 ** a MemPage* reference only. No page-data is required in this case.
3126 */
3127 if( !pPgnoNext ){
drh16a9b832007-05-05 18:39:25 +00003128 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
danielk1977d04417962007-05-02 13:16:30 +00003129 }
3130
3131#ifndef SQLITE_OMIT_AUTOVACUUM
3132 /* Try to find the next page in the overflow list using the
3133 ** autovacuum pointer-map pages. Guess that the next page in
3134 ** the overflow list is page number (ovfl+1). If that guess turns
3135 ** out to be wrong, fall back to loading the data of page
3136 ** number ovfl to determine the next page number.
3137 */
3138 if( pBt->autoVacuum ){
3139 Pgno pgno;
3140 Pgno iGuess = ovfl+1;
3141 u8 eType;
3142
3143 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3144 iGuess++;
3145 }
3146
danielk197789d40042008-11-17 14:20:56 +00003147 if( iGuess<=pagerPagecount(pBt) ){
danielk1977d04417962007-05-02 13:16:30 +00003148 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3149 if( rc!=SQLITE_OK ){
3150 return rc;
3151 }
3152 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3153 next = iGuess;
3154 }
3155 }
3156 }
3157#endif
3158
3159 if( next==0 || ppPage ){
3160 MemPage *pPage = 0;
3161
drh16a9b832007-05-05 18:39:25 +00003162 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
danielk1977d04417962007-05-02 13:16:30 +00003163 assert(rc==SQLITE_OK || pPage==0);
3164 if( next==0 && rc==SQLITE_OK ){
3165 next = get4byte(pPage->aData);
3166 }
3167
3168 if( ppPage ){
3169 *ppPage = pPage;
3170 }else{
3171 releasePage(pPage);
3172 }
3173 }
3174 *pPgnoNext = next;
3175
3176 return rc;
3177}
3178
danielk1977da107192007-05-04 08:32:13 +00003179/*
3180** Copy data from a buffer to a page, or from a page to a buffer.
3181**
3182** pPayload is a pointer to data stored on database page pDbPage.
3183** If argument eOp is false, then nByte bytes of data are copied
3184** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3185** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3186** of data are copied from the buffer pBuf to pPayload.
3187**
3188** SQLITE_OK is returned on success, otherwise an error code.
3189*/
3190static int copyPayload(
3191 void *pPayload, /* Pointer to page data */
3192 void *pBuf, /* Pointer to buffer */
3193 int nByte, /* Number of bytes to copy */
3194 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3195 DbPage *pDbPage /* Page containing pPayload */
3196){
3197 if( eOp ){
3198 /* Copy data from buffer to page (a write operation) */
3199 int rc = sqlite3PagerWrite(pDbPage);
3200 if( rc!=SQLITE_OK ){
3201 return rc;
3202 }
3203 memcpy(pPayload, pBuf, nByte);
3204 }else{
3205 /* Copy data from page to buffer (a read operation) */
3206 memcpy(pBuf, pPayload, nByte);
3207 }
3208 return SQLITE_OK;
3209}
danielk1977d04417962007-05-02 13:16:30 +00003210
3211/*
danielk19779f8d6402007-05-02 17:48:45 +00003212** This function is used to read or overwrite payload information
3213** for the entry that the pCur cursor is pointing to. If the eOp
3214** parameter is 0, this is a read operation (data copied into
3215** buffer pBuf). If it is non-zero, a write (data copied from
3216** buffer pBuf).
3217**
3218** A total of "amt" bytes are read or written beginning at "offset".
3219** Data is read to or from the buffer pBuf.
drh72f82862001-05-24 21:06:34 +00003220**
3221** This routine does not make a distinction between key and data.
danielk19779f8d6402007-05-02 17:48:45 +00003222** It just reads or writes bytes from the payload area. Data might
3223** appear on the main page or be scattered out on multiple overflow
3224** pages.
danielk1977da107192007-05-04 08:32:13 +00003225**
danielk1977dcbb5d32007-05-04 18:36:44 +00003226** If the BtCursor.isIncrblobHandle flag is set, and the current
danielk1977da107192007-05-04 08:32:13 +00003227** cursor entry uses one or more overflow pages, this function
** allocates space for and lazily populates the overflow page-list
3229** cache array (BtCursor.aOverflow). Subsequent calls use this
3230** cache to make seeking to the supplied offset more efficient.
3231**
3232** Once an overflow page-list cache has been allocated, it may be
3233** invalidated if some other cursor writes to the same table, or if
3234** the cursor is moved to a different row. Additionally, in auto-vacuum
3235** mode, the following events may invalidate an overflow page-list cache.
3236**
3237** * An incremental vacuum,
3238** * A commit in auto_vacuum="full" mode,
3239** * Creating a table (may require moving an overflow page).
drh72f82862001-05-24 21:06:34 +00003240*/
danielk19779f8d6402007-05-02 17:48:45 +00003241static int accessPayload(
drh3aac2dd2004-04-26 14:10:20 +00003242 BtCursor *pCur, /* Cursor pointing to entry to read from */
danielk197789d40042008-11-17 14:20:56 +00003243 u32 offset, /* Begin reading this far into payload */
3244 u32 amt, /* Read this many bytes */
drh3aac2dd2004-04-26 14:10:20 +00003245 unsigned char *pBuf, /* Write the bytes into this buffer */
danielk19779f8d6402007-05-02 17:48:45 +00003246 int skipKey, /* offset begins at data if this is true */
3247 int eOp /* zero to read. non-zero to write. */
drh3aac2dd2004-04-26 14:10:20 +00003248){
3249 unsigned char *aPayload;
danielk1977da107192007-05-04 08:32:13 +00003250 int rc = SQLITE_OK;
drhfa1a98a2004-05-14 19:08:17 +00003251 u32 nKey;
danielk19772dec9702007-05-02 16:48:37 +00003252 int iIdx = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003253 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
danielk19770d065412008-11-12 18:21:36 +00003254 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
drh3aac2dd2004-04-26 14:10:20 +00003255
danielk1977da107192007-05-04 08:32:13 +00003256 assert( pPage );
danielk1977da184232006-01-05 11:34:32 +00003257 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003258 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drh1fee73e2007-08-29 04:00:57 +00003259 assert( cursorHoldsMutex(pCur) );
danielk1977da107192007-05-04 08:32:13 +00003260
drh86057612007-06-26 01:04:48 +00003261 getCellInfo(pCur);
drh366fda62006-01-13 02:35:09 +00003262 aPayload = pCur->info.pCell + pCur->info.nHeader;
drhf49661a2008-12-10 16:45:50 +00003263 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
danielk1977da107192007-05-04 08:32:13 +00003264
drh3aac2dd2004-04-26 14:10:20 +00003265 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003266 offset += nKey;
drh3aac2dd2004-04-26 14:10:20 +00003267 }
danielk19770d065412008-11-12 18:21:36 +00003268 if( offset+amt > nKey+pCur->info.nData
3269 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3270 ){
danielk1977da107192007-05-04 08:32:13 +00003271 /* Trying to read or write past the end of the data is an error */
danielk197767fd7a92008-09-10 17:53:35 +00003272 return SQLITE_CORRUPT_BKPT;
drh3aac2dd2004-04-26 14:10:20 +00003273 }
danielk1977da107192007-05-04 08:32:13 +00003274
3275 /* Check if data must be read/written to/from the btree page itself. */
drhfa1a98a2004-05-14 19:08:17 +00003276 if( offset<pCur->info.nLocal ){
drh2af926b2001-05-15 00:39:25 +00003277 int a = amt;
drhfa1a98a2004-05-14 19:08:17 +00003278 if( a+offset>pCur->info.nLocal ){
3279 a = pCur->info.nLocal - offset;
drh2af926b2001-05-15 00:39:25 +00003280 }
danielk1977da107192007-05-04 08:32:13 +00003281 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
drh2aa679f2001-06-25 02:11:07 +00003282 offset = 0;
drha34b6762004-05-07 13:30:42 +00003283 pBuf += a;
drh2af926b2001-05-15 00:39:25 +00003284 amt -= a;
drhdd793422001-06-28 01:54:48 +00003285 }else{
drhfa1a98a2004-05-14 19:08:17 +00003286 offset -= pCur->info.nLocal;
drhbd03cae2001-06-02 02:40:57 +00003287 }
danielk1977da107192007-05-04 08:32:13 +00003288
3289 if( rc==SQLITE_OK && amt>0 ){
danielk197789d40042008-11-17 14:20:56 +00003290 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
danielk1977da107192007-05-04 08:32:13 +00003291 Pgno nextPage;
3292
drhfa1a98a2004-05-14 19:08:17 +00003293 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
danielk1977da107192007-05-04 08:32:13 +00003294
danielk19772dec9702007-05-02 16:48:37 +00003295#ifndef SQLITE_OMIT_INCRBLOB
danielk1977dcbb5d32007-05-04 18:36:44 +00003296 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
danielk1977da107192007-05-04 08:32:13 +00003297 ** has not been allocated, allocate it now. The array is sized at
3298 ** one entry for each overflow page in the overflow chain. The
3299 ** page number of the first overflow page is stored in aOverflow[0],
3300 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3301 ** (the cache is lazily populated).
3302 */
danielk1977dcbb5d32007-05-04 18:36:44 +00003303 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
danielk19772dec9702007-05-02 16:48:37 +00003304 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
drh17435752007-08-16 04:30:38 +00003305 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
danielk19772dec9702007-05-02 16:48:37 +00003306 if( nOvfl && !pCur->aOverflow ){
danielk1977da107192007-05-04 08:32:13 +00003307 rc = SQLITE_NOMEM;
danielk19772dec9702007-05-02 16:48:37 +00003308 }
3309 }
danielk1977da107192007-05-04 08:32:13 +00003310
3311 /* If the overflow page-list cache has been allocated and the
3312 ** entry for the first required overflow page is valid, skip
3313 ** directly to it.
3314 */
danielk19772dec9702007-05-02 16:48:37 +00003315 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3316 iIdx = (offset/ovflSize);
3317 nextPage = pCur->aOverflow[iIdx];
3318 offset = (offset%ovflSize);
3319 }
3320#endif
danielk1977da107192007-05-04 08:32:13 +00003321
3322 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3323
3324#ifndef SQLITE_OMIT_INCRBLOB
3325 /* If required, populate the overflow page-list cache. */
3326 if( pCur->aOverflow ){
3327 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3328 pCur->aOverflow[iIdx] = nextPage;
3329 }
3330#endif
3331
danielk1977d04417962007-05-02 13:16:30 +00003332 if( offset>=ovflSize ){
3333 /* The only reason to read this page is to obtain the page
danielk1977da107192007-05-04 08:32:13 +00003334 ** number for the next page in the overflow chain. The page
drhfd131da2007-08-07 17:13:03 +00003335 ** data is not required. So first try to lookup the overflow
3336 ** page-list cache, if any, then fall back to the getOverflowPage()
danielk1977da107192007-05-04 08:32:13 +00003337 ** function.
danielk1977d04417962007-05-02 13:16:30 +00003338 */
danielk19772dec9702007-05-02 16:48:37 +00003339#ifndef SQLITE_OMIT_INCRBLOB
danielk1977da107192007-05-04 08:32:13 +00003340 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3341 nextPage = pCur->aOverflow[iIdx+1];
3342 } else
danielk19772dec9702007-05-02 16:48:37 +00003343#endif
danielk1977da107192007-05-04 08:32:13 +00003344 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
danielk1977da107192007-05-04 08:32:13 +00003345 offset -= ovflSize;
danielk1977d04417962007-05-02 13:16:30 +00003346 }else{
danielk19779f8d6402007-05-02 17:48:45 +00003347 /* Need to read this page properly. It contains some of the
3348 ** range of data that is being read (eOp==0) or written (eOp!=0).
danielk1977d04417962007-05-02 13:16:30 +00003349 */
3350 DbPage *pDbPage;
danielk1977cfe9a692004-06-16 12:00:29 +00003351 int a = amt;
danielk1977d04417962007-05-02 13:16:30 +00003352 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
danielk1977da107192007-05-04 08:32:13 +00003353 if( rc==SQLITE_OK ){
3354 aPayload = sqlite3PagerGetData(pDbPage);
3355 nextPage = get4byte(aPayload);
3356 if( a + offset > ovflSize ){
3357 a = ovflSize - offset;
danielk19779f8d6402007-05-02 17:48:45 +00003358 }
danielk1977da107192007-05-04 08:32:13 +00003359 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3360 sqlite3PagerUnref(pDbPage);
3361 offset = 0;
3362 amt -= a;
3363 pBuf += a;
danielk19779f8d6402007-05-02 17:48:45 +00003364 }
danielk1977cfe9a692004-06-16 12:00:29 +00003365 }
drh2af926b2001-05-15 00:39:25 +00003366 }
drh2af926b2001-05-15 00:39:25 +00003367 }
danielk1977cfe9a692004-06-16 12:00:29 +00003368
danielk1977da107192007-05-04 08:32:13 +00003369 if( rc==SQLITE_OK && amt>0 ){
drh49285702005-09-17 15:20:26 +00003370 return SQLITE_CORRUPT_BKPT;
drha7fcb052001-12-14 15:09:55 +00003371 }
danielk1977da107192007-05-04 08:32:13 +00003372 return rc;
drh2af926b2001-05-15 00:39:25 +00003373}
3374
drh72f82862001-05-24 21:06:34 +00003375/*
drh3aac2dd2004-04-26 14:10:20 +00003376** Read part of the key associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003377** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003378** begins at "offset".
drh8c1238a2003-01-02 14:43:55 +00003379**
drh3aac2dd2004-04-26 14:10:20 +00003380** Return SQLITE_OK on success or an error code if anything goes
3381** wrong. An error is returned if "offset+amt" is larger than
3382** the available payload.
drh72f82862001-05-24 21:06:34 +00003383*/
drha34b6762004-05-07 13:30:42 +00003384int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003385 int rc;
3386
drh1fee73e2007-08-29 04:00:57 +00003387 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003388 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003389 if( rc==SQLITE_OK ){
3390 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003391 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3392 if( pCur->apPage[0]->intKey ){
danielk1977da184232006-01-05 11:34:32 +00003393 return SQLITE_CORRUPT_BKPT;
3394 }
danielk197771d5d2c2008-09-29 11:49:47 +00003395 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003396 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
drh6575a222005-03-10 17:06:34 +00003397 }
danielk1977da184232006-01-05 11:34:32 +00003398 return rc;
drh3aac2dd2004-04-26 14:10:20 +00003399}
3400
3401/*
drh3aac2dd2004-04-26 14:10:20 +00003402** Read part of the data associated with cursor pCur. Exactly
drha34b6762004-05-07 13:30:42 +00003403** "amt" bytes will be transfered into pBuf[]. The transfer
drh3aac2dd2004-04-26 14:10:20 +00003404** begins at "offset".
3405**
3406** Return SQLITE_OK on success or an error code if anything goes
3407** wrong. An error is returned if "offset+amt" is larger than
3408** the available payload.
drh72f82862001-05-24 21:06:34 +00003409*/
drh3aac2dd2004-04-26 14:10:20 +00003410int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
drhd677b3d2007-08-20 22:48:41 +00003411 int rc;
3412
danielk19773588ceb2008-06-10 17:30:26 +00003413#ifndef SQLITE_OMIT_INCRBLOB
3414 if ( pCur->eState==CURSOR_INVALID ){
3415 return SQLITE_ABORT;
3416 }
3417#endif
3418
drh1fee73e2007-08-29 04:00:57 +00003419 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003420 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003421 if( rc==SQLITE_OK ){
3422 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003423 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3424 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh16a9b832007-05-05 18:39:25 +00003425 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
danielk1977da184232006-01-05 11:34:32 +00003426 }
3427 return rc;
drh2af926b2001-05-15 00:39:25 +00003428}
3429
drh72f82862001-05-24 21:06:34 +00003430/*
drh0e1c19e2004-05-11 00:58:56 +00003431** Return a pointer to payload information from the entry that the
3432** pCur cursor is pointing to. The pointer is to the beginning of
3433** the key if skipKey==0 and it points to the beginning of data if
drhe51c44f2004-05-30 20:46:09 +00003434** skipKey==1. The number of bytes of available key/data is written
3435** into *pAmt. If *pAmt==0, then the value returned will not be
3436** a valid pointer.
drh0e1c19e2004-05-11 00:58:56 +00003437**
3438** This routine is an optimization. It is common for the entire key
3439** and data to fit on the local page and for there to be no overflow
3440** pages. When that is so, this routine can be used to access the
3441** key and data without making a copy. If the key and/or data spills
drh16a9b832007-05-05 18:39:25 +00003442** onto overflow pages, then accessPayload() must be used to reassembly
drh0e1c19e2004-05-11 00:58:56 +00003443** the key/data and copy it into a preallocated buffer.
3444**
3445** The pointer returned by this routine looks directly into the cached
3446** page of the database. The data might change or move the next time
3447** any btree routine is called.
3448*/
3449static const unsigned char *fetchPayload(
3450 BtCursor *pCur, /* Cursor pointing to entry to read from */
drhe51c44f2004-05-30 20:46:09 +00003451 int *pAmt, /* Write the number of available bytes here */
drh0e1c19e2004-05-11 00:58:56 +00003452 int skipKey /* read beginning at data if this is true */
3453){
3454 unsigned char *aPayload;
3455 MemPage *pPage;
drhfa1a98a2004-05-14 19:08:17 +00003456 u32 nKey;
danielk197789d40042008-11-17 14:20:56 +00003457 u32 nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003458
danielk197771d5d2c2008-09-29 11:49:47 +00003459 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
danielk1977da184232006-01-05 11:34:32 +00003460 assert( pCur->eState==CURSOR_VALID );
drh1fee73e2007-08-29 04:00:57 +00003461 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00003462 pPage = pCur->apPage[pCur->iPage];
3463 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
drh86057612007-06-26 01:04:48 +00003464 getCellInfo(pCur);
drh43605152004-05-29 21:46:49 +00003465 aPayload = pCur->info.pCell;
drhfa1a98a2004-05-14 19:08:17 +00003466 aPayload += pCur->info.nHeader;
drh0e1c19e2004-05-11 00:58:56 +00003467 if( pPage->intKey ){
drhfa1a98a2004-05-14 19:08:17 +00003468 nKey = 0;
3469 }else{
drhf49661a2008-12-10 16:45:50 +00003470 nKey = (int)pCur->info.nKey;
drh0e1c19e2004-05-11 00:58:56 +00003471 }
drh0e1c19e2004-05-11 00:58:56 +00003472 if( skipKey ){
drhfa1a98a2004-05-14 19:08:17 +00003473 aPayload += nKey;
3474 nLocal = pCur->info.nLocal - nKey;
drh0e1c19e2004-05-11 00:58:56 +00003475 }else{
drhfa1a98a2004-05-14 19:08:17 +00003476 nLocal = pCur->info.nLocal;
drhe51c44f2004-05-30 20:46:09 +00003477 if( nLocal>nKey ){
3478 nLocal = nKey;
3479 }
drh0e1c19e2004-05-11 00:58:56 +00003480 }
drhe51c44f2004-05-30 20:46:09 +00003481 *pAmt = nLocal;
drh0e1c19e2004-05-11 00:58:56 +00003482 return aPayload;
3483}
3484
3485
3486/*
drhe51c44f2004-05-30 20:46:09 +00003487** For the entry that cursor pCur is point to, return as
3488** many bytes of the key or data as are available on the local
3489** b-tree page. Write the number of available bytes into *pAmt.
drh0e1c19e2004-05-11 00:58:56 +00003490**
3491** The pointer returned is ephemeral. The key/data may move
drhd677b3d2007-08-20 22:48:41 +00003492** or be destroyed on the next call to any Btree routine,
3493** including calls from other threads against the same cache.
3494** Hence, a mutex on the BtShared should be held prior to calling
3495** this routine.
drh0e1c19e2004-05-11 00:58:56 +00003496**
3497** These routines is used to get quick access to key and data
3498** in the common case where no overflow pages are used.
drh0e1c19e2004-05-11 00:58:56 +00003499*/
drhe51c44f2004-05-30 20:46:09 +00003500const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003501 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003502 if( pCur->eState==CURSOR_VALID ){
3503 return (const void*)fetchPayload(pCur, pAmt, 0);
3504 }
3505 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003506}
drhe51c44f2004-05-30 20:46:09 +00003507const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
drh1fee73e2007-08-29 04:00:57 +00003508 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003509 if( pCur->eState==CURSOR_VALID ){
3510 return (const void*)fetchPayload(pCur, pAmt, 1);
3511 }
3512 return 0;
drh0e1c19e2004-05-11 00:58:56 +00003513}
3514
3515
3516/*
drh8178a752003-01-05 21:41:40 +00003517** Move the cursor down to a new child page. The newPgno argument is the
drhab01f612004-05-22 02:55:23 +00003518** page number of the child page to move to.
drh72f82862001-05-24 21:06:34 +00003519*/
drh3aac2dd2004-04-26 14:10:20 +00003520static int moveToChild(BtCursor *pCur, u32 newPgno){
drh72f82862001-05-24 21:06:34 +00003521 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003522 int i = pCur->iPage;
drh72f82862001-05-24 21:06:34 +00003523 MemPage *pNewPage;
drhd0679ed2007-08-28 22:24:34 +00003524 BtShared *pBt = pCur->pBt;
drh72f82862001-05-24 21:06:34 +00003525
drh1fee73e2007-08-29 04:00:57 +00003526 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003527 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003528 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3529 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3530 return SQLITE_CORRUPT_BKPT;
3531 }
3532 rc = getAndInitPage(pBt, newPgno, &pNewPage);
drh6019e162001-07-02 17:51:45 +00003533 if( rc ) return rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003534 pCur->apPage[i+1] = pNewPage;
3535 pCur->aiIdx[i+1] = 0;
3536 pCur->iPage++;
3537
drh271efa52004-05-30 19:19:05 +00003538 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003539 pCur->validNKey = 0;
drh4be295b2003-12-16 03:44:47 +00003540 if( pNewPage->nCell<1 ){
drh49285702005-09-17 15:20:26 +00003541 return SQLITE_CORRUPT_BKPT;
drh4be295b2003-12-16 03:44:47 +00003542 }
drh72f82862001-05-24 21:06:34 +00003543 return SQLITE_OK;
3544}
3545
danielk1977bf93c562008-09-29 15:53:25 +00003546#ifndef NDEBUG
3547/*
3548** Page pParent is an internal (non-leaf) tree page. This function
3549** asserts that page number iChild is the left-child if the iIdx'th
3550** cell in page pParent. Or, if iIdx is equal to the total number of
3551** cells in pParent, that page number iChild is the right-child of
3552** the page.
3553*/
3554static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3555 assert( iIdx<=pParent->nCell );
3556 if( iIdx==pParent->nCell ){
3557 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3558 }else{
3559 assert( get4byte(findCell(pParent, iIdx))==iChild );
3560 }
3561}
3562#else
3563# define assertParentIndex(x,y,z)
3564#endif
3565
drh72f82862001-05-24 21:06:34 +00003566/*
drh5e2f8b92001-05-28 00:41:15 +00003567** Move the cursor up to the parent page.
3568**
3569** pCur->idx is set to the cell index that contains the pointer
3570** to the page we are coming from. If we are coming from the
3571** right-most child page then pCur->idx is set to one more than
drhbd03cae2001-06-02 02:40:57 +00003572** the largest cell index.
drh72f82862001-05-24 21:06:34 +00003573*/
drh16a9b832007-05-05 18:39:25 +00003574void sqlite3BtreeMoveToParent(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00003575 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003576 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003577 assert( pCur->iPage>0 );
3578 assert( pCur->apPage[pCur->iPage] );
danielk1977bf93c562008-09-29 15:53:25 +00003579 assertParentIndex(
3580 pCur->apPage[pCur->iPage-1],
3581 pCur->aiIdx[pCur->iPage-1],
3582 pCur->apPage[pCur->iPage]->pgno
3583 );
danielk197771d5d2c2008-09-29 11:49:47 +00003584 releasePage(pCur->apPage[pCur->iPage]);
3585 pCur->iPage--;
drh271efa52004-05-30 19:19:05 +00003586 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003587 pCur->validNKey = 0;
drh72f82862001-05-24 21:06:34 +00003588}
3589
3590/*
3591** Move the cursor to the root page
3592*/
drh5e2f8b92001-05-28 00:41:15 +00003593static int moveToRoot(BtCursor *pCur){
drh3aac2dd2004-04-26 14:10:20 +00003594 MemPage *pRoot;
drh777e4c42006-01-13 04:31:58 +00003595 int rc = SQLITE_OK;
drhd677b3d2007-08-20 22:48:41 +00003596 Btree *p = pCur->pBtree;
3597 BtShared *pBt = p->pBt;
drhbd03cae2001-06-02 02:40:57 +00003598
drh1fee73e2007-08-29 04:00:57 +00003599 assert( cursorHoldsMutex(pCur) );
drhfb982642007-08-30 01:19:59 +00003600 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3601 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3602 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3603 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3604 if( pCur->eState==CURSOR_FAULT ){
3605 return pCur->skip;
3606 }
danielk1977be51a652008-10-08 17:58:48 +00003607 sqlite3BtreeClearCursor(pCur);
drhbf700f32007-03-31 02:36:44 +00003608 }
danielk197771d5d2c2008-09-29 11:49:47 +00003609
3610 if( pCur->iPage>=0 ){
3611 int i;
3612 for(i=1; i<=pCur->iPage; i++){
3613 releasePage(pCur->apPage[i]);
danielk1977d9f6c532008-09-19 16:39:38 +00003614 }
drh777e4c42006-01-13 04:31:58 +00003615 }else{
3616 if(
danielk197771d5d2c2008-09-29 11:49:47 +00003617 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
drh777e4c42006-01-13 04:31:58 +00003618 ){
3619 pCur->eState = CURSOR_INVALID;
3620 return rc;
3621 }
drhc39e0002004-05-07 23:50:57 +00003622 }
danielk197771d5d2c2008-09-29 11:49:47 +00003623
3624 pRoot = pCur->apPage[0];
3625 assert( pRoot->pgno==pCur->pgnoRoot );
3626 pCur->iPage = 0;
3627 pCur->aiIdx[0] = 0;
drh271efa52004-05-30 19:19:05 +00003628 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003629 pCur->atLast = 0;
3630 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00003631
drh8856d6a2004-04-29 14:42:46 +00003632 if( pRoot->nCell==0 && !pRoot->leaf ){
3633 Pgno subpage;
3634 assert( pRoot->pgno==1 );
drh43605152004-05-29 21:46:49 +00003635 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
drh8856d6a2004-04-29 14:42:46 +00003636 assert( subpage>0 );
danielk1977da184232006-01-05 11:34:32 +00003637 pCur->eState = CURSOR_VALID;
drh4b70f112004-05-02 21:12:19 +00003638 rc = moveToChild(pCur, subpage);
danielk197771d5d2c2008-09-29 11:49:47 +00003639 }else{
3640 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
drh8856d6a2004-04-29 14:42:46 +00003641 }
3642 return rc;
drh72f82862001-05-24 21:06:34 +00003643}
drh2af926b2001-05-15 00:39:25 +00003644
drh5e2f8b92001-05-28 00:41:15 +00003645/*
3646** Move the cursor down to the left-most leaf entry beneath the
3647** entry to which it is currently pointing.
drh777e4c42006-01-13 04:31:58 +00003648**
3649** The left-most leaf is the one with the smallest key - the first
3650** in ascending order.
drh5e2f8b92001-05-28 00:41:15 +00003651*/
3652static int moveToLeftmost(BtCursor *pCur){
3653 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003654 int rc = SQLITE_OK;
drh3aac2dd2004-04-26 14:10:20 +00003655 MemPage *pPage;
drh5e2f8b92001-05-28 00:41:15 +00003656
drh1fee73e2007-08-29 04:00:57 +00003657 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003658 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003659 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3660 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3661 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
drh8178a752003-01-05 21:41:40 +00003662 rc = moveToChild(pCur, pgno);
drh5e2f8b92001-05-28 00:41:15 +00003663 }
drhd677b3d2007-08-20 22:48:41 +00003664 return rc;
drh5e2f8b92001-05-28 00:41:15 +00003665}
3666
drh2dcc9aa2002-12-04 13:40:25 +00003667/*
3668** Move the cursor down to the right-most leaf entry beneath the
3669** page to which it is currently pointing. Notice the difference
3670** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3671** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3672** finds the right-most entry beneath the *page*.
drh777e4c42006-01-13 04:31:58 +00003673**
3674** The right-most entry is the one with the largest key - the last
3675** key in ascending order.
drh2dcc9aa2002-12-04 13:40:25 +00003676*/
3677static int moveToRightmost(BtCursor *pCur){
3678 Pgno pgno;
drhd677b3d2007-08-20 22:48:41 +00003679 int rc = SQLITE_OK;
drh1bd10f82008-12-10 21:19:56 +00003680 MemPage *pPage = 0;
drh2dcc9aa2002-12-04 13:40:25 +00003681
drh1fee73e2007-08-29 04:00:57 +00003682 assert( cursorHoldsMutex(pCur) );
danielk1977da184232006-01-05 11:34:32 +00003683 assert( pCur->eState==CURSOR_VALID );
danielk197771d5d2c2008-09-29 11:49:47 +00003684 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
drh43605152004-05-29 21:46:49 +00003685 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
danielk197771d5d2c2008-09-29 11:49:47 +00003686 pCur->aiIdx[pCur->iPage] = pPage->nCell;
drh8178a752003-01-05 21:41:40 +00003687 rc = moveToChild(pCur, pgno);
drh2dcc9aa2002-12-04 13:40:25 +00003688 }
drhd677b3d2007-08-20 22:48:41 +00003689 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00003690 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
drhd677b3d2007-08-20 22:48:41 +00003691 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003692 pCur->validNKey = 0;
drhd677b3d2007-08-20 22:48:41 +00003693 }
danielk1977518002e2008-09-05 05:02:46 +00003694 return rc;
drh2dcc9aa2002-12-04 13:40:25 +00003695}
3696
drh5e00f6c2001-09-13 13:46:56 +00003697/* Move the cursor to the first entry in the table. Return SQLITE_OK
3698** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003699** or set *pRes to 1 if the table is empty.
drh5e00f6c2001-09-13 13:46:56 +00003700*/
drh3aac2dd2004-04-26 14:10:20 +00003701int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
drh5e00f6c2001-09-13 13:46:56 +00003702 int rc;
drhd677b3d2007-08-20 22:48:41 +00003703
drh1fee73e2007-08-29 04:00:57 +00003704 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003705 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh5e00f6c2001-09-13 13:46:56 +00003706 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003707 if( rc==SQLITE_OK ){
3708 if( pCur->eState==CURSOR_INVALID ){
danielk197771d5d2c2008-09-29 11:49:47 +00003709 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003710 *pRes = 1;
3711 rc = SQLITE_OK;
3712 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003713 assert( pCur->apPage[pCur->iPage]->nCell>0 );
drhd677b3d2007-08-20 22:48:41 +00003714 *pRes = 0;
3715 rc = moveToLeftmost(pCur);
3716 }
drh5e00f6c2001-09-13 13:46:56 +00003717 }
drh5e00f6c2001-09-13 13:46:56 +00003718 return rc;
3719}
drh5e2f8b92001-05-28 00:41:15 +00003720
drh9562b552002-02-19 15:00:07 +00003721/* Move the cursor to the last entry in the table. Return SQLITE_OK
3722** on success. Set *pRes to 0 if the cursor actually points to something
drh77c679c2002-02-19 22:43:58 +00003723** or set *pRes to 1 if the table is empty.
drh9562b552002-02-19 15:00:07 +00003724*/
drh3aac2dd2004-04-26 14:10:20 +00003725int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
drh9562b552002-02-19 15:00:07 +00003726 int rc;
drhd677b3d2007-08-20 22:48:41 +00003727
drh1fee73e2007-08-29 04:00:57 +00003728 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003729 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drh9562b552002-02-19 15:00:07 +00003730 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003731 if( rc==SQLITE_OK ){
3732 if( CURSOR_INVALID==pCur->eState ){
danielk197771d5d2c2008-09-29 11:49:47 +00003733 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhd677b3d2007-08-20 22:48:41 +00003734 *pRes = 1;
3735 }else{
3736 assert( pCur->eState==CURSOR_VALID );
3737 *pRes = 0;
3738 rc = moveToRightmost(pCur);
drha2c20e42008-03-29 16:01:04 +00003739 getCellInfo(pCur);
drhf49661a2008-12-10 16:45:50 +00003740 pCur->atLast = rc==SQLITE_OK ?1:0;
drhd677b3d2007-08-20 22:48:41 +00003741 }
drh9562b552002-02-19 15:00:07 +00003742 }
drh9562b552002-02-19 15:00:07 +00003743 return rc;
3744}
3745
drhe14006d2008-03-25 17:23:32 +00003746/* Move the cursor so that it points to an entry near the key
drhe63d9992008-08-13 19:11:48 +00003747** specified by pIdxKey or intKey. Return a success code.
drh72f82862001-05-24 21:06:34 +00003748**
drhe63d9992008-08-13 19:11:48 +00003749** For INTKEY tables, the intKey parameter is used. pIdxKey
3750** must be NULL. For index tables, pIdxKey is used and intKey
3751** is ignored.
drh3aac2dd2004-04-26 14:10:20 +00003752**
drh5e2f8b92001-05-28 00:41:15 +00003753** If an exact match is not found, then the cursor is always
drhbd03cae2001-06-02 02:40:57 +00003754** left pointing at a leaf page which would hold the entry if it
drh5e2f8b92001-05-28 00:41:15 +00003755** were present. The cursor might point to an entry that comes
3756** before or after the key.
3757**
drhbd03cae2001-06-02 02:40:57 +00003758** The result of comparing the key with the entry to which the
drhab01f612004-05-22 02:55:23 +00003759** cursor is written to *pRes if pRes!=NULL. The meaning of
drhbd03cae2001-06-02 02:40:57 +00003760** this value is as follows:
3761**
3762** *pRes<0 The cursor is left pointing at an entry that
drh1a844c32002-12-04 22:29:28 +00003763** is smaller than pKey or if the table is empty
3764** and the cursor is therefore left point to nothing.
drhbd03cae2001-06-02 02:40:57 +00003765**
3766** *pRes==0 The cursor is left pointing at an entry that
3767** exactly matches pKey.
3768**
3769** *pRes>0 The cursor is left pointing at an entry that
drh7c717f72001-06-24 20:39:41 +00003770** is larger than pKey.
drhd677b3d2007-08-20 22:48:41 +00003771**
drha059ad02001-04-17 20:09:11 +00003772*/
drhe63d9992008-08-13 19:11:48 +00003773int sqlite3BtreeMovetoUnpacked(
3774 BtCursor *pCur, /* The cursor to be moved */
3775 UnpackedRecord *pIdxKey, /* Unpacked index key */
3776 i64 intKey, /* The table key */
3777 int biasRight, /* If true, bias the search to the high end */
3778 int *pRes /* Write search results here */
drhe4d90812007-03-29 05:51:49 +00003779){
drh72f82862001-05-24 21:06:34 +00003780 int rc;
drhd677b3d2007-08-20 22:48:41 +00003781
drh1fee73e2007-08-29 04:00:57 +00003782 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00003783 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
drha2c20e42008-03-29 16:01:04 +00003784
3785 /* If the cursor is already positioned at the point we are trying
3786 ** to move to, then just return without doing any work */
danielk197771d5d2c2008-09-29 11:49:47 +00003787 if( pCur->eState==CURSOR_VALID && pCur->validNKey
3788 && pCur->apPage[0]->intKey
3789 ){
drhe63d9992008-08-13 19:11:48 +00003790 if( pCur->info.nKey==intKey ){
drha2c20e42008-03-29 16:01:04 +00003791 *pRes = 0;
3792 return SQLITE_OK;
3793 }
drhe63d9992008-08-13 19:11:48 +00003794 if( pCur->atLast && pCur->info.nKey<intKey ){
drha2c20e42008-03-29 16:01:04 +00003795 *pRes = -1;
3796 return SQLITE_OK;
3797 }
3798 }
3799
drh5e2f8b92001-05-28 00:41:15 +00003800 rc = moveToRoot(pCur);
drhd677b3d2007-08-20 22:48:41 +00003801 if( rc ){
3802 return rc;
3803 }
danielk197771d5d2c2008-09-29 11:49:47 +00003804 assert( pCur->apPage[pCur->iPage] );
3805 assert( pCur->apPage[pCur->iPage]->isInit );
danielk1977da184232006-01-05 11:34:32 +00003806 if( pCur->eState==CURSOR_INVALID ){
drhf328bc82004-05-10 23:29:49 +00003807 *pRes = -1;
danielk197771d5d2c2008-09-29 11:49:47 +00003808 assert( pCur->apPage[pCur->iPage]->nCell==0 );
drhc39e0002004-05-07 23:50:57 +00003809 return SQLITE_OK;
3810 }
danielk197771d5d2c2008-09-29 11:49:47 +00003811 assert( pCur->apPage[0]->intKey || pIdxKey );
drh14684382006-11-30 13:05:29 +00003812 for(;;){
drh72f82862001-05-24 21:06:34 +00003813 int lwr, upr;
3814 Pgno chldPg;
danielk197771d5d2c2008-09-29 11:49:47 +00003815 MemPage *pPage = pCur->apPage[pCur->iPage];
drh1a844c32002-12-04 22:29:28 +00003816 int c = -1; /* pRes return if table is empty must be -1 */
drh72f82862001-05-24 21:06:34 +00003817 lwr = 0;
3818 upr = pPage->nCell-1;
drhe63d9992008-08-13 19:11:48 +00003819 if( !pPage->intKey && pIdxKey==0 ){
drh1e968a02008-03-25 00:22:21 +00003820 rc = SQLITE_CORRUPT_BKPT;
3821 goto moveto_finish;
drh4eec4c12005-01-21 00:22:37 +00003822 }
drhe4d90812007-03-29 05:51:49 +00003823 if( biasRight ){
drhf49661a2008-12-10 16:45:50 +00003824 pCur->aiIdx[pCur->iPage] = (u16)upr;
drhe4d90812007-03-29 05:51:49 +00003825 }else{
drhf49661a2008-12-10 16:45:50 +00003826 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
drhe4d90812007-03-29 05:51:49 +00003827 }
drhf1d68b32007-03-29 04:43:26 +00003828 if( lwr<=upr ) for(;;){
danielk197713adf8a2004-06-03 16:08:41 +00003829 void *pCellKey;
drh4a1c3802004-05-12 15:15:47 +00003830 i64 nCellKey;
danielk197771d5d2c2008-09-29 11:49:47 +00003831 int idx = pCur->aiIdx[pCur->iPage];
drh366fda62006-01-13 02:35:09 +00003832 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003833 pCur->validNKey = 1;
drh3aac2dd2004-04-26 14:10:20 +00003834 if( pPage->intKey ){
drh777e4c42006-01-13 04:31:58 +00003835 u8 *pCell;
danielk197771d5d2c2008-09-29 11:49:47 +00003836 pCell = findCell(pPage, idx) + pPage->childPtrSize;
drhd172f862006-01-12 15:01:15 +00003837 if( pPage->hasData ){
danielk1977bab45c62006-01-16 15:14:27 +00003838 u32 dummy;
shane3f8d5cf2008-04-24 19:15:09 +00003839 pCell += getVarint32(pCell, dummy);
drhd172f862006-01-12 15:01:15 +00003840 }
drha2c20e42008-03-29 16:01:04 +00003841 getVarint(pCell, (u64*)&nCellKey);
drhe63d9992008-08-13 19:11:48 +00003842 if( nCellKey==intKey ){
drh3aac2dd2004-04-26 14:10:20 +00003843 c = 0;
drhe63d9992008-08-13 19:11:48 +00003844 }else if( nCellKey<intKey ){
drh41eb9e92008-04-02 18:33:07 +00003845 c = -1;
3846 }else{
drhe63d9992008-08-13 19:11:48 +00003847 assert( nCellKey>intKey );
drh41eb9e92008-04-02 18:33:07 +00003848 c = +1;
drh3aac2dd2004-04-26 14:10:20 +00003849 }
drh3aac2dd2004-04-26 14:10:20 +00003850 }else{
drhe51c44f2004-05-30 20:46:09 +00003851 int available;
danielk197713adf8a2004-06-03 16:08:41 +00003852 pCellKey = (void *)fetchPayload(pCur, &available, 0);
drh366fda62006-01-13 02:35:09 +00003853 nCellKey = pCur->info.nKey;
drhe51c44f2004-05-30 20:46:09 +00003854 if( available>=nCellKey ){
drhf49661a2008-12-10 16:45:50 +00003855 c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey);
drhe51c44f2004-05-30 20:46:09 +00003856 }else{
drhf49661a2008-12-10 16:45:50 +00003857 pCellKey = sqlite3Malloc( (int)nCellKey );
danielk19776507ecb2008-03-25 09:56:44 +00003858 if( pCellKey==0 ){
3859 rc = SQLITE_NOMEM;
3860 goto moveto_finish;
3861 }
drhf49661a2008-12-10 16:45:50 +00003862 rc = sqlite3BtreeKey(pCur, 0, (int)nCellKey, (void*)pCellKey);
drh1bd10f82008-12-10 21:19:56 +00003863 c = sqlite3VdbeRecordCompare((int)nCellKey, pCellKey, pIdxKey);
drhfacf0302008-06-17 15:12:00 +00003864 sqlite3_free(pCellKey);
drh1e968a02008-03-25 00:22:21 +00003865 if( rc ) goto moveto_finish;
drhe51c44f2004-05-30 20:46:09 +00003866 }
drh3aac2dd2004-04-26 14:10:20 +00003867 }
drh72f82862001-05-24 21:06:34 +00003868 if( c==0 ){
drha2c20e42008-03-29 16:01:04 +00003869 pCur->info.nKey = nCellKey;
drh44845222008-07-17 18:39:57 +00003870 if( pPage->intKey && !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00003871 lwr = idx;
drhfc70e6f2004-05-12 21:11:27 +00003872 upr = lwr - 1;
drh8b18dd42004-05-12 19:18:15 +00003873 break;
3874 }else{
drh8b18dd42004-05-12 19:18:15 +00003875 if( pRes ) *pRes = 0;
drh1e968a02008-03-25 00:22:21 +00003876 rc = SQLITE_OK;
3877 goto moveto_finish;
drh8b18dd42004-05-12 19:18:15 +00003878 }
drh72f82862001-05-24 21:06:34 +00003879 }
3880 if( c<0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003881 lwr = idx+1;
drh72f82862001-05-24 21:06:34 +00003882 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00003883 upr = idx-1;
drh72f82862001-05-24 21:06:34 +00003884 }
drhf1d68b32007-03-29 04:43:26 +00003885 if( lwr>upr ){
drha2c20e42008-03-29 16:01:04 +00003886 pCur->info.nKey = nCellKey;
drhf1d68b32007-03-29 04:43:26 +00003887 break;
3888 }
drhf49661a2008-12-10 16:45:50 +00003889 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
drh72f82862001-05-24 21:06:34 +00003890 }
3891 assert( lwr==upr+1 );
danielk197771d5d2c2008-09-29 11:49:47 +00003892 assert( pPage->isInit );
drh3aac2dd2004-04-26 14:10:20 +00003893 if( pPage->leaf ){
drha34b6762004-05-07 13:30:42 +00003894 chldPg = 0;
drh3aac2dd2004-04-26 14:10:20 +00003895 }else if( lwr>=pPage->nCell ){
drh43605152004-05-29 21:46:49 +00003896 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh72f82862001-05-24 21:06:34 +00003897 }else{
danielk19771cc5ed82007-05-16 17:28:43 +00003898 chldPg = get4byte(findCell(pPage, lwr));
drh72f82862001-05-24 21:06:34 +00003899 }
3900 if( chldPg==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00003901 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
drh72f82862001-05-24 21:06:34 +00003902 if( pRes ) *pRes = c;
drh1e968a02008-03-25 00:22:21 +00003903 rc = SQLITE_OK;
3904 goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003905 }
drhf49661a2008-12-10 16:45:50 +00003906 pCur->aiIdx[pCur->iPage] = (u16)lwr;
drh271efa52004-05-30 19:19:05 +00003907 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00003908 pCur->validNKey = 0;
drh8178a752003-01-05 21:41:40 +00003909 rc = moveToChild(pCur, chldPg);
drh1e968a02008-03-25 00:22:21 +00003910 if( rc ) goto moveto_finish;
drh72f82862001-05-24 21:06:34 +00003911 }
drh1e968a02008-03-25 00:22:21 +00003912moveto_finish:
drhe63d9992008-08-13 19:11:48 +00003913 return rc;
3914}
3915
3916/*
3917** In this version of BtreeMoveto, pKey is a packed index record
3918** such as is generated by the OP_MakeRecord opcode. Unpack the
3919** record and then call BtreeMovetoUnpacked() to do the work.
3920*/
3921int sqlite3BtreeMoveto(
3922 BtCursor *pCur, /* Cursor open on the btree to be searched */
3923 const void *pKey, /* Packed key if the btree is an index */
3924 i64 nKey, /* Integer key for tables. Size of pKey for indices */
3925 int bias, /* Bias search to the high end */
3926 int *pRes /* Write search results here */
3927){
3928 int rc; /* Status code */
3929 UnpackedRecord *pIdxKey; /* Unpacked index key */
drh23f79d02008-08-20 22:06:47 +00003930 UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
drhe63d9992008-08-13 19:11:48 +00003931
drhe14006d2008-03-25 17:23:32 +00003932 if( pKey ){
drhf49661a2008-12-10 16:45:50 +00003933 assert( nKey==(i64)(int)nKey );
3934 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
drh23f79d02008-08-20 22:06:47 +00003935 aSpace, sizeof(aSpace));
drhe63d9992008-08-13 19:11:48 +00003936 if( pIdxKey==0 ) return SQLITE_NOMEM;
3937 }else{
3938 pIdxKey = 0;
3939 }
3940 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
3941 if( pKey ){
3942 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
drhe14006d2008-03-25 17:23:32 +00003943 }
drh1e968a02008-03-25 00:22:21 +00003944 return rc;
drh72f82862001-05-24 21:06:34 +00003945}
3946
drhd677b3d2007-08-20 22:48:41 +00003947
drh72f82862001-05-24 21:06:34 +00003948/*
drhc39e0002004-05-07 23:50:57 +00003949** Return TRUE if the cursor is not pointing at an entry of the table.
3950**
3951** TRUE will be returned after a call to sqlite3BtreeNext() moves
3952** past the last entry in the table or sqlite3BtreePrev() moves past
3953** the first entry. TRUE is also returned if the table is empty.
3954*/
3955int sqlite3BtreeEof(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00003956 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3957 ** have been deleted? This API will need to change to return an error code
3958 ** as well as the boolean result value.
3959 */
3960 return (CURSOR_VALID!=pCur->eState);
drhc39e0002004-05-07 23:50:57 +00003961}
3962
3963/*
drhb21c8cd2007-08-21 19:33:56 +00003964** Return the database connection handle for a cursor.
3965*/
3966sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
drhe5fe6902007-12-07 18:55:28 +00003967 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3968 return pCur->pBtree->db;
drhb21c8cd2007-08-21 19:33:56 +00003969}
3970
3971/*
drhbd03cae2001-06-02 02:40:57 +00003972** Advance the cursor to the next entry in the database. If
drh8c1238a2003-01-02 14:43:55 +00003973** successful then set *pRes=0. If the cursor
drhbd03cae2001-06-02 02:40:57 +00003974** was already pointing to the last entry in the database before
drh8c1238a2003-01-02 14:43:55 +00003975** this routine was called, then set *pRes=1.
drh72f82862001-05-24 21:06:34 +00003976*/
drhd094db12008-04-03 21:46:57 +00003977int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
drh72f82862001-05-24 21:06:34 +00003978 int rc;
danielk197771d5d2c2008-09-29 11:49:47 +00003979 int idx;
danielk197797a227c2006-01-20 16:32:04 +00003980 MemPage *pPage;
drh8b18dd42004-05-12 19:18:15 +00003981
drh1fee73e2007-08-29 04:00:57 +00003982 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00003983 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00003984 if( rc!=SQLITE_OK ){
3985 return rc;
3986 }
drh8c4d3a62007-04-06 01:03:32 +00003987 assert( pRes!=0 );
drh8c4d3a62007-04-06 01:03:32 +00003988 if( CURSOR_INVALID==pCur->eState ){
3989 *pRes = 1;
3990 return SQLITE_OK;
3991 }
danielk1977da184232006-01-05 11:34:32 +00003992 if( pCur->skip>0 ){
3993 pCur->skip = 0;
3994 *pRes = 0;
3995 return SQLITE_OK;
3996 }
3997 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00003998
danielk197771d5d2c2008-09-29 11:49:47 +00003999 pPage = pCur->apPage[pCur->iPage];
4000 idx = ++pCur->aiIdx[pCur->iPage];
4001 assert( pPage->isInit );
4002 assert( idx<=pPage->nCell );
danielk19776a43f9b2004-11-16 04:57:24 +00004003
drh271efa52004-05-30 19:19:05 +00004004 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004005 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004006 if( idx>=pPage->nCell ){
drha34b6762004-05-07 13:30:42 +00004007 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00004008 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
drh5e2f8b92001-05-28 00:41:15 +00004009 if( rc ) return rc;
4010 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00004011 *pRes = 0;
4012 return rc;
drh72f82862001-05-24 21:06:34 +00004013 }
drh5e2f8b92001-05-28 00:41:15 +00004014 do{
danielk197771d5d2c2008-09-29 11:49:47 +00004015 if( pCur->iPage==0 ){
drh8c1238a2003-01-02 14:43:55 +00004016 *pRes = 1;
danielk1977da184232006-01-05 11:34:32 +00004017 pCur->eState = CURSOR_INVALID;
drh5e2f8b92001-05-28 00:41:15 +00004018 return SQLITE_OK;
4019 }
drh16a9b832007-05-05 18:39:25 +00004020 sqlite3BtreeMoveToParent(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00004021 pPage = pCur->apPage[pCur->iPage];
4022 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
drh8c1238a2003-01-02 14:43:55 +00004023 *pRes = 0;
drh44845222008-07-17 18:39:57 +00004024 if( pPage->intKey ){
drh8b18dd42004-05-12 19:18:15 +00004025 rc = sqlite3BtreeNext(pCur, pRes);
4026 }else{
4027 rc = SQLITE_OK;
4028 }
4029 return rc;
drh8178a752003-01-05 21:41:40 +00004030 }
4031 *pRes = 0;
drh3aac2dd2004-04-26 14:10:20 +00004032 if( pPage->leaf ){
drh8178a752003-01-05 21:41:40 +00004033 return SQLITE_OK;
drh72f82862001-05-24 21:06:34 +00004034 }
drh5e2f8b92001-05-28 00:41:15 +00004035 rc = moveToLeftmost(pCur);
drh8c1238a2003-01-02 14:43:55 +00004036 return rc;
drh72f82862001-05-24 21:06:34 +00004037}
drhd677b3d2007-08-20 22:48:41 +00004038
drh72f82862001-05-24 21:06:34 +00004039
drh3b7511c2001-05-26 13:15:44 +00004040/*
drh2dcc9aa2002-12-04 13:40:25 +00004041** Step the cursor to the back to the previous entry in the database. If
drh8178a752003-01-05 21:41:40 +00004042** successful then set *pRes=0. If the cursor
drh2dcc9aa2002-12-04 13:40:25 +00004043** was already pointing to the first entry in the database before
drh8178a752003-01-05 21:41:40 +00004044** this routine was called, then set *pRes=1.
drh2dcc9aa2002-12-04 13:40:25 +00004045*/
drhd094db12008-04-03 21:46:57 +00004046int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
drh2dcc9aa2002-12-04 13:40:25 +00004047 int rc;
drh8178a752003-01-05 21:41:40 +00004048 MemPage *pPage;
danielk1977da184232006-01-05 11:34:32 +00004049
drh1fee73e2007-08-29 04:00:57 +00004050 assert( cursorHoldsMutex(pCur) );
drha3460582008-07-11 21:02:53 +00004051 rc = restoreCursorPosition(pCur);
danielk1977da184232006-01-05 11:34:32 +00004052 if( rc!=SQLITE_OK ){
4053 return rc;
4054 }
drha2c20e42008-03-29 16:01:04 +00004055 pCur->atLast = 0;
drh8c4d3a62007-04-06 01:03:32 +00004056 if( CURSOR_INVALID==pCur->eState ){
4057 *pRes = 1;
4058 return SQLITE_OK;
4059 }
danielk1977da184232006-01-05 11:34:32 +00004060 if( pCur->skip<0 ){
4061 pCur->skip = 0;
4062 *pRes = 0;
4063 return SQLITE_OK;
4064 }
4065 pCur->skip = 0;
danielk1977da184232006-01-05 11:34:32 +00004066
danielk197771d5d2c2008-09-29 11:49:47 +00004067 pPage = pCur->apPage[pCur->iPage];
4068 assert( pPage->isInit );
drha34b6762004-05-07 13:30:42 +00004069 if( !pPage->leaf ){
danielk197771d5d2c2008-09-29 11:49:47 +00004070 int idx = pCur->aiIdx[pCur->iPage];
4071 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
drhd677b3d2007-08-20 22:48:41 +00004072 if( rc ){
4073 return rc;
4074 }
drh2dcc9aa2002-12-04 13:40:25 +00004075 rc = moveToRightmost(pCur);
4076 }else{
danielk197771d5d2c2008-09-29 11:49:47 +00004077 while( pCur->aiIdx[pCur->iPage]==0 ){
4078 if( pCur->iPage==0 ){
danielk1977da184232006-01-05 11:34:32 +00004079 pCur->eState = CURSOR_INVALID;
drhc39e0002004-05-07 23:50:57 +00004080 *pRes = 1;
drh2dcc9aa2002-12-04 13:40:25 +00004081 return SQLITE_OK;
4082 }
drh16a9b832007-05-05 18:39:25 +00004083 sqlite3BtreeMoveToParent(pCur);
drh2dcc9aa2002-12-04 13:40:25 +00004084 }
drh271efa52004-05-30 19:19:05 +00004085 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00004086 pCur->validNKey = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004087
4088 pCur->aiIdx[pCur->iPage]--;
4089 pPage = pCur->apPage[pCur->iPage];
drh44845222008-07-17 18:39:57 +00004090 if( pPage->intKey && !pPage->leaf ){
drh8b18dd42004-05-12 19:18:15 +00004091 rc = sqlite3BtreePrevious(pCur, pRes);
4092 }else{
4093 rc = SQLITE_OK;
4094 }
drh2dcc9aa2002-12-04 13:40:25 +00004095 }
drh8178a752003-01-05 21:41:40 +00004096 *pRes = 0;
drh2dcc9aa2002-12-04 13:40:25 +00004097 return rc;
4098}
4099
4100/*
drh3b7511c2001-05-26 13:15:44 +00004101** Allocate a new page from the database file.
4102**
danielk19773b8a05f2007-03-19 17:44:26 +00004103** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
drh3b7511c2001-05-26 13:15:44 +00004104** has already been called on the new page.) The new page has also
4105** been referenced and the calling routine is responsible for calling
danielk19773b8a05f2007-03-19 17:44:26 +00004106** sqlite3PagerUnref() on the new page when it is done.
drh3b7511c2001-05-26 13:15:44 +00004107**
4108** SQLITE_OK is returned on success. Any other return value indicates
4109** an error. *ppPage and *pPgno are undefined in the event of an error.
danielk19773b8a05f2007-03-19 17:44:26 +00004110** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
drhbea00b92002-07-08 10:59:50 +00004111**
drh199e3cf2002-07-18 11:01:47 +00004112** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4113** locate a page close to the page number "nearby". This can be used in an
drhbea00b92002-07-08 10:59:50 +00004114** attempt to keep related pages close to each other in the database file,
4115** which in turn can make database access faster.
danielk1977cb1a7eb2004-11-05 12:27:02 +00004116**
4117** If the "exact" parameter is not 0, and the page-number nearby exists
4118** anywhere on the free-list, then it is guarenteed to be returned. This
4119** is only used by auto-vacuum databases when allocating a new table.
drh3b7511c2001-05-26 13:15:44 +00004120*/
drh4f0c5872007-03-26 22:05:01 +00004121static int allocateBtreePage(
danielk1977aef0bf62005-12-30 16:28:01 +00004122 BtShared *pBt,
danielk1977cb1a7eb2004-11-05 12:27:02 +00004123 MemPage **ppPage,
4124 Pgno *pPgno,
4125 Pgno nearby,
4126 u8 exact
4127){
drh3aac2dd2004-04-26 14:10:20 +00004128 MemPage *pPage1;
drh8c42ca92001-06-22 19:15:00 +00004129 int rc;
drh3aac2dd2004-04-26 14:10:20 +00004130 int n; /* Number of pages on the freelist */
4131 int k; /* Number of leaves on the trunk of the freelist */
drhd3627af2006-12-18 18:34:51 +00004132 MemPage *pTrunk = 0;
4133 MemPage *pPrevTrunk = 0;
drh30e58752002-03-02 20:41:57 +00004134
drh1fee73e2007-08-29 04:00:57 +00004135 assert( sqlite3_mutex_held(pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004136 pPage1 = pBt->pPage1;
4137 n = get4byte(&pPage1->aData[36]);
4138 if( n>0 ){
drh91025292004-05-03 19:49:32 +00004139 /* There are pages on the freelist. Reuse one of those pages. */
danielk1977cb1a7eb2004-11-05 12:27:02 +00004140 Pgno iTrunk;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004141 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4142
4143 /* If the 'exact' parameter was true and a query of the pointer-map
4144 ** shows that the page 'nearby' is somewhere on the free-list, then
4145 ** the entire-list will be searched for that page.
4146 */
4147#ifndef SQLITE_OMIT_AUTOVACUUM
danielk197789d40042008-11-17 14:20:56 +00004148 if( exact && nearby<=pagerPagecount(pBt) ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004149 u8 eType;
4150 assert( nearby>0 );
4151 assert( pBt->autoVacuum );
4152 rc = ptrmapGet(pBt, nearby, &eType, 0);
4153 if( rc ) return rc;
4154 if( eType==PTRMAP_FREEPAGE ){
4155 searchList = 1;
4156 }
4157 *pPgno = nearby;
4158 }
4159#endif
4160
4161 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4162 ** first free-list trunk page. iPrevTrunk is initially 1.
4163 */
danielk19773b8a05f2007-03-19 17:44:26 +00004164 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3b7511c2001-05-26 13:15:44 +00004165 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004166 put4byte(&pPage1->aData[36], n-1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004167
4168 /* The code within this loop is run only once if the 'searchList' variable
4169 ** is not true. Otherwise, it runs once for each trunk-page on the
4170 ** free-list until the page 'nearby' is located.
4171 */
4172 do {
4173 pPrevTrunk = pTrunk;
4174 if( pPrevTrunk ){
4175 iTrunk = get4byte(&pPrevTrunk->aData[0]);
drhbea00b92002-07-08 10:59:50 +00004176 }else{
danielk1977cb1a7eb2004-11-05 12:27:02 +00004177 iTrunk = get4byte(&pPage1->aData[32]);
drhbea00b92002-07-08 10:59:50 +00004178 }
drh16a9b832007-05-05 18:39:25 +00004179 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004180 if( rc ){
drhd3627af2006-12-18 18:34:51 +00004181 pTrunk = 0;
4182 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004183 }
4184
4185 k = get4byte(&pTrunk->aData[4]);
4186 if( k==0 && !searchList ){
4187 /* The trunk has no leaves and the list is not being searched.
4188 ** So extract the trunk page itself and use it as the newly
4189 ** allocated page */
4190 assert( pPrevTrunk==0 );
danielk19773b8a05f2007-03-19 17:44:26 +00004191 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004192 if( rc ){
4193 goto end_allocate_page;
4194 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004195 *pPgno = iTrunk;
4196 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4197 *ppPage = pTrunk;
4198 pTrunk = 0;
4199 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
drh45b1fac2008-07-04 17:52:42 +00004200 }else if( k>pBt->usableSize/4 - 2 ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004201 /* Value of k is out of range. Database corruption */
drhd3627af2006-12-18 18:34:51 +00004202 rc = SQLITE_CORRUPT_BKPT;
4203 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004204#ifndef SQLITE_OMIT_AUTOVACUUM
4205 }else if( searchList && nearby==iTrunk ){
4206 /* The list is being searched and this trunk page is the page
4207 ** to allocate, regardless of whether it has leaves.
4208 */
4209 assert( *pPgno==iTrunk );
4210 *ppPage = pTrunk;
4211 searchList = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00004212 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004213 if( rc ){
4214 goto end_allocate_page;
4215 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004216 if( k==0 ){
4217 if( !pPrevTrunk ){
4218 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4219 }else{
4220 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4221 }
4222 }else{
4223 /* The trunk page is required by the caller but it contains
4224 ** pointers to free-list leaves. The first leaf becomes a trunk
4225 ** page in this case.
4226 */
4227 MemPage *pNewTrunk;
4228 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
drh16a9b832007-05-05 18:39:25 +00004229 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004230 if( rc!=SQLITE_OK ){
drhd3627af2006-12-18 18:34:51 +00004231 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004232 }
danielk19773b8a05f2007-03-19 17:44:26 +00004233 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004234 if( rc!=SQLITE_OK ){
4235 releasePage(pNewTrunk);
drhd3627af2006-12-18 18:34:51 +00004236 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004237 }
4238 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4239 put4byte(&pNewTrunk->aData[4], k-1);
4240 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
drhd3627af2006-12-18 18:34:51 +00004241 releasePage(pNewTrunk);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004242 if( !pPrevTrunk ){
drhc5053fb2008-11-27 02:22:10 +00004243 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
danielk1977cb1a7eb2004-11-05 12:27:02 +00004244 put4byte(&pPage1->aData[32], iNewTrunk);
4245 }else{
danielk19773b8a05f2007-03-19 17:44:26 +00004246 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004247 if( rc ){
4248 goto end_allocate_page;
4249 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004250 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4251 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004252 }
4253 pTrunk = 0;
4254 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4255#endif
4256 }else{
4257 /* Extract a leaf from the trunk */
4258 int closest;
4259 Pgno iPage;
4260 unsigned char *aData = pTrunk->aData;
danielk19773b8a05f2007-03-19 17:44:26 +00004261 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhd3627af2006-12-18 18:34:51 +00004262 if( rc ){
4263 goto end_allocate_page;
4264 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004265 if( nearby>0 ){
4266 int i, dist;
4267 closest = 0;
4268 dist = get4byte(&aData[8]) - nearby;
4269 if( dist<0 ) dist = -dist;
4270 for(i=1; i<k; i++){
4271 int d2 = get4byte(&aData[8+i*4]) - nearby;
4272 if( d2<0 ) d2 = -d2;
4273 if( d2<dist ){
4274 closest = i;
4275 dist = d2;
4276 }
4277 }
4278 }else{
4279 closest = 0;
4280 }
4281
4282 iPage = get4byte(&aData[8+closest*4]);
4283 if( !searchList || iPage==nearby ){
danielk197789d40042008-11-17 14:20:56 +00004284 Pgno nPage;
shane1f9e6aa2008-06-09 19:27:11 +00004285 *pPgno = iPage;
danielk197789d40042008-11-17 14:20:56 +00004286 nPage = pagerPagecount(pBt);
danielk1977ad0132d2008-06-07 08:58:22 +00004287 if( *pPgno>nPage ){
danielk1977cb1a7eb2004-11-05 12:27:02 +00004288 /* Free page off the end of the file */
danielk197743e377a2008-05-05 12:09:32 +00004289 rc = SQLITE_CORRUPT_BKPT;
4290 goto end_allocate_page;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004291 }
4292 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4293 ": %d more free pages\n",
4294 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4295 if( closest<k-1 ){
4296 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4297 }
4298 put4byte(&aData[4], k-1);
drhc5053fb2008-11-27 02:22:10 +00004299 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
drh16a9b832007-05-05 18:39:25 +00004300 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
danielk1977cb1a7eb2004-11-05 12:27:02 +00004301 if( rc==SQLITE_OK ){
drh538f5702007-04-13 02:14:30 +00004302 sqlite3PagerDontRollback((*ppPage)->pDbPage);
danielk19773b8a05f2007-03-19 17:44:26 +00004303 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004304 if( rc!=SQLITE_OK ){
4305 releasePage(*ppPage);
4306 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004307 }
4308 searchList = 0;
4309 }
drhee696e22004-08-30 16:52:17 +00004310 }
danielk1977cb1a7eb2004-11-05 12:27:02 +00004311 releasePage(pPrevTrunk);
drhd3627af2006-12-18 18:34:51 +00004312 pPrevTrunk = 0;
danielk1977cb1a7eb2004-11-05 12:27:02 +00004313 }while( searchList );
drh3b7511c2001-05-26 13:15:44 +00004314 }else{
drh3aac2dd2004-04-26 14:10:20 +00004315 /* There are no pages on the freelist, so create a new page at the
4316 ** end of the file */
danielk197789d40042008-11-17 14:20:56 +00004317 int nPage = pagerPagecount(pBt);
danielk1977ad0132d2008-06-07 08:58:22 +00004318 *pPgno = nPage + 1;
danielk1977afcdd022004-10-31 16:25:42 +00004319
4320#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977dddbcdc2007-04-26 14:42:34 +00004321 if( pBt->nTrunc ){
4322 /* An incr-vacuum has already run within this transaction. So the
4323 ** page to allocate is not from the physical end of the file, but
4324 ** at pBt->nTrunc.
4325 */
4326 *pPgno = pBt->nTrunc+1;
4327 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4328 (*pPgno)++;
4329 }
4330 }
danielk1977266664d2006-02-10 08:24:21 +00004331 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
danielk1977afcdd022004-10-31 16:25:42 +00004332 /* If *pPgno refers to a pointer-map page, allocate two new pages
4333 ** at the end of the file instead of one. The first allocated page
4334 ** becomes a new pointer-map page, the second is used by the caller.
4335 */
4336 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
danielk1977599fcba2004-11-08 07:13:13 +00004337 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
danielk1977afcdd022004-10-31 16:25:42 +00004338 (*pPgno)++;
drh72190432008-01-31 14:54:43 +00004339 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
danielk1977afcdd022004-10-31 16:25:42 +00004340 }
danielk1977dddbcdc2007-04-26 14:42:34 +00004341 if( pBt->nTrunc ){
4342 pBt->nTrunc = *pPgno;
4343 }
danielk1977afcdd022004-10-31 16:25:42 +00004344#endif
4345
danielk1977599fcba2004-11-08 07:13:13 +00004346 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drh16a9b832007-05-05 18:39:25 +00004347 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
drh3b7511c2001-05-26 13:15:44 +00004348 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00004349 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
danielk1977aac0a382005-01-16 11:07:06 +00004350 if( rc!=SQLITE_OK ){
4351 releasePage(*ppPage);
4352 }
drh3a4c1412004-05-09 20:40:11 +00004353 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
drh3b7511c2001-05-26 13:15:44 +00004354 }
danielk1977599fcba2004-11-08 07:13:13 +00004355
4356 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
drhd3627af2006-12-18 18:34:51 +00004357
4358end_allocate_page:
4359 releasePage(pTrunk);
4360 releasePage(pPrevTrunk);
danielk1977b247c212008-11-21 09:09:01 +00004361 if( rc==SQLITE_OK ){
4362 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4363 releasePage(*ppPage);
4364 return SQLITE_CORRUPT_BKPT;
4365 }
4366 (*ppPage)->isInit = 0;
danielk1977eaa06f62008-09-18 17:34:44 +00004367 }
drh3b7511c2001-05-26 13:15:44 +00004368 return rc;
4369}
4370
4371/*
drh3aac2dd2004-04-26 14:10:20 +00004372** Add a page of the database file to the freelist.
drh5e2f8b92001-05-28 00:41:15 +00004373**
danielk19773b8a05f2007-03-19 17:44:26 +00004374** sqlite3PagerUnref() is NOT called for pPage.
drh3b7511c2001-05-26 13:15:44 +00004375*/
drh3aac2dd2004-04-26 14:10:20 +00004376static int freePage(MemPage *pPage){
danielk1977aef0bf62005-12-30 16:28:01 +00004377 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004378 MemPage *pPage1 = pBt->pPage1;
4379 int rc, n, k;
drh8b2f49b2001-06-08 00:21:52 +00004380
drh3aac2dd2004-04-26 14:10:20 +00004381 /* Prepare the page for freeing */
drh1fee73e2007-08-29 04:00:57 +00004382 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh3aac2dd2004-04-26 14:10:20 +00004383 assert( pPage->pgno>1 );
4384 pPage->isInit = 0;
drh3aac2dd2004-04-26 14:10:20 +00004385
drha34b6762004-05-07 13:30:42 +00004386 /* Increment the free page count on pPage1 */
danielk19773b8a05f2007-03-19 17:44:26 +00004387 rc = sqlite3PagerWrite(pPage1->pDbPage);
drh3aac2dd2004-04-26 14:10:20 +00004388 if( rc ) return rc;
4389 n = get4byte(&pPage1->aData[36]);
4390 put4byte(&pPage1->aData[36], n+1);
4391
drhfcce93f2006-02-22 03:08:32 +00004392#ifdef SQLITE_SECURE_DELETE
4393 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4394 ** always fully overwrite deleted information with zeros.
4395 */
danielk19773b8a05f2007-03-19 17:44:26 +00004396 rc = sqlite3PagerWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004397 if( rc ) return rc;
4398 memset(pPage->aData, 0, pPage->pBt->pageSize);
4399#endif
4400
danielk1977687566d2004-11-02 12:56:41 +00004401 /* If the database supports auto-vacuum, write an entry in the pointer-map
danielk1977cb1a7eb2004-11-05 12:27:02 +00004402 ** to indicate that the page is free.
danielk1977687566d2004-11-02 12:56:41 +00004403 */
danielk197785d90ca2008-07-19 14:25:15 +00004404 if( ISAUTOVACUUM ){
danielk1977687566d2004-11-02 12:56:41 +00004405 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
danielk1977a64a0352004-11-05 01:45:13 +00004406 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00004407 }
danielk1977687566d2004-11-02 12:56:41 +00004408
drh3aac2dd2004-04-26 14:10:20 +00004409 if( n==0 ){
4410 /* This is the first free page */
danielk19773b8a05f2007-03-19 17:44:26 +00004411 rc = sqlite3PagerWrite(pPage->pDbPage);
drhda200cc2004-05-09 11:51:38 +00004412 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004413 memset(pPage->aData, 0, 8);
drha34b6762004-05-07 13:30:42 +00004414 put4byte(&pPage1->aData[32], pPage->pgno);
drh3a4c1412004-05-09 20:40:11 +00004415 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004416 }else{
4417 /* Other free pages already exist. Retrive the first trunk page
4418 ** of the freelist and find out how many leaves it has. */
drha34b6762004-05-07 13:30:42 +00004419 MemPage *pTrunk;
drh16a9b832007-05-05 18:39:25 +00004420 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
drh3b7511c2001-05-26 13:15:44 +00004421 if( rc ) return rc;
drh3aac2dd2004-04-26 14:10:20 +00004422 k = get4byte(&pTrunk->aData[4]);
drhee696e22004-08-30 16:52:17 +00004423 if( k>=pBt->usableSize/4 - 8 ){
drh3aac2dd2004-04-26 14:10:20 +00004424 /* The trunk is full. Turn the page being freed into a new
drh45b1fac2008-07-04 17:52:42 +00004425 ** trunk page with no leaves.
4426 **
4427 ** Note that the trunk page is not really full until it contains
4428 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4429 ** coded. But due to a coding error in versions of SQLite prior to
4430 ** 3.6.0, databases with freelist trunk pages holding more than
4431 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4432 ** to maintain backwards compatibility with older versions of SQLite,
4433 ** we will contain to restrict the number of entries to usableSize/4 - 8
4434 ** for now. At some point in the future (once everyone has upgraded
4435 ** to 3.6.0 or later) we should consider fixing the conditional above
4436 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4437 */
danielk19773b8a05f2007-03-19 17:44:26 +00004438 rc = sqlite3PagerWrite(pPage->pDbPage);
drhb9ee4932007-09-07 14:32:06 +00004439 if( rc==SQLITE_OK ){
4440 put4byte(pPage->aData, pTrunk->pgno);
4441 put4byte(&pPage->aData[4], 0);
4442 put4byte(&pPage1->aData[32], pPage->pgno);
4443 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4444 pPage->pgno, pTrunk->pgno));
4445 }
4446 }else if( k<0 ){
4447 rc = SQLITE_CORRUPT;
drh3aac2dd2004-04-26 14:10:20 +00004448 }else{
4449 /* Add the newly freed page as a leaf on the current trunk */
danielk19773b8a05f2007-03-19 17:44:26 +00004450 rc = sqlite3PagerWrite(pTrunk->pDbPage);
drhf5345442007-04-09 12:45:02 +00004451 if( rc==SQLITE_OK ){
4452 put4byte(&pTrunk->aData[4], k+1);
4453 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
drhfcce93f2006-02-22 03:08:32 +00004454#ifndef SQLITE_SECURE_DELETE
danielk1977a1fa00d2008-08-27 15:16:33 +00004455 rc = sqlite3PagerDontWrite(pPage->pDbPage);
drhfcce93f2006-02-22 03:08:32 +00004456#endif
drhf5345442007-04-09 12:45:02 +00004457 }
drh3a4c1412004-05-09 20:40:11 +00004458 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
drh3aac2dd2004-04-26 14:10:20 +00004459 }
4460 releasePage(pTrunk);
drh3b7511c2001-05-26 13:15:44 +00004461 }
drh3b7511c2001-05-26 13:15:44 +00004462 return rc;
4463}
4464
4465/*
drh3aac2dd2004-04-26 14:10:20 +00004466** Free any overflow pages associated with the given Cell.
drh3b7511c2001-05-26 13:15:44 +00004467*/
drh3aac2dd2004-04-26 14:10:20 +00004468static int clearCell(MemPage *pPage, unsigned char *pCell){
danielk1977aef0bf62005-12-30 16:28:01 +00004469 BtShared *pBt = pPage->pBt;
drh6f11bef2004-05-13 01:12:56 +00004470 CellInfo info;
drh3aac2dd2004-04-26 14:10:20 +00004471 Pgno ovflPgno;
drh6f11bef2004-05-13 01:12:56 +00004472 int rc;
drh94440812007-03-06 11:42:19 +00004473 int nOvfl;
4474 int ovflPageSize;
drh3b7511c2001-05-26 13:15:44 +00004475
drh1fee73e2007-08-29 04:00:57 +00004476 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh16a9b832007-05-05 18:39:25 +00004477 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004478 if( info.iOverflow==0 ){
drha34b6762004-05-07 13:30:42 +00004479 return SQLITE_OK; /* No overflow pages. Return without doing anything */
drh3aac2dd2004-04-26 14:10:20 +00004480 }
drh6f11bef2004-05-13 01:12:56 +00004481 ovflPgno = get4byte(&pCell[info.iOverflow]);
drh94440812007-03-06 11:42:19 +00004482 ovflPageSize = pBt->usableSize - 4;
drh72365832007-03-06 15:53:44 +00004483 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4484 assert( ovflPgno==0 || nOvfl>0 );
4485 while( nOvfl-- ){
drh3aac2dd2004-04-26 14:10:20 +00004486 MemPage *pOvfl;
danielk197789d40042008-11-17 14:20:56 +00004487 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt) ){
drh49285702005-09-17 15:20:26 +00004488 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00004489 }
danielk19778c0a9592007-04-30 16:55:00 +00004490
4491 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
drh3b7511c2001-05-26 13:15:44 +00004492 if( rc ) return rc;
drha34b6762004-05-07 13:30:42 +00004493 rc = freePage(pOvfl);
danielk19773b8a05f2007-03-19 17:44:26 +00004494 sqlite3PagerUnref(pOvfl->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00004495 if( rc ) return rc;
drh3b7511c2001-05-26 13:15:44 +00004496 }
drh5e2f8b92001-05-28 00:41:15 +00004497 return SQLITE_OK;
drh3b7511c2001-05-26 13:15:44 +00004498}
4499
4500/*
drh91025292004-05-03 19:49:32 +00004501** Create the byte sequence used to represent a cell on page pPage
4502** and write that byte sequence into pCell[]. Overflow pages are
4503** allocated and filled in as necessary. The calling procedure
4504** is responsible for making sure sufficient space has been allocated
4505** for pCell[].
4506**
4507** Note that pCell does not necessarily need to point to the pPage->aData
4508** area. pCell might point to some temporary storage. The cell will
4509** be constructed in this temporary area then copied into pPage->aData
4510** later.
drh3b7511c2001-05-26 13:15:44 +00004511*/
4512static int fillInCell(
drh3aac2dd2004-04-26 14:10:20 +00004513 MemPage *pPage, /* The page that contains the cell */
drh4b70f112004-05-02 21:12:19 +00004514 unsigned char *pCell, /* Complete text of the cell */
drh4a1c3802004-05-12 15:15:47 +00004515 const void *pKey, i64 nKey, /* The key */
drh4b70f112004-05-02 21:12:19 +00004516 const void *pData,int nData, /* The data */
drhb026e052007-05-02 01:34:31 +00004517 int nZero, /* Extra zero bytes to append to pData */
drh4b70f112004-05-02 21:12:19 +00004518 int *pnSize /* Write cell size here */
drh3b7511c2001-05-26 13:15:44 +00004519){
drh3b7511c2001-05-26 13:15:44 +00004520 int nPayload;
drh8c6fa9b2004-05-26 00:01:53 +00004521 const u8 *pSrc;
drha34b6762004-05-07 13:30:42 +00004522 int nSrc, n, rc;
drh3aac2dd2004-04-26 14:10:20 +00004523 int spaceLeft;
4524 MemPage *pOvfl = 0;
drh9b171272004-05-08 02:03:22 +00004525 MemPage *pToRelease = 0;
drh3aac2dd2004-04-26 14:10:20 +00004526 unsigned char *pPrior;
4527 unsigned char *pPayload;
danielk1977aef0bf62005-12-30 16:28:01 +00004528 BtShared *pBt = pPage->pBt;
drh3aac2dd2004-04-26 14:10:20 +00004529 Pgno pgnoOvfl = 0;
drh4b70f112004-05-02 21:12:19 +00004530 int nHeader;
drh6f11bef2004-05-13 01:12:56 +00004531 CellInfo info;
drh3b7511c2001-05-26 13:15:44 +00004532
drh1fee73e2007-08-29 04:00:57 +00004533 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004534
drhc5053fb2008-11-27 02:22:10 +00004535 /* pPage is not necessarily writeable since pCell might be auxiliary
4536 ** buffer space that is separate from the pPage buffer area */
4537 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
4538 || sqlite3PagerIswriteable(pPage->pDbPage) );
4539
drh91025292004-05-03 19:49:32 +00004540 /* Fill in the header. */
drh43605152004-05-29 21:46:49 +00004541 nHeader = 0;
drh91025292004-05-03 19:49:32 +00004542 if( !pPage->leaf ){
4543 nHeader += 4;
4544 }
drh8b18dd42004-05-12 19:18:15 +00004545 if( pPage->hasData ){
drhb026e052007-05-02 01:34:31 +00004546 nHeader += putVarint(&pCell[nHeader], nData+nZero);
drh6f11bef2004-05-13 01:12:56 +00004547 }else{
drhb026e052007-05-02 01:34:31 +00004548 nData = nZero = 0;
drh91025292004-05-03 19:49:32 +00004549 }
drh6f11bef2004-05-13 01:12:56 +00004550 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
drh16a9b832007-05-05 18:39:25 +00004551 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00004552 assert( info.nHeader==nHeader );
4553 assert( info.nKey==nKey );
danielk197789d40042008-11-17 14:20:56 +00004554 assert( info.nData==(u32)(nData+nZero) );
drh6f11bef2004-05-13 01:12:56 +00004555
4556 /* Fill in the payload */
drhb026e052007-05-02 01:34:31 +00004557 nPayload = nData + nZero;
drh3aac2dd2004-04-26 14:10:20 +00004558 if( pPage->intKey ){
4559 pSrc = pData;
4560 nSrc = nData;
drh91025292004-05-03 19:49:32 +00004561 nData = 0;
drhf49661a2008-12-10 16:45:50 +00004562 }else{
4563 /* TBD: Perhaps raise SQLITE_CORRUPT if nKey is larger than 31 bits? */
4564 nPayload += (int)nKey;
drh3aac2dd2004-04-26 14:10:20 +00004565 pSrc = pKey;
drhf49661a2008-12-10 16:45:50 +00004566 nSrc = (int)nKey;
drh3aac2dd2004-04-26 14:10:20 +00004567 }
drh6f11bef2004-05-13 01:12:56 +00004568 *pnSize = info.nSize;
4569 spaceLeft = info.nLocal;
drh3aac2dd2004-04-26 14:10:20 +00004570 pPayload = &pCell[nHeader];
drh6f11bef2004-05-13 01:12:56 +00004571 pPrior = &pCell[info.iOverflow];
drh3b7511c2001-05-26 13:15:44 +00004572
drh3b7511c2001-05-26 13:15:44 +00004573 while( nPayload>0 ){
4574 if( spaceLeft==0 ){
danielk1977afcdd022004-10-31 16:25:42 +00004575#ifndef SQLITE_OMIT_AUTOVACUUM
4576 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
danielk1977b39f70b2007-05-17 18:28:11 +00004577 if( pBt->autoVacuum ){
4578 do{
4579 pgnoOvfl++;
4580 } while(
4581 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4582 );
danielk1977b39f70b2007-05-17 18:28:11 +00004583 }
danielk1977afcdd022004-10-31 16:25:42 +00004584#endif
drhf49661a2008-12-10 16:45:50 +00004585 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
danielk1977afcdd022004-10-31 16:25:42 +00004586#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977a19df672004-11-03 11:37:07 +00004587 /* If the database supports auto-vacuum, and the second or subsequent
4588 ** overflow page is being allocated, add an entry to the pointer-map
danielk19774ef24492007-05-23 09:52:41 +00004589 ** for that page now.
4590 **
4591 ** If this is the first overflow page, then write a partial entry
4592 ** to the pointer-map. If we write nothing to this pointer-map slot,
4593 ** then the optimistic overflow chain processing in clearCell()
4594 ** may misinterpret the uninitialised values and delete the
4595 ** wrong pages from the database.
danielk1977afcdd022004-10-31 16:25:42 +00004596 */
danielk19774ef24492007-05-23 09:52:41 +00004597 if( pBt->autoVacuum && rc==SQLITE_OK ){
4598 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4599 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
danielk197789a4be82007-05-23 13:34:32 +00004600 if( rc ){
4601 releasePage(pOvfl);
4602 }
danielk1977afcdd022004-10-31 16:25:42 +00004603 }
4604#endif
drh3b7511c2001-05-26 13:15:44 +00004605 if( rc ){
drh9b171272004-05-08 02:03:22 +00004606 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004607 return rc;
4608 }
drhc5053fb2008-11-27 02:22:10 +00004609
4610 /* If pToRelease is not zero than pPrior points into the data area
4611 ** of pToRelease. Make sure pToRelease is still writeable. */
4612 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
4613
4614 /* If pPrior is part of the data area of pPage, then make sure pPage
4615 ** is still writeable */
4616 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
4617 || sqlite3PagerIswriteable(pPage->pDbPage) );
4618
drh3aac2dd2004-04-26 14:10:20 +00004619 put4byte(pPrior, pgnoOvfl);
drh9b171272004-05-08 02:03:22 +00004620 releasePage(pToRelease);
4621 pToRelease = pOvfl;
drh3aac2dd2004-04-26 14:10:20 +00004622 pPrior = pOvfl->aData;
4623 put4byte(pPrior, 0);
4624 pPayload = &pOvfl->aData[4];
drhb6f41482004-05-14 01:58:11 +00004625 spaceLeft = pBt->usableSize - 4;
drh3b7511c2001-05-26 13:15:44 +00004626 }
4627 n = nPayload;
4628 if( n>spaceLeft ) n = spaceLeft;
drhc5053fb2008-11-27 02:22:10 +00004629
4630 /* If pToRelease is not zero than pPayload points into the data area
4631 ** of pToRelease. Make sure pToRelease is still writeable. */
4632 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
4633
4634 /* If pPayload is part of the data area of pPage, then make sure pPage
4635 ** is still writeable */
4636 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
4637 || sqlite3PagerIswriteable(pPage->pDbPage) );
4638
drhb026e052007-05-02 01:34:31 +00004639 if( nSrc>0 ){
4640 if( n>nSrc ) n = nSrc;
4641 assert( pSrc );
4642 memcpy(pPayload, pSrc, n);
4643 }else{
4644 memset(pPayload, 0, n);
4645 }
drh3b7511c2001-05-26 13:15:44 +00004646 nPayload -= n;
drhde647132004-05-07 17:57:49 +00004647 pPayload += n;
drh9b171272004-05-08 02:03:22 +00004648 pSrc += n;
drh3aac2dd2004-04-26 14:10:20 +00004649 nSrc -= n;
drh3b7511c2001-05-26 13:15:44 +00004650 spaceLeft -= n;
drh3aac2dd2004-04-26 14:10:20 +00004651 if( nSrc==0 ){
4652 nSrc = nData;
4653 pSrc = pData;
4654 }
drhdd793422001-06-28 01:54:48 +00004655 }
drh9b171272004-05-08 02:03:22 +00004656 releasePage(pToRelease);
drh3b7511c2001-05-26 13:15:44 +00004657 return SQLITE_OK;
4658}
4659
drh14acc042001-06-10 19:56:58 +00004660/*
4661** Remove the i-th cell from pPage. This routine effects pPage only.
4662** The cell content is not freed or deallocated. It is assumed that
4663** the cell content has been copied someplace else. This routine just
4664** removes the reference to the cell from pPage.
4665**
4666** "sz" must be the number of bytes in the cell.
drh14acc042001-06-10 19:56:58 +00004667*/
shane0af3f892008-11-12 04:55:34 +00004668static int dropCell(MemPage *pPage, int idx, int sz){
drh43605152004-05-29 21:46:49 +00004669 int i; /* Loop counter */
4670 int pc; /* Offset to cell content of cell being deleted */
4671 u8 *data; /* pPage->aData */
4672 u8 *ptr; /* Used to move bytes around within data[] */
shanedcc50b72008-11-13 18:29:50 +00004673 int rc; /* The return code */
drh43605152004-05-29 21:46:49 +00004674
drh8c42ca92001-06-22 19:15:00 +00004675 assert( idx>=0 && idx<pPage->nCell );
drh43605152004-05-29 21:46:49 +00004676 assert( sz==cellSize(pPage, idx) );
danielk19773b8a05f2007-03-19 17:44:26 +00004677 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh1fee73e2007-08-29 04:00:57 +00004678 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhda200cc2004-05-09 11:51:38 +00004679 data = pPage->aData;
drh43605152004-05-29 21:46:49 +00004680 ptr = &data[pPage->cellOffset + 2*idx];
shane0af3f892008-11-12 04:55:34 +00004681 pc = get2byte(ptr);
drhc5053fb2008-11-27 02:22:10 +00004682 if( (pc<pPage->hdrOffset+6+(pPage->leaf?0:4))
4683 || (pc+sz>pPage->pBt->usableSize) ){
shane0af3f892008-11-12 04:55:34 +00004684 return SQLITE_CORRUPT_BKPT;
4685 }
shanedcc50b72008-11-13 18:29:50 +00004686 rc = freeSpace(pPage, pc, sz);
4687 if( rc!=SQLITE_OK ){
4688 return rc;
4689 }
drh43605152004-05-29 21:46:49 +00004690 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4691 ptr[0] = ptr[2];
4692 ptr[1] = ptr[3];
drh14acc042001-06-10 19:56:58 +00004693 }
4694 pPage->nCell--;
drh43605152004-05-29 21:46:49 +00004695 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4696 pPage->nFree += 2;
shane0af3f892008-11-12 04:55:34 +00004697 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00004698}
4699
4700/*
4701** Insert a new cell on pPage at cell index "i". pCell points to the
4702** content of the cell.
4703**
4704** If the cell content will fit on the page, then put it there. If it
drh43605152004-05-29 21:46:49 +00004705** will not fit, then make a copy of the cell content into pTemp if
4706** pTemp is not null. Regardless of pTemp, allocate a new entry
4707** in pPage->aOvfl[] and make it point to the cell content (either
4708** in pTemp or the original pCell) and also record its index.
4709** Allocating a new entry in pPage->aCell[] implies that
4710** pPage->nOverflow is incremented.
danielk1977a3ad5e72005-01-07 08:56:44 +00004711**
4712** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4713** cell. The caller will overwrite them after this function returns. If
drh4b238df2005-01-08 15:43:18 +00004714** nSkip is non-zero, then pCell may not point to an invalid memory location
danielk1977a3ad5e72005-01-07 08:56:44 +00004715** (but pCell+nSkip is always valid).
drh14acc042001-06-10 19:56:58 +00004716*/
danielk1977e80463b2004-11-03 03:01:16 +00004717static int insertCell(
drh24cd67e2004-05-10 16:18:47 +00004718 MemPage *pPage, /* Page into which we are copying */
drh43605152004-05-29 21:46:49 +00004719 int i, /* New cell becomes the i-th cell of the page */
4720 u8 *pCell, /* Content of the new cell */
4721 int sz, /* Bytes of content in pCell */
danielk1977a3ad5e72005-01-07 08:56:44 +00004722 u8 *pTemp, /* Temp storage space for pCell, if needed */
4723 u8 nSkip /* Do not write the first nSkip bytes of the cell */
drh24cd67e2004-05-10 16:18:47 +00004724){
drh43605152004-05-29 21:46:49 +00004725 int idx; /* Where to write new cell content in data[] */
4726 int j; /* Loop counter */
4727 int top; /* First byte of content for any cell in data[] */
4728 int end; /* First byte past the last cell pointer in data[] */
4729 int ins; /* Index in data[] where new cell pointer is inserted */
4730 int hdr; /* Offset into data[] of the page header */
4731 int cellOffset; /* Address of first cell pointer in data[] */
4732 u8 *data; /* The content of the whole page */
4733 u8 *ptr; /* Used for moving information around in data[] */
4734
4735 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
drhf49661a2008-12-10 16:45:50 +00004736 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
4737 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
drh43605152004-05-29 21:46:49 +00004738 assert( sz==cellSizePtr(pPage, pCell) );
drh1fee73e2007-08-29 04:00:57 +00004739 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh43605152004-05-29 21:46:49 +00004740 if( pPage->nOverflow || sz+2>pPage->nFree ){
drh24cd67e2004-05-10 16:18:47 +00004741 if( pTemp ){
danielk1977a3ad5e72005-01-07 08:56:44 +00004742 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004743 pCell = pTemp;
drh24cd67e2004-05-10 16:18:47 +00004744 }
drh43605152004-05-29 21:46:49 +00004745 j = pPage->nOverflow++;
danielk197789d40042008-11-17 14:20:56 +00004746 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
drh43605152004-05-29 21:46:49 +00004747 pPage->aOvfl[j].pCell = pCell;
drhf49661a2008-12-10 16:45:50 +00004748 pPage->aOvfl[j].idx = (u16)i;
drh43605152004-05-29 21:46:49 +00004749 pPage->nFree = 0;
drh14acc042001-06-10 19:56:58 +00004750 }else{
danielk19776e465eb2007-08-21 13:11:00 +00004751 int rc = sqlite3PagerWrite(pPage->pDbPage);
4752 if( rc!=SQLITE_OK ){
4753 return rc;
4754 }
4755 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +00004756 data = pPage->aData;
4757 hdr = pPage->hdrOffset;
4758 top = get2byte(&data[hdr+5]);
4759 cellOffset = pPage->cellOffset;
4760 end = cellOffset + 2*pPage->nCell + 2;
4761 ins = cellOffset + 2*i;
4762 if( end > top - sz ){
shane0af3f892008-11-12 04:55:34 +00004763 rc = defragmentPage(pPage);
4764 if( rc!=SQLITE_OK ){
4765 return rc;
4766 }
drh43605152004-05-29 21:46:49 +00004767 top = get2byte(&data[hdr+5]);
4768 assert( end + sz <= top );
4769 }
4770 idx = allocateSpace(pPage, sz);
4771 assert( idx>0 );
4772 assert( end <= get2byte(&data[hdr+5]) );
shane0af3f892008-11-12 04:55:34 +00004773 if (idx+sz > pPage->pBt->usableSize) {
shane34ac18d2008-11-11 22:18:20 +00004774 return SQLITE_CORRUPT_BKPT;
shane0af3f892008-11-12 04:55:34 +00004775 }
drh43605152004-05-29 21:46:49 +00004776 pPage->nCell++;
4777 pPage->nFree -= 2;
danielk1977a3ad5e72005-01-07 08:56:44 +00004778 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
drh43605152004-05-29 21:46:49 +00004779 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4780 ptr[0] = ptr[-2];
4781 ptr[1] = ptr[-1];
drhda200cc2004-05-09 11:51:38 +00004782 }
drh43605152004-05-29 21:46:49 +00004783 put2byte(&data[ins], idx);
4784 put2byte(&data[hdr+3], pPage->nCell);
danielk1977a19df672004-11-03 11:37:07 +00004785#ifndef SQLITE_OMIT_AUTOVACUUM
4786 if( pPage->pBt->autoVacuum ){
4787 /* The cell may contain a pointer to an overflow page. If so, write
4788 ** the entry for the overflow page into the pointer map.
4789 */
4790 CellInfo info;
drh16a9b832007-05-05 18:39:25 +00004791 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh72365832007-03-06 15:53:44 +00004792 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
danielk1977a19df672004-11-03 11:37:07 +00004793 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4794 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
danielk19776e465eb2007-08-21 13:11:00 +00004795 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
danielk1977a19df672004-11-03 11:37:07 +00004796 if( rc!=SQLITE_OK ) return rc;
4797 }
4798 }
4799#endif
drh14acc042001-06-10 19:56:58 +00004800 }
danielk1977e80463b2004-11-03 03:01:16 +00004801
danielk1977e80463b2004-11-03 03:01:16 +00004802 return SQLITE_OK;
drh14acc042001-06-10 19:56:58 +00004803}
4804
4805/*
drhfa1a98a2004-05-14 19:08:17 +00004806** Add a list of cells to a page. The page should be initially empty.
4807** The cells are guaranteed to fit on the page.
4808*/
4809static void assemblePage(
4810 MemPage *pPage, /* The page to be assemblied */
4811 int nCell, /* The number of cells to add to this page */
drh43605152004-05-29 21:46:49 +00004812 u8 **apCell, /* Pointers to cell bodies */
drha9121e42008-02-19 14:59:35 +00004813 u16 *aSize /* Sizes of the cells */
drhfa1a98a2004-05-14 19:08:17 +00004814){
4815 int i; /* Loop counter */
4816 int totalSize; /* Total size of all cells */
4817 int hdr; /* Index of page header */
drh43605152004-05-29 21:46:49 +00004818 int cellptr; /* Address of next cell pointer */
4819 int cellbody; /* Address of next cell body */
drhfa1a98a2004-05-14 19:08:17 +00004820 u8 *data; /* Data for the page */
4821
drh43605152004-05-29 21:46:49 +00004822 assert( pPage->nOverflow==0 );
drh1fee73e2007-08-29 04:00:57 +00004823 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf49661a2008-12-10 16:45:50 +00004824 assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
drhfa1a98a2004-05-14 19:08:17 +00004825 totalSize = 0;
4826 for(i=0; i<nCell; i++){
4827 totalSize += aSize[i];
4828 }
drh43605152004-05-29 21:46:49 +00004829 assert( totalSize+2*nCell<=pPage->nFree );
drhfa1a98a2004-05-14 19:08:17 +00004830 assert( pPage->nCell==0 );
drhc5053fb2008-11-27 02:22:10 +00004831 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh43605152004-05-29 21:46:49 +00004832 cellptr = pPage->cellOffset;
drhfa1a98a2004-05-14 19:08:17 +00004833 data = pPage->aData;
4834 hdr = pPage->hdrOffset;
drh43605152004-05-29 21:46:49 +00004835 put2byte(&data[hdr+3], nCell);
drh09d0deb2005-08-02 17:13:09 +00004836 if( nCell ){
4837 cellbody = allocateSpace(pPage, totalSize);
4838 assert( cellbody>0 );
4839 assert( pPage->nFree >= 2*nCell );
4840 pPage->nFree -= 2*nCell;
4841 for(i=0; i<nCell; i++){
4842 put2byte(&data[cellptr], cellbody);
4843 memcpy(&data[cellbody], apCell[i], aSize[i]);
4844 cellptr += 2;
4845 cellbody += aSize[i];
4846 }
4847 assert( cellbody==pPage->pBt->usableSize );
drhfa1a98a2004-05-14 19:08:17 +00004848 }
drhf49661a2008-12-10 16:45:50 +00004849 pPage->nCell = (u16)nCell;
drhfa1a98a2004-05-14 19:08:17 +00004850}
4851
/*
** Parameters that control how many adjacent pages take part in a
** balancing operation.  NN is the number of neighbors on each side of
** the target page that participate.  NB is the total number of pages
** that participate: the target page plus NN neighbors on either side.
**
** NN can be no smaller than 1.  Raising NN above 1 (to 2 or 3) gives a
** modest improvement in SELECT and DELETE performance in exchange for a
** larger degradation in INSERT and UPDATE performance.  The value chosen
** here, NN==1, appears to give the best results overall.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (2*NN+1)      /* Total pages involved in the balance */
4866
drh43605152004-05-29 21:46:49 +00004867/* Forward reference */
danielk197771d5d2c2008-09-29 11:49:47 +00004868static int balance(BtCursor*, int);
danielk1977ac245ec2005-01-14 13:50:11 +00004869
drh615ae552005-01-16 23:21:00 +00004870#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00004871/*
4872** This version of balance() handles the common special case where
4873** a new entry is being inserted on the extreme right-end of the
4874** tree, in other words, when the new entry will become the largest
4875** entry in the tree.
4876**
4877** Instead of trying balance the 3 right-most leaf pages, just add
4878** a new page to the right-hand side and put the one new entry in
4879** that page. This leaves the right side of the tree somewhat
4880** unbalanced. But odds are that we will be inserting new entries
4881** at the end soon afterwards so the nearly empty page will quickly
4882** fill up. On average.
4883**
4884** pPage is the leaf page which is the right-most page in the tree.
4885** pParent is its parent. pPage must have a single overflow entry
4886** which is also the right-most entry on the page.
4887*/
danielk197771d5d2c2008-09-29 11:49:47 +00004888static int balance_quick(BtCursor *pCur){
danielk1977ac245ec2005-01-14 13:50:11 +00004889 int rc;
danielk1977eaa06f62008-09-18 17:34:44 +00004890 MemPage *pNew = 0;
danielk1977ac245ec2005-01-14 13:50:11 +00004891 Pgno pgnoNew;
4892 u8 *pCell;
drha9121e42008-02-19 14:59:35 +00004893 u16 szCell;
danielk1977ac245ec2005-01-14 13:50:11 +00004894 CellInfo info;
danielk197771d5d2c2008-09-29 11:49:47 +00004895 MemPage *pPage = pCur->apPage[pCur->iPage];
4896 MemPage *pParent = pCur->apPage[pCur->iPage-1];
danielk1977aef0bf62005-12-30 16:28:01 +00004897 BtShared *pBt = pPage->pBt;
danielk197779a40da2005-01-16 08:00:01 +00004898 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4899 int parentSize; /* Size of new divider cell */
4900 u8 parentCell[64]; /* Space for the new divider cell */
danielk1977ac245ec2005-01-14 13:50:11 +00004901
drh1fee73e2007-08-29 04:00:57 +00004902 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhd677b3d2007-08-20 22:48:41 +00004903
danielk1977ac245ec2005-01-14 13:50:11 +00004904 /* Allocate a new page. Insert the overflow cell from pPage
4905 ** into it. Then remove the overflow cell from pPage.
4906 */
drh4f0c5872007-03-26 22:05:01 +00004907 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004908 if( rc==SQLITE_OK ){
4909 pCell = pPage->aOvfl[0].pCell;
4910 szCell = cellSizePtr(pPage, pCell);
drhc5053fb2008-11-27 02:22:10 +00004911 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
danielk1977eaa06f62008-09-18 17:34:44 +00004912 zeroPage(pNew, pPage->aData[0]);
4913 assemblePage(pNew, 1, &pCell, &szCell);
4914 pPage->nOverflow = 0;
4915
danielk1977eaa06f62008-09-18 17:34:44 +00004916 /* pPage is currently the right-child of pParent. Change this
4917 ** so that the right-child is the new page allocated above and
4918 ** pPage is the next-to-right child.
4919 **
4920 ** Ignore the return value of the call to fillInCell(). fillInCell()
4921 ** may only return other than SQLITE_OK if it is required to allocate
4922 ** one or more overflow pages. Since an internal table B-Tree cell
4923 ** may never spill over onto an overflow page (it is a maximum of
4924 ** 13 bytes in size), it is not neccessary to check the return code.
4925 **
4926 ** Similarly, the insertCell() function cannot fail if the page
4927 ** being inserted into is already writable and the cell does not
4928 ** contain an overflow pointer. So ignore this return code too.
4929 */
4930 assert( pPage->nCell>0 );
4931 pCell = findCell(pPage, pPage->nCell-1);
4932 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4933 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4934 assert( parentSize<64 );
4935 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4936 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4937 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4938 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4939
4940 /* If this is an auto-vacuum database, update the pointer map
4941 ** with entries for the new page, and any pointer from the
4942 ** cell on the page to an overflow page.
4943 */
4944 if( ISAUTOVACUUM ){
4945 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4946 if( rc==SQLITE_OK ){
4947 rc = ptrmapPutOvfl(pNew, 0);
4948 }
danielk1977ac11ee62005-01-15 12:45:51 +00004949 }
danielk1977e08a3c42008-09-18 18:17:03 +00004950
4951 /* Release the reference to the new page. */
4952 releasePage(pNew);
danielk1977ac11ee62005-01-15 12:45:51 +00004953 }
4954
danielk1977eaa06f62008-09-18 17:34:44 +00004955 /* At this point the pPage->nFree variable is not set correctly with
4956 ** respect to the content of the page (because it was set to 0 by
4957 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
4958 ** correct.
4959 **
4960 ** This has to be done even if an error will be returned. Normally, if
4961 ** an error occurs during tree balancing, the contents of MemPage are
4962 ** not important, as they will be recalculated when the page is rolled
4963 ** back. But here, in balance_quick(), it is possible that pPage has
4964 ** not yet been marked dirty or written into the journal file. Therefore
4965 ** it will not be rolled back and so it is important to make sure that
4966 ** the page data and contents of MemPage are consistent.
4967 */
4968 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00004969 sqlite3BtreeInitPage(pPage);
danielk1977eaa06f62008-09-18 17:34:44 +00004970
danielk1977e08a3c42008-09-18 18:17:03 +00004971 /* If everything else succeeded, balance the parent page, in
4972 ** case the divider cell inserted caused it to become overfull.
danielk197779a40da2005-01-16 08:00:01 +00004973 */
danielk1977eaa06f62008-09-18 17:34:44 +00004974 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00004975 releasePage(pPage);
4976 pCur->iPage--;
4977 rc = balance(pCur, 0);
danielk1977eaa06f62008-09-18 17:34:44 +00004978 }
4979 return rc;
danielk1977ac245ec2005-01-14 13:50:11 +00004980}
drh615ae552005-01-16 23:21:00 +00004981#endif /* SQLITE_OMIT_QUICKBALANCE */
drh43605152004-05-29 21:46:49 +00004982
drhc3b70572003-01-04 19:44:07 +00004983/*
drhab01f612004-05-22 02:55:23 +00004984** This routine redistributes Cells on pPage and up to NN*2 siblings
drh8b2f49b2001-06-08 00:21:52 +00004985** of pPage so that all pages have about the same amount of free space.
drh0c6cc4e2004-06-15 02:13:26 +00004986** Usually NN siblings on either side of pPage is used in the balancing,
4987** though more siblings might come from one side if pPage is the first
drhab01f612004-05-22 02:55:23 +00004988** or last child of its parent. If pPage has fewer than 2*NN siblings
drh8b2f49b2001-06-08 00:21:52 +00004989** (something which can only happen if pPage is the root page or a
drh14acc042001-06-10 19:56:58 +00004990** child of root) then all available siblings participate in the balancing.
drh8b2f49b2001-06-08 00:21:52 +00004991**
drh0c6cc4e2004-06-15 02:13:26 +00004992** The number of siblings of pPage might be increased or decreased by one or
4993** two in an effort to keep pages nearly full but not over full. The root page
drhab01f612004-05-22 02:55:23 +00004994** is special and is allowed to be nearly empty. If pPage is
drh8c42ca92001-06-22 19:15:00 +00004995** the root page, then the depth of the tree might be increased
drh8b2f49b2001-06-08 00:21:52 +00004996** or decreased by one, as necessary, to keep the root page from being
drhab01f612004-05-22 02:55:23 +00004997** overfull or completely empty.
drh14acc042001-06-10 19:56:58 +00004998**
drh8b2f49b2001-06-08 00:21:52 +00004999** Note that when this routine is called, some of the Cells on pPage
drh4b70f112004-05-02 21:12:19 +00005000** might not actually be stored in pPage->aData[]. This can happen
drh8b2f49b2001-06-08 00:21:52 +00005001** if the page is overfull. Part of the job of this routine is to
drh4b70f112004-05-02 21:12:19 +00005002** make sure all Cells for pPage once again fit in pPage->aData[].
drh14acc042001-06-10 19:56:58 +00005003**
drh8c42ca92001-06-22 19:15:00 +00005004** In the course of balancing the siblings of pPage, the parent of pPage
5005** might become overfull or underfull. If that happens, then this routine
5006** is called recursively on the parent.
5007**
drh5e00f6c2001-09-13 13:46:56 +00005008** If this routine fails for any reason, it might leave the database
5009** in a corrupted state. So if this routine fails, the database should
5010** be rolled back.
drh8b2f49b2001-06-08 00:21:52 +00005011*/
danielk197771d5d2c2008-09-29 11:49:47 +00005012static int balance_nonroot(BtCursor *pCur){
5013 MemPage *pPage; /* The over or underfull page to balance */
drh8b2f49b2001-06-08 00:21:52 +00005014 MemPage *pParent; /* The parent of pPage */
drh16a9b832007-05-05 18:39:25 +00005015 BtShared *pBt; /* The whole database */
danielk1977634f2982005-03-28 08:44:07 +00005016 int nCell = 0; /* Number of cells in apCell[] */
5017 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
drh8b2f49b2001-06-08 00:21:52 +00005018 int nOld; /* Number of pages in apOld[] */
5019 int nNew; /* Number of pages in apNew[] */
drh8b2f49b2001-06-08 00:21:52 +00005020 int nDiv; /* Number of cells in apDiv[] */
drh14acc042001-06-10 19:56:58 +00005021 int i, j, k; /* Loop counters */
drha34b6762004-05-07 13:30:42 +00005022 int idx; /* Index of pPage in pParent->aCell[] */
5023 int nxDiv; /* Next divider slot in pParent->aCell[] */
drh14acc042001-06-10 19:56:58 +00005024 int rc; /* The return code */
drh91025292004-05-03 19:49:32 +00005025 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
drh8b18dd42004-05-12 19:18:15 +00005026 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
drh91025292004-05-03 19:49:32 +00005027 int usableSpace; /* Bytes in pPage beyond the header */
5028 int pageFlags; /* Value of pPage->aData[0] */
drh6019e162001-07-02 17:51:45 +00005029 int subtotal; /* Subtotal of bytes in cells on one page */
drhe5ae5732008-06-15 02:51:47 +00005030 int iSpace1 = 0; /* First unused byte of aSpace1[] */
5031 int iSpace2 = 0; /* First unused byte of aSpace2[] */
drhfacf0302008-06-17 15:12:00 +00005032 int szScratch; /* Size of scratch memory requested */
drhc3b70572003-01-04 19:44:07 +00005033 MemPage *apOld[NB]; /* pPage and up to two siblings */
5034 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
drh4b70f112004-05-02 21:12:19 +00005035 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
drha2fce642004-06-05 00:01:44 +00005036 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
5037 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
drh4b70f112004-05-02 21:12:19 +00005038 u8 *apDiv[NB]; /* Divider cells in pParent */
drha2fce642004-06-05 00:01:44 +00005039 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
5040 int szNew[NB+2]; /* Combined size of cells place on i-th page */
danielk197750f059b2005-03-29 02:54:03 +00005041 u8 **apCell = 0; /* All cells begin balanced */
drha9121e42008-02-19 14:59:35 +00005042 u16 *szCell; /* Local size of all cells in apCell[] */
drhe5ae5732008-06-15 02:51:47 +00005043 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
5044 u8 *aSpace1; /* Space for copies of dividers cells before balance */
5045 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
danielk1977ac11ee62005-01-15 12:45:51 +00005046 u8 *aFrom = 0;
drh8b2f49b2001-06-08 00:21:52 +00005047
danielk197771d5d2c2008-09-29 11:49:47 +00005048 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00005049 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drhf94a1732008-09-30 17:18:17 +00005050 VVA_ONLY( pCur->pagesShuffled = 1 );
drhd677b3d2007-08-20 22:48:41 +00005051
drh14acc042001-06-10 19:56:58 +00005052 /*
drh43605152004-05-29 21:46:49 +00005053 ** Find the parent page.
drh8b2f49b2001-06-08 00:21:52 +00005054 */
danielk197771d5d2c2008-09-29 11:49:47 +00005055 assert( pCur->iPage>0 );
5056 assert( pPage->isInit );
danielk19776e465eb2007-08-21 13:11:00 +00005057 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
drh4b70f112004-05-02 21:12:19 +00005058 pBt = pPage->pBt;
danielk197771d5d2c2008-09-29 11:49:47 +00005059 pParent = pCur->apPage[pCur->iPage-1];
drh43605152004-05-29 21:46:49 +00005060 assert( pParent );
danielk19773b8a05f2007-03-19 17:44:26 +00005061 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
danielk197707cb5602006-01-20 10:55:05 +00005062 return rc;
5063 }
danielk1977474b7cc2008-07-09 11:49:46 +00005064
drh43605152004-05-29 21:46:49 +00005065 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
drh2e38c322004-09-03 18:38:44 +00005066
drh615ae552005-01-16 23:21:00 +00005067#ifndef SQLITE_OMIT_QUICKBALANCE
drhf222e712005-01-14 22:55:49 +00005068 /*
5069 ** A special case: If a new entry has just been inserted into a
5070 ** table (that is, a btree with integer keys and all data at the leaves)
drh09d0deb2005-08-02 17:13:09 +00005071 ** and the new entry is the right-most entry in the tree (it has the
drhf222e712005-01-14 22:55:49 +00005072 ** largest key) then use the special balance_quick() routine for
5073 ** balancing. balance_quick() is much faster and results in a tighter
5074 ** packing of data in the common case.
5075 */
danielk1977ac245ec2005-01-14 13:50:11 +00005076 if( pPage->leaf &&
5077 pPage->intKey &&
danielk1977ac245ec2005-01-14 13:50:11 +00005078 pPage->nOverflow==1 &&
5079 pPage->aOvfl[0].idx==pPage->nCell &&
danielk197771d5d2c2008-09-29 11:49:47 +00005080 pParent->pgno!=1 &&
danielk1977ac245ec2005-01-14 13:50:11 +00005081 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
5082 ){
drh44845222008-07-17 18:39:57 +00005083 assert( pPage->intKey );
danielk1977ac11ee62005-01-15 12:45:51 +00005084 /*
5085 ** TODO: Check the siblings to the left of pPage. It may be that
5086 ** they are not full and no new page is required.
5087 */
danielk197771d5d2c2008-09-29 11:49:47 +00005088 return balance_quick(pCur);
danielk1977ac245ec2005-01-14 13:50:11 +00005089 }
5090#endif
5091
danielk19776e465eb2007-08-21 13:11:00 +00005092 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
5093 return rc;
5094 }
5095
drh2e38c322004-09-03 18:38:44 +00005096 /*
drh4b70f112004-05-02 21:12:19 +00005097 ** Find the cell in the parent page whose left child points back
drh14acc042001-06-10 19:56:58 +00005098 ** to pPage. The "idx" variable is the index of that cell. If pPage
5099 ** is the rightmost child of pParent then set idx to pParent->nCell
drh8b2f49b2001-06-08 00:21:52 +00005100 */
danielk1977bf93c562008-09-29 15:53:25 +00005101 idx = pCur->aiIdx[pCur->iPage-1];
5102 assertParentIndex(pParent, idx, pPage->pgno);
drh8b2f49b2001-06-08 00:21:52 +00005103
5104 /*
drh14acc042001-06-10 19:56:58 +00005105 ** Initialize variables so that it will be safe to jump
drh5edc3122001-09-13 21:53:09 +00005106 ** directly to balance_cleanup at any moment.
drh8b2f49b2001-06-08 00:21:52 +00005107 */
drh14acc042001-06-10 19:56:58 +00005108 nOld = nNew = 0;
drh14acc042001-06-10 19:56:58 +00005109
5110 /*
drh4b70f112004-05-02 21:12:19 +00005111 ** Find sibling pages to pPage and the cells in pParent that divide
drhc3b70572003-01-04 19:44:07 +00005112 ** the siblings. An attempt is made to find NN siblings on either
5113 ** side of pPage. More siblings are taken from one side, however, if
5114 ** there are fewer than NN siblings on the other side. If pParent
5115 ** has NB or fewer children then all children of pParent are taken.
drh14acc042001-06-10 19:56:58 +00005116 */
drhc3b70572003-01-04 19:44:07 +00005117 nxDiv = idx - NN;
5118 if( nxDiv + NB > pParent->nCell ){
5119 nxDiv = pParent->nCell - NB + 1;
drh8b2f49b2001-06-08 00:21:52 +00005120 }
drhc3b70572003-01-04 19:44:07 +00005121 if( nxDiv<0 ){
5122 nxDiv = 0;
5123 }
drh8b2f49b2001-06-08 00:21:52 +00005124 nDiv = 0;
drhc3b70572003-01-04 19:44:07 +00005125 for(i=0, k=nxDiv; i<NB; i++, k++){
drh14acc042001-06-10 19:56:58 +00005126 if( k<pParent->nCell ){
danielk19771cc5ed82007-05-16 17:28:43 +00005127 apDiv[i] = findCell(pParent, k);
drh8b2f49b2001-06-08 00:21:52 +00005128 nDiv++;
drha34b6762004-05-07 13:30:42 +00005129 assert( !pParent->leaf );
drh43605152004-05-29 21:46:49 +00005130 pgnoOld[i] = get4byte(apDiv[i]);
drh14acc042001-06-10 19:56:58 +00005131 }else if( k==pParent->nCell ){
drh43605152004-05-29 21:46:49 +00005132 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
drh14acc042001-06-10 19:56:58 +00005133 }else{
5134 break;
drh8b2f49b2001-06-08 00:21:52 +00005135 }
danielk197771d5d2c2008-09-29 11:49:47 +00005136 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
drh6019e162001-07-02 17:51:45 +00005137 if( rc ) goto balance_cleanup;
danielk197771d5d2c2008-09-29 11:49:47 +00005138 /* apOld[i]->idxParent = k; */
drh91025292004-05-03 19:49:32 +00005139 apCopy[i] = 0;
5140 assert( i==nOld );
drh14acc042001-06-10 19:56:58 +00005141 nOld++;
danielk1977634f2982005-03-28 08:44:07 +00005142 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
drh8b2f49b2001-06-08 00:21:52 +00005143 }
5144
drha9121e42008-02-19 14:59:35 +00005145 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
drh8d97f1f2005-05-05 18:14:13 +00005146 ** alignment */
drha9121e42008-02-19 14:59:35 +00005147 nMaxCells = (nMaxCells + 3)&~3;
drh8d97f1f2005-05-05 18:14:13 +00005148
drh8b2f49b2001-06-08 00:21:52 +00005149 /*
danielk1977634f2982005-03-28 08:44:07 +00005150 ** Allocate space for memory structures
5151 */
drhfacf0302008-06-17 15:12:00 +00005152 szScratch =
drha9121e42008-02-19 14:59:35 +00005153 nMaxCells*sizeof(u8*) /* apCell */
5154 + nMaxCells*sizeof(u16) /* szCell */
5155 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
drhe5ae5732008-06-15 02:51:47 +00005156 + pBt->pageSize /* aSpace1 */
drhfacf0302008-06-17 15:12:00 +00005157 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5158 apCell = sqlite3ScratchMalloc( szScratch );
danielk1977634f2982005-03-28 08:44:07 +00005159 if( apCell==0 ){
5160 rc = SQLITE_NOMEM;
5161 goto balance_cleanup;
5162 }
drha9121e42008-02-19 14:59:35 +00005163 szCell = (u16*)&apCell[nMaxCells];
danielk1977634f2982005-03-28 08:44:07 +00005164 aCopy[0] = (u8*)&szCell[nMaxCells];
drh66e80082008-12-16 13:46:29 +00005165 assert( ((aCopy[0] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005166 for(i=1; i<NB; i++){
drhc96d8532005-05-03 12:30:33 +00005167 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
drh66e80082008-12-16 13:46:29 +00005168 assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
danielk1977634f2982005-03-28 08:44:07 +00005169 }
drhe5ae5732008-06-15 02:51:47 +00005170 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
drh66e80082008-12-16 13:46:29 +00005171 assert( ((aSpace1 - (u8*)0) & 7)==0 ); /* 8-byte alignment required */
danielk197785d90ca2008-07-19 14:25:15 +00005172 if( ISAUTOVACUUM ){
drhe5ae5732008-06-15 02:51:47 +00005173 aFrom = &aSpace1[pBt->pageSize];
danielk1977634f2982005-03-28 08:44:07 +00005174 }
drhfacf0302008-06-17 15:12:00 +00005175 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
drhe5ae5732008-06-15 02:51:47 +00005176 if( aSpace2==0 ){
5177 rc = SQLITE_NOMEM;
5178 goto balance_cleanup;
5179 }
danielk1977634f2982005-03-28 08:44:07 +00005180
5181 /*
drh14acc042001-06-10 19:56:58 +00005182 ** Make copies of the content of pPage and its siblings into aOld[].
5183 ** The rest of this function will use data from the copies rather
5184 ** than the original pages since the original pages will be in the
5185 ** process of being overwritten.
5186 */
5187 for(i=0; i<nOld; i++){
drhbf4bca52007-09-06 22:19:14 +00005188 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5189 memcpy(p, apOld[i], sizeof(MemPage));
5190 p->aData = (void*)&p[1];
5191 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
drh14acc042001-06-10 19:56:58 +00005192 }
5193
5194 /*
5195 ** Load pointers to all cells on sibling pages and the divider cells
5196 ** into the local apCell[] array. Make copies of the divider cells
drhe5ae5732008-06-15 02:51:47 +00005197 ** into space obtained from aSpace1[] and remove the divider cells
drhb6f41482004-05-14 01:58:11 +00005198 ** from pParent.
drh4b70f112004-05-02 21:12:19 +00005199 **
5200 ** If the siblings are on leaf pages, then the child pointers of the
5201 ** divider cells are stripped from the cells before they are copied
drhe5ae5732008-06-15 02:51:47 +00005202 ** into aSpace1[]. In this way, all cells in apCell[] are without
drh4b70f112004-05-02 21:12:19 +00005203 ** child pointers. If siblings are not leaves, then all cell in
5204 ** apCell[] include child pointers. Either way, all cells in apCell[]
5205 ** are alike.
drh96f5b762004-05-16 16:24:36 +00005206 **
5207 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5208 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
drh8b2f49b2001-06-08 00:21:52 +00005209 */
5210 nCell = 0;
drh4b70f112004-05-02 21:12:19 +00005211 leafCorrection = pPage->leaf*4;
drh44845222008-07-17 18:39:57 +00005212 leafData = pPage->hasData;
drh8b2f49b2001-06-08 00:21:52 +00005213 for(i=0; i<nOld; i++){
drh4b70f112004-05-02 21:12:19 +00005214 MemPage *pOld = apCopy[i];
drh43605152004-05-29 21:46:49 +00005215 int limit = pOld->nCell+pOld->nOverflow;
5216 for(j=0; j<limit; j++){
danielk1977634f2982005-03-28 08:44:07 +00005217 assert( nCell<nMaxCells );
drh43605152004-05-29 21:46:49 +00005218 apCell[nCell] = findOverflowCell(pOld, j);
5219 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
danielk197785d90ca2008-07-19 14:25:15 +00005220 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005221 int a;
drhf49661a2008-12-10 16:45:50 +00005222 aFrom[nCell] = (u8)i; assert( i>=0 && i<6 );
danielk1977ac11ee62005-01-15 12:45:51 +00005223 for(a=0; a<pOld->nOverflow; a++){
5224 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5225 aFrom[nCell] = 0xFF;
5226 break;
5227 }
5228 }
5229 }
drh14acc042001-06-10 19:56:58 +00005230 nCell++;
drh8b2f49b2001-06-08 00:21:52 +00005231 }
5232 if( i<nOld-1 ){
drha9121e42008-02-19 14:59:35 +00005233 u16 sz = cellSizePtr(pParent, apDiv[i]);
drh8b18dd42004-05-12 19:18:15 +00005234 if( leafData ){
drh96f5b762004-05-16 16:24:36 +00005235 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5236 ** are duplicates of keys on the child pages. We need to remove
5237 ** the divider cells from pParent, but the dividers cells are not
5238 ** added to apCell[] because they are duplicates of child cells.
5239 */
drh8b18dd42004-05-12 19:18:15 +00005240 dropCell(pParent, nxDiv, sz);
drh4b70f112004-05-02 21:12:19 +00005241 }else{
drhb6f41482004-05-14 01:58:11 +00005242 u8 *pTemp;
danielk1977634f2982005-03-28 08:44:07 +00005243 assert( nCell<nMaxCells );
drhb6f41482004-05-14 01:58:11 +00005244 szCell[nCell] = sz;
drhe5ae5732008-06-15 02:51:47 +00005245 pTemp = &aSpace1[iSpace1];
5246 iSpace1 += sz;
5247 assert( sz<=pBt->pageSize/4 );
5248 assert( iSpace1<=pBt->pageSize );
drhb6f41482004-05-14 01:58:11 +00005249 memcpy(pTemp, apDiv[i], sz);
5250 apCell[nCell] = pTemp+leafCorrection;
danielk197785d90ca2008-07-19 14:25:15 +00005251 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005252 aFrom[nCell] = 0xFF;
5253 }
drhb6f41482004-05-14 01:58:11 +00005254 dropCell(pParent, nxDiv, sz);
drhf49661a2008-12-10 16:45:50 +00005255 assert( leafCorrection==0 || leafCorrection==4 );
5256 szCell[nCell] -= (u16)leafCorrection;
drh43605152004-05-29 21:46:49 +00005257 assert( get4byte(pTemp)==pgnoOld[i] );
drh8b18dd42004-05-12 19:18:15 +00005258 if( !pOld->leaf ){
5259 assert( leafCorrection==0 );
5260 /* The right pointer of the child page pOld becomes the left
5261 ** pointer of the divider cell */
drh43605152004-05-29 21:46:49 +00005262 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
drh8b18dd42004-05-12 19:18:15 +00005263 }else{
5264 assert( leafCorrection==4 );
danielk197739c96042007-05-12 10:41:47 +00005265 if( szCell[nCell]<4 ){
5266 /* Do not allow any cells smaller than 4 bytes. */
5267 szCell[nCell] = 4;
5268 }
drh8b18dd42004-05-12 19:18:15 +00005269 }
5270 nCell++;
drh4b70f112004-05-02 21:12:19 +00005271 }
drh8b2f49b2001-06-08 00:21:52 +00005272 }
5273 }
5274
5275 /*
drh6019e162001-07-02 17:51:45 +00005276 ** Figure out the number of pages needed to hold all nCell cells.
5277 ** Store this number in "k". Also compute szNew[] which is the total
5278 ** size of all cells on the i-th page and cntNew[] which is the index
drh4b70f112004-05-02 21:12:19 +00005279 ** in apCell[] of the cell that divides page i from page i+1.
drh6019e162001-07-02 17:51:45 +00005280 ** cntNew[k] should equal nCell.
5281 **
drh96f5b762004-05-16 16:24:36 +00005282 ** Values computed by this block:
5283 **
5284 ** k: The total number of sibling pages
5285 ** szNew[i]: Spaced used on the i-th sibling page.
5286 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5287 ** the right of the i-th sibling page.
5288 ** usableSpace: Number of bytes of space available on each sibling.
5289 **
drh8b2f49b2001-06-08 00:21:52 +00005290 */
drh43605152004-05-29 21:46:49 +00005291 usableSpace = pBt->usableSize - 12 + leafCorrection;
drh6019e162001-07-02 17:51:45 +00005292 for(subtotal=k=i=0; i<nCell; i++){
danielk1977634f2982005-03-28 08:44:07 +00005293 assert( i<nMaxCells );
drh43605152004-05-29 21:46:49 +00005294 subtotal += szCell[i] + 2;
drh4b70f112004-05-02 21:12:19 +00005295 if( subtotal > usableSpace ){
drh6019e162001-07-02 17:51:45 +00005296 szNew[k] = subtotal - szCell[i];
5297 cntNew[k] = i;
drh8b18dd42004-05-12 19:18:15 +00005298 if( leafData ){ i--; }
drh6019e162001-07-02 17:51:45 +00005299 subtotal = 0;
5300 k++;
5301 }
5302 }
5303 szNew[k] = subtotal;
5304 cntNew[k] = nCell;
5305 k++;
drh96f5b762004-05-16 16:24:36 +00005306
5307 /*
5308 ** The packing computed by the previous block is biased toward the siblings
5309 ** on the left side. The left siblings are always nearly full, while the
5310 ** right-most sibling might be nearly empty. This block of code attempts
5311 ** to adjust the packing of siblings to get a better balance.
5312 **
5313 ** This adjustment is more than an optimization. The packing above might
5314 ** be so out of balance as to be illegal. For example, the right-most
5315 ** sibling might be completely empty. This adjustment is not optional.
5316 */
drh6019e162001-07-02 17:51:45 +00005317 for(i=k-1; i>0; i--){
drh96f5b762004-05-16 16:24:36 +00005318 int szRight = szNew[i]; /* Size of sibling on the right */
5319 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5320 int r; /* Index of right-most cell in left sibling */
5321 int d; /* Index of first cell to the left of right sibling */
5322
5323 r = cntNew[i-1] - 1;
5324 d = r + 1 - leafData;
danielk1977634f2982005-03-28 08:44:07 +00005325 assert( d<nMaxCells );
5326 assert( r<nMaxCells );
drh43605152004-05-29 21:46:49 +00005327 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5328 szRight += szCell[d] + 2;
5329 szLeft -= szCell[r] + 2;
drh6019e162001-07-02 17:51:45 +00005330 cntNew[i-1]--;
drh96f5b762004-05-16 16:24:36 +00005331 r = cntNew[i-1] - 1;
5332 d = r + 1 - leafData;
drh6019e162001-07-02 17:51:45 +00005333 }
drh96f5b762004-05-16 16:24:36 +00005334 szNew[i] = szRight;
5335 szNew[i-1] = szLeft;
drh6019e162001-07-02 17:51:45 +00005336 }
drh09d0deb2005-08-02 17:13:09 +00005337
5338 /* Either we found one or more cells (cntNew[0]>0) or we are
5339 ** a virtual root page. A virtual root page is when the real root
5340 ** page is page 1 and we are the only child of that page.
5341 */
5342 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
drh8b2f49b2001-06-08 00:21:52 +00005343
5344 /*
drh6b308672002-07-08 02:16:37 +00005345 ** Allocate k new pages. Reuse old pages where possible.
drh8b2f49b2001-06-08 00:21:52 +00005346 */
drh4b70f112004-05-02 21:12:19 +00005347 assert( pPage->pgno>1 );
5348 pageFlags = pPage->aData[0];
drh14acc042001-06-10 19:56:58 +00005349 for(i=0; i<k; i++){
drhda200cc2004-05-09 11:51:38 +00005350 MemPage *pNew;
drh6b308672002-07-08 02:16:37 +00005351 if( i<nOld ){
drhda200cc2004-05-09 11:51:38 +00005352 pNew = apNew[i] = apOld[i];
drh6b308672002-07-08 02:16:37 +00005353 pgnoNew[i] = pgnoOld[i];
5354 apOld[i] = 0;
danielk19773b8a05f2007-03-19 17:44:26 +00005355 rc = sqlite3PagerWrite(pNew->pDbPage);
drhf5345442007-04-09 12:45:02 +00005356 nNew++;
danielk197728129562005-01-11 10:25:06 +00005357 if( rc ) goto balance_cleanup;
drh6b308672002-07-08 02:16:37 +00005358 }else{
drh7aa8f852006-03-28 00:24:44 +00005359 assert( i>0 );
drh4f0c5872007-03-26 22:05:01 +00005360 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
drh6b308672002-07-08 02:16:37 +00005361 if( rc ) goto balance_cleanup;
drhda200cc2004-05-09 11:51:38 +00005362 apNew[i] = pNew;
drhf5345442007-04-09 12:45:02 +00005363 nNew++;
drh6b308672002-07-08 02:16:37 +00005364 }
drh8b2f49b2001-06-08 00:21:52 +00005365 }
5366
danielk1977299b1872004-11-22 10:02:10 +00005367 /* Free any old pages that were not reused as new pages.
5368 */
5369 while( i<nOld ){
5370 rc = freePage(apOld[i]);
5371 if( rc ) goto balance_cleanup;
5372 releasePage(apOld[i]);
5373 apOld[i] = 0;
5374 i++;
5375 }
5376
drh8b2f49b2001-06-08 00:21:52 +00005377 /*
drhf9ffac92002-03-02 19:00:31 +00005378 ** Put the new pages in ascending order. This helps to
5379 ** keep entries in the disk file in order so that a scan
5380 ** of the table is a linear scan through the file. That
5381 ** in turn helps the operating system to deliver pages
5382 ** from the disk more rapidly.
5383 **
5384 ** An O(n^2) insertion sort algorithm is used, but since
drhc3b70572003-01-04 19:44:07 +00005385 ** n is never more than NB (a small constant), that should
5386 ** not be a problem.
drhf9ffac92002-03-02 19:00:31 +00005387 **
drhc3b70572003-01-04 19:44:07 +00005388 ** When NB==3, this one optimization makes the database
5389 ** about 25% faster for large insertions and deletions.
drhf9ffac92002-03-02 19:00:31 +00005390 */
5391 for(i=0; i<k-1; i++){
5392 int minV = pgnoNew[i];
5393 int minI = i;
5394 for(j=i+1; j<k; j++){
drh7d02cb72003-06-04 16:24:39 +00005395 if( pgnoNew[j]<(unsigned)minV ){
drhf9ffac92002-03-02 19:00:31 +00005396 minI = j;
5397 minV = pgnoNew[j];
5398 }
5399 }
5400 if( minI>i ){
5401 int t;
5402 MemPage *pT;
5403 t = pgnoNew[i];
5404 pT = apNew[i];
5405 pgnoNew[i] = pgnoNew[minI];
5406 apNew[i] = apNew[minI];
5407 pgnoNew[minI] = t;
5408 apNew[minI] = pT;
5409 }
5410 }
drha2fce642004-06-05 00:01:44 +00005411 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
drh24cd67e2004-05-10 16:18:47 +00005412 pgnoOld[0],
5413 nOld>=2 ? pgnoOld[1] : 0,
5414 nOld>=3 ? pgnoOld[2] : 0,
drh10c0fa62004-05-18 12:50:17 +00005415 pgnoNew[0], szNew[0],
5416 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5417 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
drha2fce642004-06-05 00:01:44 +00005418 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5419 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
drh24cd67e2004-05-10 16:18:47 +00005420
drhf9ffac92002-03-02 19:00:31 +00005421 /*
drh14acc042001-06-10 19:56:58 +00005422 ** Evenly distribute the data in apCell[] across the new pages.
5423 ** Insert divider cells into pParent as necessary.
5424 */
5425 j = 0;
5426 for(i=0; i<nNew; i++){
danielk1977ac11ee62005-01-15 12:45:51 +00005427 /* Assemble the new sibling page. */
drh14acc042001-06-10 19:56:58 +00005428 MemPage *pNew = apNew[i];
drh19642e52005-03-29 13:17:45 +00005429 assert( j<nMaxCells );
drh4b70f112004-05-02 21:12:19 +00005430 assert( pNew->pgno==pgnoNew[i] );
drh10131482008-07-11 03:34:09 +00005431 zeroPage(pNew, pageFlags);
drhfa1a98a2004-05-14 19:08:17 +00005432 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
drh09d0deb2005-08-02 17:13:09 +00005433 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
drh43605152004-05-29 21:46:49 +00005434 assert( pNew->nOverflow==0 );
danielk1977ac11ee62005-01-15 12:45:51 +00005435
danielk1977ac11ee62005-01-15 12:45:51 +00005436 /* If this is an auto-vacuum database, update the pointer map entries
5437 ** that point to the siblings that were rearranged. These can be: left
5438 ** children of cells, the right-child of the page, or overflow pages
5439 ** pointed to by cells.
5440 */
danielk197785d90ca2008-07-19 14:25:15 +00005441 if( ISAUTOVACUUM ){
danielk1977ac11ee62005-01-15 12:45:51 +00005442 for(k=j; k<cntNew[i]; k++){
danielk1977634f2982005-03-28 08:44:07 +00005443 assert( k<nMaxCells );
danielk1977ac11ee62005-01-15 12:45:51 +00005444 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
danielk197779a40da2005-01-16 08:00:01 +00005445 rc = ptrmapPutOvfl(pNew, k-j);
danielk197787c52b52008-07-19 11:49:07 +00005446 if( rc==SQLITE_OK && leafCorrection==0 ){
5447 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5448 }
danielk197779a40da2005-01-16 08:00:01 +00005449 if( rc!=SQLITE_OK ){
5450 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005451 }
5452 }
5453 }
5454 }
danielk1977ac11ee62005-01-15 12:45:51 +00005455
5456 j = cntNew[i];
5457
5458 /* If the sibling page assembled above was not the right-most sibling,
5459 ** insert a divider cell into the parent page.
5460 */
drh14acc042001-06-10 19:56:58 +00005461 if( i<nNew-1 && j<nCell ){
drh8b18dd42004-05-12 19:18:15 +00005462 u8 *pCell;
drh24cd67e2004-05-10 16:18:47 +00005463 u8 *pTemp;
drh8b18dd42004-05-12 19:18:15 +00005464 int sz;
danielk1977634f2982005-03-28 08:44:07 +00005465
5466 assert( j<nMaxCells );
drh8b18dd42004-05-12 19:18:15 +00005467 pCell = apCell[j];
5468 sz = szCell[j] + leafCorrection;
drhe5ae5732008-06-15 02:51:47 +00005469 pTemp = &aSpace2[iSpace2];
drh4b70f112004-05-02 21:12:19 +00005470 if( !pNew->leaf ){
drh43605152004-05-29 21:46:49 +00005471 memcpy(&pNew->aData[8], pCell, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005472 if( ISAUTOVACUUM
danielk197787c52b52008-07-19 11:49:07 +00005473 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5474 ){
5475 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5476 if( rc!=SQLITE_OK ){
5477 goto balance_cleanup;
5478 }
5479 }
drh8b18dd42004-05-12 19:18:15 +00005480 }else if( leafData ){
drhfd131da2007-08-07 17:13:03 +00005481 /* If the tree is a leaf-data tree, and the siblings are leaves,
danielk1977ac11ee62005-01-15 12:45:51 +00005482 ** then there is no divider cell in apCell[]. Instead, the divider
5483 ** cell consists of the integer key for the right-most cell of
5484 ** the sibling-page assembled above only.
5485 */
drh6f11bef2004-05-13 01:12:56 +00005486 CellInfo info;
drh8b18dd42004-05-12 19:18:15 +00005487 j--;
drh16a9b832007-05-05 18:39:25 +00005488 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
drhe5ae5732008-06-15 02:51:47 +00005489 pCell = pTemp;
drhb026e052007-05-02 01:34:31 +00005490 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
drh8b18dd42004-05-12 19:18:15 +00005491 pTemp = 0;
drh4b70f112004-05-02 21:12:19 +00005492 }else{
5493 pCell -= 4;
danielk19774aeff622007-05-12 09:30:47 +00005494 /* Obscure case for non-leaf-data trees: If the cell at pCell was
drh85b623f2007-12-13 21:54:09 +00005495 ** previously stored on a leaf node, and its reported size was 4
danielk19774aeff622007-05-12 09:30:47 +00005496 ** bytes, then it may actually be smaller than this
5497 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
drh85b623f2007-12-13 21:54:09 +00005498 ** any cell). But it is important to pass the correct size to
danielk19774aeff622007-05-12 09:30:47 +00005499 ** insertCell(), so reparse the cell now.
5500 **
5501 ** Note that this can never happen in an SQLite data file, as all
5502 ** cells are at least 4 bytes. It only happens in b-trees used
5503 ** to evaluate "IN (SELECT ...)" and similar clauses.
5504 */
5505 if( szCell[j]==4 ){
5506 assert(leafCorrection==4);
5507 sz = cellSizePtr(pParent, pCell);
5508 }
drh4b70f112004-05-02 21:12:19 +00005509 }
drhe5ae5732008-06-15 02:51:47 +00005510 iSpace2 += sz;
5511 assert( sz<=pBt->pageSize/4 );
5512 assert( iSpace2<=pBt->pageSize );
danielk1977a3ad5e72005-01-07 08:56:44 +00005513 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
danielk1977e80463b2004-11-03 03:01:16 +00005514 if( rc!=SQLITE_OK ) goto balance_cleanup;
drhc5053fb2008-11-27 02:22:10 +00005515 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
drh43605152004-05-29 21:46:49 +00005516 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
danielk197785d90ca2008-07-19 14:25:15 +00005517
danielk1977ac11ee62005-01-15 12:45:51 +00005518 /* If this is an auto-vacuum database, and not a leaf-data tree,
5519 ** then update the pointer map with an entry for the overflow page
5520 ** that the cell just inserted points to (if any).
5521 */
danielk197785d90ca2008-07-19 14:25:15 +00005522 if( ISAUTOVACUUM && !leafData ){
danielk197779a40da2005-01-16 08:00:01 +00005523 rc = ptrmapPutOvfl(pParent, nxDiv);
5524 if( rc!=SQLITE_OK ){
5525 goto balance_cleanup;
danielk1977ac11ee62005-01-15 12:45:51 +00005526 }
5527 }
drh14acc042001-06-10 19:56:58 +00005528 j++;
5529 nxDiv++;
5530 }
danielk197787c52b52008-07-19 11:49:07 +00005531
danielk197787c52b52008-07-19 11:49:07 +00005532 /* Set the pointer-map entry for the new sibling page. */
danielk197785d90ca2008-07-19 14:25:15 +00005533 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005534 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5535 if( rc!=SQLITE_OK ){
5536 goto balance_cleanup;
5537 }
5538 }
drh14acc042001-06-10 19:56:58 +00005539 }
drh6019e162001-07-02 17:51:45 +00005540 assert( j==nCell );
drh7aa8f852006-03-28 00:24:44 +00005541 assert( nOld>0 );
5542 assert( nNew>0 );
drh4b70f112004-05-02 21:12:19 +00005543 if( (pageFlags & PTF_LEAF)==0 ){
danielk197787c52b52008-07-19 11:49:07 +00005544 u8 *zChild = &apCopy[nOld-1]->aData[8];
5545 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
danielk197785d90ca2008-07-19 14:25:15 +00005546 if( ISAUTOVACUUM ){
danielk197787c52b52008-07-19 11:49:07 +00005547 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5548 if( rc!=SQLITE_OK ){
5549 goto balance_cleanup;
5550 }
5551 }
drh14acc042001-06-10 19:56:58 +00005552 }
drhc5053fb2008-11-27 02:22:10 +00005553 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
drh43605152004-05-29 21:46:49 +00005554 if( nxDiv==pParent->nCell+pParent->nOverflow ){
drh4b70f112004-05-02 21:12:19 +00005555 /* Right-most sibling is the right-most child of pParent */
drh43605152004-05-29 21:46:49 +00005556 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
drh4b70f112004-05-02 21:12:19 +00005557 }else{
5558 /* Right-most sibling is the left child of the first entry in pParent
5559 ** past the right-most divider entry */
drh43605152004-05-29 21:46:49 +00005560 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
drh14acc042001-06-10 19:56:58 +00005561 }
5562
5563 /*
drh3a4c1412004-05-09 20:40:11 +00005564 ** Balance the parent page. Note that the current page (pPage) might
danielk1977ac11ee62005-01-15 12:45:51 +00005565 ** have been added to the freelist so it might no longer be initialized.
drh3a4c1412004-05-09 20:40:11 +00005566 ** But the parent page will always be initialized.
drh8b2f49b2001-06-08 00:21:52 +00005567 */
danielk197771d5d2c2008-09-29 11:49:47 +00005568 assert( pParent->isInit );
drhfacf0302008-06-17 15:12:00 +00005569 sqlite3ScratchFree(apCell);
drhe5ae5732008-06-15 02:51:47 +00005570 apCell = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005571 releasePage(pPage);
5572 pCur->iPage--;
5573 rc = balance(pCur, 0);
drhda200cc2004-05-09 11:51:38 +00005574
drh8b2f49b2001-06-08 00:21:52 +00005575 /*
drh14acc042001-06-10 19:56:58 +00005576 ** Cleanup before returning.
drh8b2f49b2001-06-08 00:21:52 +00005577 */
drh14acc042001-06-10 19:56:58 +00005578balance_cleanup:
drhfacf0302008-06-17 15:12:00 +00005579 sqlite3PageFree(aSpace2);
5580 sqlite3ScratchFree(apCell);
drh8b2f49b2001-06-08 00:21:52 +00005581 for(i=0; i<nOld; i++){
drh91025292004-05-03 19:49:32 +00005582 releasePage(apOld[i]);
drh8b2f49b2001-06-08 00:21:52 +00005583 }
drh14acc042001-06-10 19:56:58 +00005584 for(i=0; i<nNew; i++){
drh91025292004-05-03 19:49:32 +00005585 releasePage(apNew[i]);
drh8b2f49b2001-06-08 00:21:52 +00005586 }
drh9bf9e9c2008-12-05 20:01:43 +00005587 pPage->nOverflow = 0;
danielk1977eaa06f62008-09-18 17:34:44 +00005588
danielk197771d5d2c2008-09-29 11:49:47 +00005589 /* releasePage(pParent); */
drh3a4c1412004-05-09 20:40:11 +00005590 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5591 pPage->pgno, nOld, nNew, nCell));
danielk1977eaa06f62008-09-18 17:34:44 +00005592
drh8b2f49b2001-06-08 00:21:52 +00005593 return rc;
5594}
5595
5596/*
drh43605152004-05-29 21:46:49 +00005597** This routine is called for the root page of a btree when the root
5598** page contains no cells. This is an opportunity to make the tree
5599** shallower by one level.
5600*/
danielk197771d5d2c2008-09-29 11:49:47 +00005601static int balance_shallower(BtCursor *pCur){
5602 MemPage *pPage; /* Root page of B-Tree */
drh43605152004-05-29 21:46:49 +00005603 MemPage *pChild; /* The only child page of pPage */
5604 Pgno pgnoChild; /* Page number for pChild */
drh2e38c322004-09-03 18:38:44 +00005605 int rc = SQLITE_OK; /* Return code from subprocedures */
danielk1977aef0bf62005-12-30 16:28:01 +00005606 BtShared *pBt; /* The main BTree structure */
drh2e38c322004-09-03 18:38:44 +00005607 int mxCellPerPage; /* Maximum number of cells per page */
5608 u8 **apCell; /* All cells from pages being balanced */
drha9121e42008-02-19 14:59:35 +00005609 u16 *szCell; /* Local size of all cells */
drh43605152004-05-29 21:46:49 +00005610
danielk197771d5d2c2008-09-29 11:49:47 +00005611 assert( pCur->iPage==0 );
5612 pPage = pCur->apPage[0];
5613
drh43605152004-05-29 21:46:49 +00005614 assert( pPage->nCell==0 );
drh1fee73e2007-08-29 04:00:57 +00005615 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
drh2e38c322004-09-03 18:38:44 +00005616 pBt = pPage->pBt;
5617 mxCellPerPage = MX_CELL(pBt);
drhe5ae5732008-06-15 02:51:47 +00005618 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
drh2e38c322004-09-03 18:38:44 +00005619 if( apCell==0 ) return SQLITE_NOMEM;
drha9121e42008-02-19 14:59:35 +00005620 szCell = (u16*)&apCell[mxCellPerPage];
drh43605152004-05-29 21:46:49 +00005621 if( pPage->leaf ){
5622 /* The table is completely empty */
5623 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5624 }else{
5625 /* The root page is empty but has one child. Transfer the
5626 ** information from that one child into the root page if it
5627 ** will fit. This reduces the depth of the tree by one.
5628 **
5629 ** If the root page is page 1, it has less space available than
5630 ** its child (due to the 100 byte header that occurs at the beginning
5631 ** of the database fle), so it might not be able to hold all of the
5632 ** information currently contained in the child. If this is the
5633 ** case, then do not do the transfer. Leave page 1 empty except
5634 ** for the right-pointer to the child page. The child page becomes
5635 ** the virtual root of the tree.
5636 */
drhf94a1732008-09-30 17:18:17 +00005637 VVA_ONLY( pCur->pagesShuffled = 1 );
drh43605152004-05-29 21:46:49 +00005638 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5639 assert( pgnoChild>0 );
danielk197789d40042008-11-17 14:20:56 +00005640 assert( pgnoChild<=pagerPagecount(pPage->pBt) );
drh16a9b832007-05-05 18:39:25 +00005641 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
drh2e38c322004-09-03 18:38:44 +00005642 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005643 if( pPage->pgno==1 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005644 rc = sqlite3BtreeInitPage(pChild);
drh2e38c322004-09-03 18:38:44 +00005645 if( rc ) goto end_shallow_balance;
drh43605152004-05-29 21:46:49 +00005646 assert( pChild->nOverflow==0 );
5647 if( pChild->nFree>=100 ){
5648 /* The child information will fit on the root page, so do the
5649 ** copy */
5650 int i;
5651 zeroPage(pPage, pChild->aData[0]);
5652 for(i=0; i<pChild->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00005653 apCell[i] = findCell(pChild,i);
drh43605152004-05-29 21:46:49 +00005654 szCell[i] = cellSizePtr(pChild, apCell[i]);
5655 }
5656 assemblePage(pPage, pChild->nCell, apCell, szCell);
danielk1977ae825582004-11-23 09:06:55 +00005657 /* Copy the right-pointer of the child to the parent. */
drhc5053fb2008-11-27 02:22:10 +00005658 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk1977ae825582004-11-23 09:06:55 +00005659 put4byte(&pPage->aData[pPage->hdrOffset+8],
5660 get4byte(&pChild->aData[pChild->hdrOffset+8]));
drh9bf9e9c2008-12-05 20:01:43 +00005661 rc = freePage(pChild);
drh43605152004-05-29 21:46:49 +00005662 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5663 }else{
5664 /* The child has more information that will fit on the root.
5665 ** The tree is already balanced. Do nothing. */
5666 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5667 }
5668 }else{
5669 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5670 pPage->isInit = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005671 rc = sqlite3BtreeInitPage(pPage);
drh43605152004-05-29 21:46:49 +00005672 assert( rc==SQLITE_OK );
5673 freePage(pChild);
5674 TRACE(("BALANCE: transfer child %d into root %d\n",
5675 pChild->pgno, pPage->pgno));
5676 }
danielk1977ac11ee62005-01-15 12:45:51 +00005677 assert( pPage->nOverflow==0 );
shane831c3292008-11-10 17:14:58 +00005678#ifndef SQLITE_OMIT_AUTOVACUUM
drh9bf9e9c2008-12-05 20:01:43 +00005679 if( ISAUTOVACUUM && rc==SQLITE_OK ){
danielk197700a696d2008-09-29 16:41:31 +00005680 rc = setChildPtrmaps(pPage);
danielk1977ac11ee62005-01-15 12:45:51 +00005681 }
shane831c3292008-11-10 17:14:58 +00005682#endif
drh43605152004-05-29 21:46:49 +00005683 releasePage(pChild);
5684 }
drh2e38c322004-09-03 18:38:44 +00005685end_shallow_balance:
drh17435752007-08-16 04:30:38 +00005686 sqlite3_free(apCell);
drh2e38c322004-09-03 18:38:44 +00005687 return rc;
drh43605152004-05-29 21:46:49 +00005688}
5689
5690
5691/*
5692** The root page is overfull
5693**
5694** When this happens, Create a new child page and copy the
5695** contents of the root into the child. Then make the root
5696** page an empty page with rightChild pointing to the new
5697** child. Finally, call balance_internal() on the new child
5698** to cause it to split.
5699*/
danielk197771d5d2c2008-09-29 11:49:47 +00005700static int balance_deeper(BtCursor *pCur){
drh43605152004-05-29 21:46:49 +00005701 int rc; /* Return value from subprocedures */
danielk197771d5d2c2008-09-29 11:49:47 +00005702 MemPage *pPage; /* Pointer to the root page */
drh43605152004-05-29 21:46:49 +00005703 MemPage *pChild; /* Pointer to a new child page */
5704 Pgno pgnoChild; /* Page number of the new child page */
danielk1977aef0bf62005-12-30 16:28:01 +00005705 BtShared *pBt; /* The BTree */
drh43605152004-05-29 21:46:49 +00005706 int usableSize; /* Total usable size of a page */
5707 u8 *data; /* Content of the parent page */
5708 u8 *cdata; /* Content of the child page */
5709 int hdr; /* Offset to page header in parent */
drh281b21d2008-08-22 12:57:08 +00005710 int cbrk; /* Offset to content of first cell in parent */
drh43605152004-05-29 21:46:49 +00005711
danielk197771d5d2c2008-09-29 11:49:47 +00005712 assert( pCur->iPage==0 );
5713 assert( pCur->apPage[0]->nOverflow>0 );
5714
drhf94a1732008-09-30 17:18:17 +00005715 VVA_ONLY( pCur->pagesShuffled = 1 );
danielk197771d5d2c2008-09-29 11:49:47 +00005716 pPage = pCur->apPage[0];
drh43605152004-05-29 21:46:49 +00005717 pBt = pPage->pBt;
drh1fee73e2007-08-29 04:00:57 +00005718 assert( sqlite3_mutex_held(pBt->mutex) );
drhc5053fb2008-11-27 02:22:10 +00005719 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
drh4f0c5872007-03-26 22:05:01 +00005720 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
drh43605152004-05-29 21:46:49 +00005721 if( rc ) return rc;
danielk19773b8a05f2007-03-19 17:44:26 +00005722 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
drh43605152004-05-29 21:46:49 +00005723 usableSize = pBt->usableSize;
5724 data = pPage->aData;
5725 hdr = pPage->hdrOffset;
drh281b21d2008-08-22 12:57:08 +00005726 cbrk = get2byte(&data[hdr+5]);
drh43605152004-05-29 21:46:49 +00005727 cdata = pChild->aData;
5728 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
drh281b21d2008-08-22 12:57:08 +00005729 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
danielk1977bc2ca9e2008-11-13 14:28:28 +00005730
5731 assert( pChild->isInit==0 );
danielk197771d5d2c2008-09-29 11:49:47 +00005732 rc = sqlite3BtreeInitPage(pChild);
5733 if( rc==SQLITE_OK ){
5734 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5735 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5736 pChild->nOverflow = pPage->nOverflow;
5737 if( pChild->nOverflow ){
5738 pChild->nFree = 0;
5739 }
5740 assert( pChild->nCell==pPage->nCell );
drhc5053fb2008-11-27 02:22:10 +00005741 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +00005742 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5743 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5744 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5745 if( ISAUTOVACUUM ){
danielk197771d5d2c2008-09-29 11:49:47 +00005746 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
shane831c3292008-11-10 17:14:58 +00005747#ifndef SQLITE_OMIT_AUTOVACUUM
danielk197771d5d2c2008-09-29 11:49:47 +00005748 if( rc==SQLITE_OK ){
danielk197700a696d2008-09-29 16:41:31 +00005749 rc = setChildPtrmaps(pChild);
danielk1977ac11ee62005-01-15 12:45:51 +00005750 }
shane831c3292008-11-10 17:14:58 +00005751#endif
danielk1977ac11ee62005-01-15 12:45:51 +00005752 }
danielk197787c52b52008-07-19 11:49:07 +00005753 }
danielk19776b456a22005-03-21 04:04:02 +00005754
danielk197771d5d2c2008-09-29 11:49:47 +00005755 if( rc==SQLITE_OK ){
5756 pCur->iPage++;
5757 pCur->apPage[1] = pChild;
danielk1977bf93c562008-09-29 15:53:25 +00005758 pCur->aiIdx[0] = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00005759 rc = balance_nonroot(pCur);
5760 }else{
5761 releasePage(pChild);
5762 }
5763
drh43605152004-05-29 21:46:49 +00005764 return rc;
5765}
5766
5767/*
danielk197771d5d2c2008-09-29 11:49:47 +00005768** The page that pCur currently points to has just been modified in
5769** some way. This function figures out if this modification means the
5770** tree needs to be balanced, and if so calls the appropriate balancing
5771** routine.
5772**
5773** Parameter isInsert is true if a new cell was just inserted into the
5774** page, or false otherwise.
drh43605152004-05-29 21:46:49 +00005775*/
danielk197771d5d2c2008-09-29 11:49:47 +00005776static int balance(BtCursor *pCur, int isInsert){
drh43605152004-05-29 21:46:49 +00005777 int rc = SQLITE_OK;
danielk197771d5d2c2008-09-29 11:49:47 +00005778 MemPage *pPage = pCur->apPage[pCur->iPage];
5779
drh1fee73e2007-08-29 04:00:57 +00005780 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
danielk197771d5d2c2008-09-29 11:49:47 +00005781 if( pCur->iPage==0 ){
danielk19776e465eb2007-08-21 13:11:00 +00005782 rc = sqlite3PagerWrite(pPage->pDbPage);
5783 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005784 rc = balance_deeper(pCur);
drh9bf9e9c2008-12-05 20:01:43 +00005785 assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
drh43605152004-05-29 21:46:49 +00005786 }
danielk1977687566d2004-11-02 12:56:41 +00005787 if( rc==SQLITE_OK && pPage->nCell==0 ){
danielk197771d5d2c2008-09-29 11:49:47 +00005788 rc = balance_shallower(pCur);
drh9bf9e9c2008-12-05 20:01:43 +00005789 assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
drh43605152004-05-29 21:46:49 +00005790 }
5791 }else{
danielk1977ac245ec2005-01-14 13:50:11 +00005792 if( pPage->nOverflow>0 ||
danielk197771d5d2c2008-09-29 11:49:47 +00005793 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5794 rc = balance_nonroot(pCur);
drh9bf9e9c2008-12-05 20:01:43 +00005795 assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
drh43605152004-05-29 21:46:49 +00005796 }
5797 }
5798 return rc;
5799}
5800
5801/*
drh8dcd7ca2004-08-08 19:43:29 +00005802** This routine checks all cursors that point to table pgnoRoot.
drh980b1a72006-08-16 16:42:48 +00005803** If any of those cursors were opened with wrFlag==0 in a different
5804** database connection (a database connection that shares the pager
5805** cache with the current connection) and that other connection
5806** is not in the ReadUncommmitted state, then this routine returns
5807** SQLITE_LOCKED.
danielk1977299b1872004-11-22 10:02:10 +00005808**
danielk19773588ceb2008-06-10 17:30:26 +00005809** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5810** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5811** blob cursors are used for both reading and writing.
5812**
5813** When pgnoRoot is the root page of an intkey table, this function is also
5814** responsible for invalidating incremental blob cursors when the table row
5815** on which they are opened is deleted or modified. Cursors are invalidated
5816** according to the following rules:
5817**
5818** 1) When BtreeClearTable() is called to completely delete the contents
5819** of a B-Tree table, pExclude is set to zero and parameter iRow is
5820** set to non-zero. In this case all incremental blob cursors open
5821** on the table rooted at pgnoRoot are invalidated.
5822**
5823** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5824** modify a table row via an SQL statement, pExclude is set to the
5825** write cursor used to do the modification and parameter iRow is set
5826** to the integer row id of the B-Tree entry being modified. Unless
5827** pExclude is itself an incremental blob cursor, then all incremental
5828** blob cursors open on row iRow of the B-Tree are invalidated.
5829**
5830** 3) If both pExclude and iRow are set to zero, no incremental blob
5831** cursors are invalidated.
drhf74b8d92002-09-01 23:20:45 +00005832*/
danielk19773588ceb2008-06-10 17:30:26 +00005833static int checkReadLocks(
5834 Btree *pBtree,
5835 Pgno pgnoRoot,
5836 BtCursor *pExclude,
5837 i64 iRow
5838){
danielk1977299b1872004-11-22 10:02:10 +00005839 BtCursor *p;
drh980b1a72006-08-16 16:42:48 +00005840 BtShared *pBt = pBtree->pBt;
drhe5fe6902007-12-07 18:55:28 +00005841 sqlite3 *db = pBtree->db;
drh1fee73e2007-08-29 04:00:57 +00005842 assert( sqlite3BtreeHoldsMutex(pBtree) );
danielk1977299b1872004-11-22 10:02:10 +00005843 for(p=pBt->pCursor; p; p=p->pNext){
drh980b1a72006-08-16 16:42:48 +00005844 if( p==pExclude ) continue;
drh980b1a72006-08-16 16:42:48 +00005845 if( p->pgnoRoot!=pgnoRoot ) continue;
danielk19773588ceb2008-06-10 17:30:26 +00005846#ifndef SQLITE_OMIT_INCRBLOB
5847 if( p->isIncrblobHandle && (
5848 (!pExclude && iRow)
5849 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5850 )){
5851 p->eState = CURSOR_INVALID;
5852 }
5853#endif
5854 if( p->eState!=CURSOR_VALID ) continue;
5855 if( p->wrFlag==0
5856#ifndef SQLITE_OMIT_INCRBLOB
5857 || p->isIncrblobHandle
5858#endif
5859 ){
drhe5fe6902007-12-07 18:55:28 +00005860 sqlite3 *dbOther = p->pBtree->db;
drh980b1a72006-08-16 16:42:48 +00005861 if( dbOther==0 ||
5862 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5863 return SQLITE_LOCKED;
5864 }
danielk1977299b1872004-11-22 10:02:10 +00005865 }
5866 }
drhf74b8d92002-09-01 23:20:45 +00005867 return SQLITE_OK;
5868}
5869
5870/*
drh3b7511c2001-05-26 13:15:44 +00005871** Insert a new record into the BTree. The key is given by (pKey,nKey)
5872** and the data is given by (pData,nData). The cursor is used only to
drh91025292004-05-03 19:49:32 +00005873** define what table the record should be inserted into. The cursor
drh4b70f112004-05-02 21:12:19 +00005874** is left pointing at a random location.
5875**
5876** For an INTKEY table, only the nKey value of the key is used. pKey is
5877** ignored. For a ZERODATA table, the pData and nData are both ignored.
drh3b7511c2001-05-26 13:15:44 +00005878*/
drh3aac2dd2004-04-26 14:10:20 +00005879int sqlite3BtreeInsert(
drh5c4d9702001-08-20 00:33:58 +00005880 BtCursor *pCur, /* Insert data into the table of this cursor */
drh4a1c3802004-05-12 15:15:47 +00005881 const void *pKey, i64 nKey, /* The key of the new record */
drhe4d90812007-03-29 05:51:49 +00005882 const void *pData, int nData, /* The data of the new record */
drhb026e052007-05-02 01:34:31 +00005883 int nZero, /* Number of extra 0 bytes to append to data */
drhe4d90812007-03-29 05:51:49 +00005884 int appendBias /* True if this is likely an append */
drh3b7511c2001-05-26 13:15:44 +00005885){
drh3b7511c2001-05-26 13:15:44 +00005886 int rc;
5887 int loc;
drh14acc042001-06-10 19:56:58 +00005888 int szNew;
danielk197771d5d2c2008-09-29 11:49:47 +00005889 int idx;
drh3b7511c2001-05-26 13:15:44 +00005890 MemPage *pPage;
drhd677b3d2007-08-20 22:48:41 +00005891 Btree *p = pCur->pBtree;
5892 BtShared *pBt = p->pBt;
drha34b6762004-05-07 13:30:42 +00005893 unsigned char *oldCell;
drh2e38c322004-09-03 18:38:44 +00005894 unsigned char *newCell = 0;
drh3b7511c2001-05-26 13:15:44 +00005895
drh1fee73e2007-08-29 04:00:57 +00005896 assert( cursorHoldsMutex(pCur) );
danielk1977aef0bf62005-12-30 16:28:01 +00005897 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005898 /* Must start a transaction before doing an insert */
drhd677b3d2007-08-20 22:48:41 +00005899 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005900 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005901 }
drhf74b8d92002-09-01 23:20:45 +00005902 assert( !pBt->readOnly );
drhecdc7532001-09-23 02:35:53 +00005903 if( !pCur->wrFlag ){
5904 return SQLITE_PERM; /* Cursor not open for writing */
5905 }
danielk19773588ceb2008-06-10 17:30:26 +00005906 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
drhf74b8d92002-09-01 23:20:45 +00005907 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5908 }
drhfb982642007-08-30 01:19:59 +00005909 if( pCur->eState==CURSOR_FAULT ){
5910 return pCur->skip;
5911 }
danielk1977da184232006-01-05 11:34:32 +00005912
5913 /* Save the positions of any other cursors open on this table */
danielk1977be51a652008-10-08 17:58:48 +00005914 sqlite3BtreeClearCursor(pCur);
danielk19772e94d4d2006-01-09 05:36:27 +00005915 if(
danielk19772e94d4d2006-01-09 05:36:27 +00005916 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
drhe63d9992008-08-13 19:11:48 +00005917 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
danielk19772e94d4d2006-01-09 05:36:27 +00005918 ){
danielk1977da184232006-01-05 11:34:32 +00005919 return rc;
5920 }
5921
danielk197771d5d2c2008-09-29 11:49:47 +00005922 pPage = pCur->apPage[pCur->iPage];
drh4a1c3802004-05-12 15:15:47 +00005923 assert( pPage->intKey || nKey>=0 );
drh44845222008-07-17 18:39:57 +00005924 assert( pPage->leaf || !pPage->intKey );
drh3a4c1412004-05-09 20:40:11 +00005925 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5926 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5927 loc==0 ? "overwrite" : "new entry"));
danielk197771d5d2c2008-09-29 11:49:47 +00005928 assert( pPage->isInit );
danielk197752ae7242008-03-25 14:24:56 +00005929 allocateTempSpace(pBt);
5930 newCell = pBt->pTmpSpace;
drh2e38c322004-09-03 18:38:44 +00005931 if( newCell==0 ) return SQLITE_NOMEM;
drhb026e052007-05-02 01:34:31 +00005932 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
drh2e38c322004-09-03 18:38:44 +00005933 if( rc ) goto end_insert;
drh43605152004-05-29 21:46:49 +00005934 assert( szNew==cellSizePtr(pPage, newCell) );
drh2e38c322004-09-03 18:38:44 +00005935 assert( szNew<=MX_CELL_SIZE(pBt) );
danielk197771d5d2c2008-09-29 11:49:47 +00005936 idx = pCur->aiIdx[pCur->iPage];
danielk1977da184232006-01-05 11:34:32 +00005937 if( loc==0 && CURSOR_VALID==pCur->eState ){
drha9121e42008-02-19 14:59:35 +00005938 u16 szOld;
danielk197771d5d2c2008-09-29 11:49:47 +00005939 assert( idx<pPage->nCell );
danielk19776e465eb2007-08-21 13:11:00 +00005940 rc = sqlite3PagerWrite(pPage->pDbPage);
5941 if( rc ){
5942 goto end_insert;
5943 }
danielk197771d5d2c2008-09-29 11:49:47 +00005944 oldCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00005945 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00005946 memcpy(newCell, oldCell, 4);
drh4b70f112004-05-02 21:12:19 +00005947 }
drh43605152004-05-29 21:46:49 +00005948 szOld = cellSizePtr(pPage, oldCell);
drh4b70f112004-05-02 21:12:19 +00005949 rc = clearCell(pPage, oldCell);
drh2e38c322004-09-03 18:38:44 +00005950 if( rc ) goto end_insert;
shane0af3f892008-11-12 04:55:34 +00005951 rc = dropCell(pPage, idx, szOld);
5952 if( rc!=SQLITE_OK ) {
5953 goto end_insert;
5954 }
drh7c717f72001-06-24 20:39:41 +00005955 }else if( loc<0 && pPage->nCell>0 ){
drh4b70f112004-05-02 21:12:19 +00005956 assert( pPage->leaf );
danielk197771d5d2c2008-09-29 11:49:47 +00005957 idx = ++pCur->aiIdx[pCur->iPage];
drh271efa52004-05-30 19:19:05 +00005958 pCur->info.nSize = 0;
drha2c20e42008-03-29 16:01:04 +00005959 pCur->validNKey = 0;
drh14acc042001-06-10 19:56:58 +00005960 }else{
drh4b70f112004-05-02 21:12:19 +00005961 assert( pPage->leaf );
drh3b7511c2001-05-26 13:15:44 +00005962 }
danielk197771d5d2c2008-09-29 11:49:47 +00005963 rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
drh9bf9e9c2008-12-05 20:01:43 +00005964 if( rc==SQLITE_OK ){
5965 rc = balance(pCur, 1);
5966 }
5967
5968 /* Must make sure nOverflow is reset to zero even if the balance()
5969 ** fails. Internal data structure corruption will result otherwise. */
5970 assert( pPage->nOverflow==0 || rc!=SQLITE_OK );
5971 pPage->nOverflow = 0;
5972
danielk1977299b1872004-11-22 10:02:10 +00005973 if( rc==SQLITE_OK ){
5974 moveToRoot(pCur);
5975 }
drh2e38c322004-09-03 18:38:44 +00005976end_insert:
drh5e2f8b92001-05-28 00:41:15 +00005977 return rc;
5978}
5979
5980/*
drh4b70f112004-05-02 21:12:19 +00005981** Delete the entry that the cursor is pointing to. The cursor
drhf94a1732008-09-30 17:18:17 +00005982** is left pointing at a arbitrary location.
drh3b7511c2001-05-26 13:15:44 +00005983*/
drh3aac2dd2004-04-26 14:10:20 +00005984int sqlite3BtreeDelete(BtCursor *pCur){
danielk197771d5d2c2008-09-29 11:49:47 +00005985 MemPage *pPage = pCur->apPage[pCur->iPage];
5986 int idx;
drh4b70f112004-05-02 21:12:19 +00005987 unsigned char *pCell;
drh5e2f8b92001-05-28 00:41:15 +00005988 int rc;
danielk1977cfe9a692004-06-16 12:00:29 +00005989 Pgno pgnoChild = 0;
drhd677b3d2007-08-20 22:48:41 +00005990 Btree *p = pCur->pBtree;
5991 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00005992
drh1fee73e2007-08-29 04:00:57 +00005993 assert( cursorHoldsMutex(pCur) );
danielk197771d5d2c2008-09-29 11:49:47 +00005994 assert( pPage->isInit );
danielk1977aef0bf62005-12-30 16:28:01 +00005995 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00005996 /* Must start a transaction before doing a delete */
drhd677b3d2007-08-20 22:48:41 +00005997 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drhd677b3d2007-08-20 22:48:41 +00005998 return rc;
drh8b2f49b2001-06-08 00:21:52 +00005999 }
drhf74b8d92002-09-01 23:20:45 +00006000 assert( !pBt->readOnly );
drhfb982642007-08-30 01:19:59 +00006001 if( pCur->eState==CURSOR_FAULT ){
6002 return pCur->skip;
6003 }
danielk197771d5d2c2008-09-29 11:49:47 +00006004 if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
drhbd03cae2001-06-02 02:40:57 +00006005 return SQLITE_ERROR; /* The cursor is not pointing to anything */
6006 }
drhecdc7532001-09-23 02:35:53 +00006007 if( !pCur->wrFlag ){
6008 return SQLITE_PERM; /* Did not open this cursor for writing */
6009 }
danielk19773588ceb2008-06-10 17:30:26 +00006010 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
drhf74b8d92002-09-01 23:20:45 +00006011 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
6012 }
danielk1977da184232006-01-05 11:34:32 +00006013
6014 /* Restore the current cursor position (a no-op if the cursor is not in
6015 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
danielk19773b8a05f2007-03-19 17:44:26 +00006016 ** open on the same table. Then call sqlite3PagerWrite() on the page
danielk1977da184232006-01-05 11:34:32 +00006017 ** that the entry will be deleted from.
6018 */
6019 if(
drha3460582008-07-11 21:02:53 +00006020 (rc = restoreCursorPosition(pCur))!=0 ||
drhd1167392006-01-23 13:00:35 +00006021 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
danielk19773b8a05f2007-03-19 17:44:26 +00006022 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
danielk1977da184232006-01-05 11:34:32 +00006023 ){
6024 return rc;
6025 }
danielk1977e6efa742004-11-10 11:55:10 +00006026
drh85b623f2007-12-13 21:54:09 +00006027 /* Locate the cell within its page and leave pCell pointing to the
danielk1977e6efa742004-11-10 11:55:10 +00006028 ** data. The clearCell() call frees any overflow pages associated with the
6029 ** cell. The cell itself is still intact.
6030 */
danielk197771d5d2c2008-09-29 11:49:47 +00006031 idx = pCur->aiIdx[pCur->iPage];
6032 pCell = findCell(pPage, idx);
drh4b70f112004-05-02 21:12:19 +00006033 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006034 pgnoChild = get4byte(pCell);
drh4b70f112004-05-02 21:12:19 +00006035 }
danielk197728129562005-01-11 10:25:06 +00006036 rc = clearCell(pPage, pCell);
drhd677b3d2007-08-20 22:48:41 +00006037 if( rc ){
drhd677b3d2007-08-20 22:48:41 +00006038 return rc;
6039 }
danielk1977e6efa742004-11-10 11:55:10 +00006040
drh4b70f112004-05-02 21:12:19 +00006041 if( !pPage->leaf ){
drh14acc042001-06-10 19:56:58 +00006042 /*
drh5e00f6c2001-09-13 13:46:56 +00006043 ** The entry we are about to delete is not a leaf so if we do not
drh9ca7d3b2001-06-28 11:50:21 +00006044 ** do something we will leave a hole on an internal page.
6045 ** We have to fill the hole by moving in a cell from a leaf. The
6046 ** next Cell after the one to be deleted is guaranteed to exist and
danielk1977299b1872004-11-22 10:02:10 +00006047 ** to be a leaf so we can use it.
drh5e2f8b92001-05-28 00:41:15 +00006048 */
drh14acc042001-06-10 19:56:58 +00006049 BtCursor leafCur;
drh1bd10f82008-12-10 21:19:56 +00006050 MemPage *pLeafPage = 0;
danielk197771d5d2c2008-09-29 11:49:47 +00006051
drh4b70f112004-05-02 21:12:19 +00006052 unsigned char *pNext;
danielk1977299b1872004-11-22 10:02:10 +00006053 int notUsed;
danielk19776b456a22005-03-21 04:04:02 +00006054 unsigned char *tempCell = 0;
drh44845222008-07-17 18:39:57 +00006055 assert( !pPage->intKey );
drh16a9b832007-05-05 18:39:25 +00006056 sqlite3BtreeGetTempCursor(pCur, &leafCur);
danielk1977299b1872004-11-22 10:02:10 +00006057 rc = sqlite3BtreeNext(&leafCur, &notUsed);
danielk19776b456a22005-03-21 04:04:02 +00006058 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00006059 assert( leafCur.aiIdx[leafCur.iPage]==0 );
danielk197771d5d2c2008-09-29 11:49:47 +00006060 pLeafPage = leafCur.apPage[leafCur.iPage];
danielk197771d5d2c2008-09-29 11:49:47 +00006061 rc = sqlite3PagerWrite(pLeafPage->pDbPage);
danielk19776b456a22005-03-21 04:04:02 +00006062 }
6063 if( rc==SQLITE_OK ){
danielk19772f78fc62008-09-30 09:31:45 +00006064 int leafCursorInvalid = 0;
drha9121e42008-02-19 14:59:35 +00006065 u16 szNext;
danielk19776b456a22005-03-21 04:04:02 +00006066 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
danielk197771d5d2c2008-09-29 11:49:47 +00006067 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
6068 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
danielk19772f78fc62008-09-30 09:31:45 +00006069 pNext = findCell(pLeafPage, 0);
danielk197771d5d2c2008-09-29 11:49:47 +00006070 szNext = cellSizePtr(pLeafPage, pNext);
danielk19776b456a22005-03-21 04:04:02 +00006071 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
danielk197752ae7242008-03-25 14:24:56 +00006072 allocateTempSpace(pBt);
6073 tempCell = pBt->pTmpSpace;
danielk19776b456a22005-03-21 04:04:02 +00006074 if( tempCell==0 ){
6075 rc = SQLITE_NOMEM;
6076 }
danielk19778ea1cfa2008-01-01 06:19:02 +00006077 if( rc==SQLITE_OK ){
danielk197771d5d2c2008-09-29 11:49:47 +00006078 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00006079 }
danielk19772f78fc62008-09-30 09:31:45 +00006080
drhf94a1732008-09-30 17:18:17 +00006081
6082 /* The "if" statement in the next code block is critical. The
6083 ** slightest error in that statement would allow SQLite to operate
6084 ** correctly most of the time but produce very rare failures. To
6085 ** guard against this, the following macros help to verify that
6086 ** the "if" statement is well tested.
6087 */
6088 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
6089 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6090 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
6091 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6092 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
6093 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6094 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
6095 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
6096 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
6097 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
6098
6099
danielk19772f78fc62008-09-30 09:31:45 +00006100 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
6101 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
6102 ){
drhf94a1732008-09-30 17:18:17 +00006103 /* This branch is taken if the internal node is now either overflowing
6104 ** or underfull and the leaf node will be underfull after the just cell
danielk19772f78fc62008-09-30 09:31:45 +00006105 ** copied to the internal node is deleted from it. This is a special
6106 ** case because the call to balance() to correct the internal node
6107 ** may change the tree structure and invalidate the contents of
6108 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
6109 ** used by the balance() required to correct the underfull leaf
6110 ** node.
6111 **
6112 ** The formula used in the expression above are based on facets of
6113 ** the SQLite file-format that do not change over time.
6114 */
drhf94a1732008-09-30 17:18:17 +00006115 testcase( pPage->nFree==pBt->usableSize*2/3+1 );
6116 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
danielk19772f78fc62008-09-30 09:31:45 +00006117 leafCursorInvalid = 1;
6118 }
6119
danielk19778ea1cfa2008-01-01 06:19:02 +00006120 if( rc==SQLITE_OK ){
drhc5053fb2008-11-27 02:22:10 +00006121 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
danielk197771d5d2c2008-09-29 11:49:47 +00006122 put4byte(findOverflowCell(pPage, idx), pgnoChild);
drhf94a1732008-09-30 17:18:17 +00006123 VVA_ONLY( pCur->pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00006124 rc = balance(pCur, 0);
danielk19778ea1cfa2008-01-01 06:19:02 +00006125 }
danielk19772f78fc62008-09-30 09:31:45 +00006126
6127 if( rc==SQLITE_OK && leafCursorInvalid ){
6128 /* The leaf-node is now underfull and so the tree needs to be
6129 ** rebalanced. However, the balance() operation on the internal
6130 ** node above may have modified the structure of the B-Tree and
6131 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
6132 ** may not be trusted.
6133 **
6134 ** It is not possible to copy the ancestry from pCur, as the same
6135 ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
6136 ** arrays.
drh7b682802008-09-30 14:06:28 +00006137 **
6138 ** The call to saveCursorPosition() below internally saves the
6139 ** key that leafCur is currently pointing to. Currently, there
6140 ** are two copies of that key in the tree - one here on the leaf
6141 ** page and one on some internal node in the tree. The copy on
6142 ** the leaf node is always the next key in tree-order after the
6143 ** copy on the internal node. So, the call to sqlite3BtreeNext()
6144 ** calls restoreCursorPosition() to point the cursor to the copy
6145 ** stored on the internal node, then advances to the next entry,
6146 ** which happens to be the copy of the key on the internal node.
danielk1977a69fda22008-09-30 16:48:10 +00006147 ** Net effect: leafCur is pointing back to the duplicate cell
6148 ** that needs to be removed, and the leafCur.apPage[] and
6149 ** leafCur.aiIdx[] arrays are correct.
danielk19772f78fc62008-09-30 09:31:45 +00006150 */
drhf94a1732008-09-30 17:18:17 +00006151 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
danielk19772f78fc62008-09-30 09:31:45 +00006152 rc = saveCursorPosition(&leafCur);
6153 if( rc==SQLITE_OK ){
6154 rc = sqlite3BtreeNext(&leafCur, &notUsed);
6155 }
6156 pLeafPage = leafCur.apPage[leafCur.iPage];
6157 assert( pLeafPage->pgno==leafPgno );
6158 assert( leafCur.aiIdx[leafCur.iPage]==0 );
6159 }
6160
danielk19770cd1bbd2008-11-26 07:25:52 +00006161 if( SQLITE_OK==rc
6162 && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage))
6163 ){
danielk19772f78fc62008-09-30 09:31:45 +00006164 dropCell(pLeafPage, 0, szNext);
drhf94a1732008-09-30 17:18:17 +00006165 VVA_ONLY( leafCur.pagesShuffled = 0 );
danielk197771d5d2c2008-09-29 11:49:47 +00006166 rc = balance(&leafCur, 0);
drhf94a1732008-09-30 17:18:17 +00006167 assert( leafCursorInvalid || !leafCur.pagesShuffled
6168 || !pCur->pagesShuffled );
danielk19778ea1cfa2008-01-01 06:19:02 +00006169 }
danielk19776b456a22005-03-21 04:04:02 +00006170 }
drh16a9b832007-05-05 18:39:25 +00006171 sqlite3BtreeReleaseTempCursor(&leafCur);
drh5e2f8b92001-05-28 00:41:15 +00006172 }else{
danielk1977299b1872004-11-22 10:02:10 +00006173 TRACE(("DELETE: table=%d delete from leaf %d\n",
6174 pCur->pgnoRoot, pPage->pgno));
shanedcc50b72008-11-13 18:29:50 +00006175 rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6176 if( rc==SQLITE_OK ){
6177 rc = balance(pCur, 0);
6178 }
drh5e2f8b92001-05-28 00:41:15 +00006179 }
danielk19776b456a22005-03-21 04:04:02 +00006180 if( rc==SQLITE_OK ){
6181 moveToRoot(pCur);
6182 }
drh5e2f8b92001-05-28 00:41:15 +00006183 return rc;
drh3b7511c2001-05-26 13:15:44 +00006184}
drh8b2f49b2001-06-08 00:21:52 +00006185
6186/*
drhc6b52df2002-01-04 03:09:29 +00006187** Create a new BTree table. Write into *piTable the page
6188** number for the root page of the new table.
6189**
drhab01f612004-05-22 02:55:23 +00006190** The type of table is determined by the flags parameter. Only the
6191** following values of flags are currently in use. Other values for
6192** flags might not work:
6193**
6194** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6195** BTREE_ZERODATA Used for SQL indices
drh8b2f49b2001-06-08 00:21:52 +00006196*/
drhd677b3d2007-08-20 22:48:41 +00006197static int btreeCreateTable(Btree *p, int *piTable, int flags){
danielk1977aef0bf62005-12-30 16:28:01 +00006198 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006199 MemPage *pRoot;
6200 Pgno pgnoRoot;
6201 int rc;
drhd677b3d2007-08-20 22:48:41 +00006202
drh1fee73e2007-08-29 04:00:57 +00006203 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006204 if( pBt->inTransaction!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006205 /* Must start a transaction first */
drhd677b3d2007-08-20 22:48:41 +00006206 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6207 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006208 }
danielk197728129562005-01-11 10:25:06 +00006209 assert( !pBt->readOnly );
danielk1977e6efa742004-11-10 11:55:10 +00006210
danielk1977003ba062004-11-04 02:57:33 +00006211#ifdef SQLITE_OMIT_AUTOVACUUM
drh4f0c5872007-03-26 22:05:01 +00006212 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
drhd677b3d2007-08-20 22:48:41 +00006213 if( rc ){
6214 return rc;
6215 }
danielk1977003ba062004-11-04 02:57:33 +00006216#else
danielk1977687566d2004-11-02 12:56:41 +00006217 if( pBt->autoVacuum ){
danielk1977003ba062004-11-04 02:57:33 +00006218 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6219 MemPage *pPageMove; /* The page to move to. */
6220
danielk197720713f32007-05-03 11:43:33 +00006221 /* Creating a new table may probably require moving an existing database
6222 ** to make room for the new tables root page. In case this page turns
6223 ** out to be an overflow page, delete all overflow page-map caches
6224 ** held by open cursors.
6225 */
danielk197792d4d7a2007-05-04 12:05:56 +00006226 invalidateAllOverflowCache(pBt);
danielk197720713f32007-05-03 11:43:33 +00006227
danielk1977003ba062004-11-04 02:57:33 +00006228 /* Read the value of meta[3] from the database to determine where the
6229 ** root page of the new table should go. meta[3] is the largest root-page
6230 ** created so far, so the new root-page is (meta[3]+1).
6231 */
danielk1977aef0bf62005-12-30 16:28:01 +00006232 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
drhd677b3d2007-08-20 22:48:41 +00006233 if( rc!=SQLITE_OK ){
6234 return rc;
6235 }
danielk1977003ba062004-11-04 02:57:33 +00006236 pgnoRoot++;
6237
danielk1977599fcba2004-11-08 07:13:13 +00006238 /* The new root-page may not be allocated on a pointer-map page, or the
6239 ** PENDING_BYTE page.
6240 */
drh72190432008-01-31 14:54:43 +00006241 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
danielk1977599fcba2004-11-08 07:13:13 +00006242 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
danielk1977003ba062004-11-04 02:57:33 +00006243 pgnoRoot++;
6244 }
6245 assert( pgnoRoot>=3 );
6246
6247 /* Allocate a page. The page that currently resides at pgnoRoot will
6248 ** be moved to the allocated page (unless the allocated page happens
6249 ** to reside at pgnoRoot).
6250 */
drh4f0c5872007-03-26 22:05:01 +00006251 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
danielk1977003ba062004-11-04 02:57:33 +00006252 if( rc!=SQLITE_OK ){
danielk1977687566d2004-11-02 12:56:41 +00006253 return rc;
6254 }
danielk1977003ba062004-11-04 02:57:33 +00006255
6256 if( pgnoMove!=pgnoRoot ){
danielk1977f35843b2007-04-07 15:03:17 +00006257 /* pgnoRoot is the page that will be used for the root-page of
6258 ** the new table (assuming an error did not occur). But we were
6259 ** allocated pgnoMove. If required (i.e. if it was not allocated
6260 ** by extending the file), the current page at position pgnoMove
6261 ** is already journaled.
6262 */
danielk1977003ba062004-11-04 02:57:33 +00006263 u8 eType;
6264 Pgno iPtrPage;
6265
6266 releasePage(pPageMove);
danielk1977f35843b2007-04-07 15:03:17 +00006267
6268 /* Move the page currently at pgnoRoot to pgnoMove. */
drh16a9b832007-05-05 18:39:25 +00006269 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006270 if( rc!=SQLITE_OK ){
6271 return rc;
6272 }
6273 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
drhccae6022005-02-26 17:31:26 +00006274 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
danielk1977003ba062004-11-04 02:57:33 +00006275 releasePage(pRoot);
6276 return rc;
6277 }
drhccae6022005-02-26 17:31:26 +00006278 assert( eType!=PTRMAP_ROOTPAGE );
6279 assert( eType!=PTRMAP_FREEPAGE );
danielk19773b8a05f2007-03-19 17:44:26 +00006280 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk19775fd057a2005-03-09 13:09:43 +00006281 if( rc!=SQLITE_OK ){
6282 releasePage(pRoot);
6283 return rc;
6284 }
danielk19774c999992008-07-16 18:17:55 +00006285 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
danielk1977003ba062004-11-04 02:57:33 +00006286 releasePage(pRoot);
danielk1977f35843b2007-04-07 15:03:17 +00006287
6288 /* Obtain the page at pgnoRoot */
danielk1977003ba062004-11-04 02:57:33 +00006289 if( rc!=SQLITE_OK ){
6290 return rc;
6291 }
drh16a9b832007-05-05 18:39:25 +00006292 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
danielk1977003ba062004-11-04 02:57:33 +00006293 if( rc!=SQLITE_OK ){
6294 return rc;
6295 }
danielk19773b8a05f2007-03-19 17:44:26 +00006296 rc = sqlite3PagerWrite(pRoot->pDbPage);
danielk1977003ba062004-11-04 02:57:33 +00006297 if( rc!=SQLITE_OK ){
6298 releasePage(pRoot);
6299 return rc;
6300 }
6301 }else{
6302 pRoot = pPageMove;
6303 }
6304
danielk197742741be2005-01-08 12:42:39 +00006305 /* Update the pointer-map and meta-data with the new root-page number. */
danielk1977003ba062004-11-04 02:57:33 +00006306 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6307 if( rc ){
6308 releasePage(pRoot);
6309 return rc;
6310 }
danielk1977aef0bf62005-12-30 16:28:01 +00006311 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
danielk1977003ba062004-11-04 02:57:33 +00006312 if( rc ){
6313 releasePage(pRoot);
6314 return rc;
6315 }
danielk197742741be2005-01-08 12:42:39 +00006316
danielk1977003ba062004-11-04 02:57:33 +00006317 }else{
drh4f0c5872007-03-26 22:05:01 +00006318 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
danielk1977003ba062004-11-04 02:57:33 +00006319 if( rc ) return rc;
danielk1977687566d2004-11-02 12:56:41 +00006320 }
6321#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006322 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
drhde647132004-05-07 17:57:49 +00006323 zeroPage(pRoot, flags | PTF_LEAF);
danielk19773b8a05f2007-03-19 17:44:26 +00006324 sqlite3PagerUnref(pRoot->pDbPage);
drh8b2f49b2001-06-08 00:21:52 +00006325 *piTable = (int)pgnoRoot;
6326 return SQLITE_OK;
6327}
drhd677b3d2007-08-20 22:48:41 +00006328int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6329 int rc;
6330 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006331 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006332 rc = btreeCreateTable(p, piTable, flags);
6333 sqlite3BtreeLeave(p);
6334 return rc;
6335}
drh8b2f49b2001-06-08 00:21:52 +00006336
6337/*
6338** Erase the given database page and all its children. Return
6339** the page to the freelist.
6340*/
drh4b70f112004-05-02 21:12:19 +00006341static int clearDatabasePage(
danielk1977aef0bf62005-12-30 16:28:01 +00006342 BtShared *pBt, /* The BTree that contains the table */
drh4b70f112004-05-02 21:12:19 +00006343 Pgno pgno, /* Page number to clear */
danielk1977c7af4842008-10-27 13:59:33 +00006344 int freePageFlag, /* Deallocate page if true */
6345 int *pnChange
drh4b70f112004-05-02 21:12:19 +00006346){
danielk19776b456a22005-03-21 04:04:02 +00006347 MemPage *pPage = 0;
drh8b2f49b2001-06-08 00:21:52 +00006348 int rc;
drh4b70f112004-05-02 21:12:19 +00006349 unsigned char *pCell;
6350 int i;
drh8b2f49b2001-06-08 00:21:52 +00006351
drh1fee73e2007-08-29 04:00:57 +00006352 assert( sqlite3_mutex_held(pBt->mutex) );
danielk197789d40042008-11-17 14:20:56 +00006353 if( pgno>pagerPagecount(pBt) ){
drh49285702005-09-17 15:20:26 +00006354 return SQLITE_CORRUPT_BKPT;
danielk1977a1cb1832005-02-12 08:59:55 +00006355 }
6356
danielk197771d5d2c2008-09-29 11:49:47 +00006357 rc = getAndInitPage(pBt, pgno, &pPage);
danielk19776b456a22005-03-21 04:04:02 +00006358 if( rc ) goto cleardatabasepage_out;
drh4b70f112004-05-02 21:12:19 +00006359 for(i=0; i<pPage->nCell; i++){
danielk19771cc5ed82007-05-16 17:28:43 +00006360 pCell = findCell(pPage, i);
drh4b70f112004-05-02 21:12:19 +00006361 if( !pPage->leaf ){
danielk197762c14b32008-11-19 09:05:26 +00006362 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006363 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006364 }
drh4b70f112004-05-02 21:12:19 +00006365 rc = clearCell(pPage, pCell);
danielk19776b456a22005-03-21 04:04:02 +00006366 if( rc ) goto cleardatabasepage_out;
drh8b2f49b2001-06-08 00:21:52 +00006367 }
drha34b6762004-05-07 13:30:42 +00006368 if( !pPage->leaf ){
danielk197762c14b32008-11-19 09:05:26 +00006369 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
danielk19776b456a22005-03-21 04:04:02 +00006370 if( rc ) goto cleardatabasepage_out;
danielk1977c7af4842008-10-27 13:59:33 +00006371 }else if( pnChange ){
6372 assert( pPage->intKey );
6373 *pnChange += pPage->nCell;
drh2aa679f2001-06-25 02:11:07 +00006374 }
6375 if( freePageFlag ){
drh4b70f112004-05-02 21:12:19 +00006376 rc = freePage(pPage);
danielk19773b8a05f2007-03-19 17:44:26 +00006377 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
drh3a4c1412004-05-09 20:40:11 +00006378 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
drh2aa679f2001-06-25 02:11:07 +00006379 }
danielk19776b456a22005-03-21 04:04:02 +00006380
6381cleardatabasepage_out:
drh4b70f112004-05-02 21:12:19 +00006382 releasePage(pPage);
drh2aa679f2001-06-25 02:11:07 +00006383 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006384}
6385
6386/*
drhab01f612004-05-22 02:55:23 +00006387** Delete all information from a single table in the database. iTable is
6388** the page number of the root of the table. After this routine returns,
6389** the root page is empty, but still exists.
6390**
6391** This routine will fail with SQLITE_LOCKED if there are any open
6392** read cursors on the table. Open write cursors are moved to the
6393** root of the table.
danielk1977c7af4842008-10-27 13:59:33 +00006394**
6395** If pnChange is not NULL, then table iTable must be an intkey table. The
6396** integer value pointed to by pnChange is incremented by the number of
6397** entries in the table.
drh8b2f49b2001-06-08 00:21:52 +00006398*/
danielk1977c7af4842008-10-27 13:59:33 +00006399int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
drh8b2f49b2001-06-08 00:21:52 +00006400 int rc;
danielk1977aef0bf62005-12-30 16:28:01 +00006401 BtShared *pBt = p->pBt;
drhd677b3d2007-08-20 22:48:41 +00006402 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006403 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006404 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006405 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
danielk19773588ceb2008-06-10 17:30:26 +00006406 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006407 /* nothing to do */
6408 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6409 /* nothing to do */
6410 }else{
danielk197762c14b32008-11-19 09:05:26 +00006411 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
drh8b2f49b2001-06-08 00:21:52 +00006412 }
drhd677b3d2007-08-20 22:48:41 +00006413 sqlite3BtreeLeave(p);
6414 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006415}
6416
6417/*
6418** Erase all information in a table and add the root of the table to
6419** the freelist. Except, the root of the principle table (the one on
drhab01f612004-05-22 02:55:23 +00006420** page 1) is never added to the freelist.
6421**
6422** This routine will fail with SQLITE_LOCKED if there are any open
6423** cursors on the table.
drh205f48e2004-11-05 00:43:11 +00006424**
6425** If AUTOVACUUM is enabled and the page at iTable is not the last
6426** root page in the database file, then the last root page
6427** in the database file is moved into the slot formerly occupied by
6428** iTable and that last slot formerly occupied by the last root page
6429** is added to the freelist instead of iTable. In this say, all
6430** root pages are kept at the beginning of the database file, which
6431** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6432** page number that used to be the last root page in the file before
6433** the move. If no page gets moved, *piMoved is set to 0.
6434** The last root page is recorded in meta[3] and the value of
6435** meta[3] is updated by this procedure.
drh8b2f49b2001-06-08 00:21:52 +00006436*/
danielk197789d40042008-11-17 14:20:56 +00006437static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
drh8b2f49b2001-06-08 00:21:52 +00006438 int rc;
danielk1977a0bf2652004-11-04 14:30:04 +00006439 MemPage *pPage = 0;
danielk1977aef0bf62005-12-30 16:28:01 +00006440 BtShared *pBt = p->pBt;
danielk1977a0bf2652004-11-04 14:30:04 +00006441
drh1fee73e2007-08-29 04:00:57 +00006442 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00006443 if( p->inTrans!=TRANS_WRITE ){
drhf74b8d92002-09-01 23:20:45 +00006444 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
drh8b2f49b2001-06-08 00:21:52 +00006445 }
danielk1977a0bf2652004-11-04 14:30:04 +00006446
danielk1977e6efa742004-11-10 11:55:10 +00006447 /* It is illegal to drop a table if any cursors are open on the
6448 ** database. This is because in auto-vacuum mode the backend may
6449 ** need to move another root-page to fill a gap left by the deleted
6450 ** root page. If an open cursor was using this page a problem would
6451 ** occur.
6452 */
6453 if( pBt->pCursor ){
6454 return SQLITE_LOCKED;
drh5df72a52002-06-06 23:16:05 +00006455 }
danielk1977a0bf2652004-11-04 14:30:04 +00006456
drh16a9b832007-05-05 18:39:25 +00006457 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
drh2aa679f2001-06-25 02:11:07 +00006458 if( rc ) return rc;
danielk1977c7af4842008-10-27 13:59:33 +00006459 rc = sqlite3BtreeClearTable(p, iTable, 0);
danielk19776b456a22005-03-21 04:04:02 +00006460 if( rc ){
6461 releasePage(pPage);
6462 return rc;
6463 }
danielk1977a0bf2652004-11-04 14:30:04 +00006464
drh205f48e2004-11-05 00:43:11 +00006465 *piMoved = 0;
danielk1977a0bf2652004-11-04 14:30:04 +00006466
drh4b70f112004-05-02 21:12:19 +00006467 if( iTable>1 ){
danielk1977a0bf2652004-11-04 14:30:04 +00006468#ifdef SQLITE_OMIT_AUTOVACUUM
drha34b6762004-05-07 13:30:42 +00006469 rc = freePage(pPage);
danielk1977a0bf2652004-11-04 14:30:04 +00006470 releasePage(pPage);
6471#else
6472 if( pBt->autoVacuum ){
6473 Pgno maxRootPgno;
danielk1977aef0bf62005-12-30 16:28:01 +00006474 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006475 if( rc!=SQLITE_OK ){
6476 releasePage(pPage);
6477 return rc;
6478 }
6479
6480 if( iTable==maxRootPgno ){
6481 /* If the table being dropped is the table with the largest root-page
6482 ** number in the database, put the root page on the free list.
6483 */
6484 rc = freePage(pPage);
6485 releasePage(pPage);
6486 if( rc!=SQLITE_OK ){
6487 return rc;
6488 }
6489 }else{
6490 /* The table being dropped does not have the largest root-page
6491 ** number in the database. So move the page that does into the
6492 ** gap left by the deleted root-page.
6493 */
6494 MemPage *pMove;
6495 releasePage(pPage);
drh16a9b832007-05-05 18:39:25 +00006496 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006497 if( rc!=SQLITE_OK ){
6498 return rc;
6499 }
danielk19774c999992008-07-16 18:17:55 +00006500 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006501 releasePage(pMove);
6502 if( rc!=SQLITE_OK ){
6503 return rc;
6504 }
drh16a9b832007-05-05 18:39:25 +00006505 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
danielk1977a0bf2652004-11-04 14:30:04 +00006506 if( rc!=SQLITE_OK ){
6507 return rc;
6508 }
6509 rc = freePage(pMove);
6510 releasePage(pMove);
6511 if( rc!=SQLITE_OK ){
6512 return rc;
6513 }
6514 *piMoved = maxRootPgno;
6515 }
6516
danielk1977599fcba2004-11-08 07:13:13 +00006517 /* Set the new 'max-root-page' value in the database header. This
6518 ** is the old value less one, less one more if that happens to
6519 ** be a root-page number, less one again if that is the
6520 ** PENDING_BYTE_PAGE.
6521 */
danielk197787a6e732004-11-05 12:58:25 +00006522 maxRootPgno--;
danielk1977599fcba2004-11-08 07:13:13 +00006523 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6524 maxRootPgno--;
6525 }
danielk1977266664d2006-02-10 08:24:21 +00006526 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
danielk197787a6e732004-11-05 12:58:25 +00006527 maxRootPgno--;
6528 }
danielk1977599fcba2004-11-08 07:13:13 +00006529 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6530
danielk1977aef0bf62005-12-30 16:28:01 +00006531 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
danielk1977a0bf2652004-11-04 14:30:04 +00006532 }else{
6533 rc = freePage(pPage);
6534 releasePage(pPage);
6535 }
6536#endif
drh2aa679f2001-06-25 02:11:07 +00006537 }else{
danielk1977a0bf2652004-11-04 14:30:04 +00006538 /* If sqlite3BtreeDropTable was called on page 1. */
drha34b6762004-05-07 13:30:42 +00006539 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
danielk1977a0bf2652004-11-04 14:30:04 +00006540 releasePage(pPage);
drh8b2f49b2001-06-08 00:21:52 +00006541 }
drh8b2f49b2001-06-08 00:21:52 +00006542 return rc;
6543}
drhd677b3d2007-08-20 22:48:41 +00006544int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6545 int rc;
6546 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006547 p->pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006548 rc = btreeDropTable(p, iTable, piMoved);
6549 sqlite3BtreeLeave(p);
6550 return rc;
6551}
drh8b2f49b2001-06-08 00:21:52 +00006552
drh001bbcb2003-03-19 03:14:00 +00006553
drh8b2f49b2001-06-08 00:21:52 +00006554/*
drh23e11ca2004-05-04 17:27:28 +00006555** Read the meta-information out of a database file. Meta[0]
6556** is the number of free pages currently in the database. Meta[1]
drha3b321d2004-05-11 09:31:31 +00006557** through meta[15] are available for use by higher layers. Meta[0]
6558** is read-only, the others are read/write.
6559**
6560** The schema layer numbers meta values differently. At the schema
6561** layer (and the SetCookie and ReadCookie opcodes) the number of
6562** free pages is not visible. So Cookie[0] is the same as Meta[1].
drh8b2f49b2001-06-08 00:21:52 +00006563*/
danielk1977aef0bf62005-12-30 16:28:01 +00006564int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
drh1bd10f82008-12-10 21:19:56 +00006565 DbPage *pDbPage = 0;
drh8b2f49b2001-06-08 00:21:52 +00006566 int rc;
drh4b70f112004-05-02 21:12:19 +00006567 unsigned char *pP1;
danielk1977aef0bf62005-12-30 16:28:01 +00006568 BtShared *pBt = p->pBt;
drh8b2f49b2001-06-08 00:21:52 +00006569
drhd677b3d2007-08-20 22:48:41 +00006570 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006571 pBt->db = p->db;
drhd677b3d2007-08-20 22:48:41 +00006572
danielk1977da184232006-01-05 11:34:32 +00006573 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6574 ** the sqlite_master table. We grab this lock regardless of whether or
6575 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6576 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6577 */
6578 rc = queryTableLock(p, 1, READ_LOCK);
6579 if( rc!=SQLITE_OK ){
drhd677b3d2007-08-20 22:48:41 +00006580 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006581 return rc;
6582 }
6583
drh23e11ca2004-05-04 17:27:28 +00006584 assert( idx>=0 && idx<=15 );
danielk1977d9f6c532008-09-19 16:39:38 +00006585 if( pBt->pPage1 ){
6586 /* The b-tree is already holding a reference to page 1 of the database
6587 ** file. In this case the required meta-data value can be read directly
6588 ** from the page data of this reference. This is slightly faster than
6589 ** requesting a new reference from the pager layer.
6590 */
6591 pP1 = (unsigned char *)pBt->pPage1->aData;
6592 }else{
6593 /* The b-tree does not have a reference to page 1 of the database file.
6594 ** Obtain one from the pager layer.
6595 */
danielk1977ea897302008-09-19 15:10:58 +00006596 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6597 if( rc ){
6598 sqlite3BtreeLeave(p);
6599 return rc;
6600 }
6601 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
drhd677b3d2007-08-20 22:48:41 +00006602 }
drh23e11ca2004-05-04 17:27:28 +00006603 *pMeta = get4byte(&pP1[36 + idx*4]);
danielk1977ea897302008-09-19 15:10:58 +00006604
danielk1977d9f6c532008-09-19 16:39:38 +00006605 /* If the b-tree is not holding a reference to page 1, then one was
6606 ** requested from the pager layer in the above block. Release it now.
6607 */
danielk1977ea897302008-09-19 15:10:58 +00006608 if( !pBt->pPage1 ){
6609 sqlite3PagerUnref(pDbPage);
6610 }
drhae157872004-08-14 19:20:09 +00006611
danielk1977599fcba2004-11-08 07:13:13 +00006612 /* If autovacuumed is disabled in this build but we are trying to
6613 ** access an autovacuumed database, then make the database readonly.
6614 */
danielk1977003ba062004-11-04 02:57:33 +00006615#ifdef SQLITE_OMIT_AUTOVACUUM
drhae157872004-08-14 19:20:09 +00006616 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
danielk1977003ba062004-11-04 02:57:33 +00006617#endif
drhae157872004-08-14 19:20:09 +00006618
danielk1977da184232006-01-05 11:34:32 +00006619 /* Grab the read-lock on page 1. */
6620 rc = lockTable(p, 1, READ_LOCK);
drhd677b3d2007-08-20 22:48:41 +00006621 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00006622 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006623}
6624
6625/*
drh23e11ca2004-05-04 17:27:28 +00006626** Write meta-information back into the database. Meta[0] is
6627** read-only and may not be written.
drh8b2f49b2001-06-08 00:21:52 +00006628*/
danielk1977aef0bf62005-12-30 16:28:01 +00006629int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6630 BtShared *pBt = p->pBt;
drh4b70f112004-05-02 21:12:19 +00006631 unsigned char *pP1;
drha34b6762004-05-07 13:30:42 +00006632 int rc;
drh23e11ca2004-05-04 17:27:28 +00006633 assert( idx>=1 && idx<=15 );
drhd677b3d2007-08-20 22:48:41 +00006634 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00006635 pBt->db = p->db;
danielk1977aef0bf62005-12-30 16:28:01 +00006636 if( p->inTrans!=TRANS_WRITE ){
drhd677b3d2007-08-20 22:48:41 +00006637 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6638 }else{
6639 assert( pBt->pPage1!=0 );
6640 pP1 = pBt->pPage1->aData;
6641 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6642 if( rc==SQLITE_OK ){
6643 put4byte(&pP1[36 + idx*4], iMeta);
danielk19774152e672007-09-12 17:01:45 +00006644#ifndef SQLITE_OMIT_AUTOVACUUM
drhd677b3d2007-08-20 22:48:41 +00006645 if( idx==7 ){
6646 assert( pBt->autoVacuum || iMeta==0 );
6647 assert( iMeta==0 || iMeta==1 );
drhf49661a2008-12-10 16:45:50 +00006648 pBt->incrVacuum = (u8)iMeta;
drhd677b3d2007-08-20 22:48:41 +00006649 }
danielk19774152e672007-09-12 17:01:45 +00006650#endif
drhd677b3d2007-08-20 22:48:41 +00006651 }
drh5df72a52002-06-06 23:16:05 +00006652 }
drhd677b3d2007-08-20 22:48:41 +00006653 sqlite3BtreeLeave(p);
6654 return rc;
drh8b2f49b2001-06-08 00:21:52 +00006655}
drh8c42ca92001-06-22 19:15:00 +00006656
drhf328bc82004-05-10 23:29:49 +00006657/*
6658** Return the flag byte at the beginning of the page that the cursor
6659** is currently pointing to.
6660*/
6661int sqlite3BtreeFlags(BtCursor *pCur){
danielk1977da184232006-01-05 11:34:32 +00006662 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
drha3460582008-07-11 21:02:53 +00006663 ** restoreCursorPosition() here.
danielk1977da184232006-01-05 11:34:32 +00006664 */
danielk1977e448dc42008-01-02 11:50:51 +00006665 MemPage *pPage;
drha3460582008-07-11 21:02:53 +00006666 restoreCursorPosition(pCur);
danielk197771d5d2c2008-09-29 11:49:47 +00006667 pPage = pCur->apPage[pCur->iPage];
drh1fee73e2007-08-29 04:00:57 +00006668 assert( cursorHoldsMutex(pCur) );
drhd0679ed2007-08-28 22:24:34 +00006669 assert( pPage->pBt==pCur->pBt );
drhf328bc82004-05-10 23:29:49 +00006670 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6671}
6672
drhdd793422001-06-28 01:54:48 +00006673
drhdd793422001-06-28 01:54:48 +00006674/*
drh5eddca62001-06-30 21:53:53 +00006675** Return the pager associated with a BTree. This routine is used for
6676** testing and debugging only.
drhdd793422001-06-28 01:54:48 +00006677*/
danielk1977aef0bf62005-12-30 16:28:01 +00006678Pager *sqlite3BtreePager(Btree *p){
6679 return p->pBt->pPager;
drhdd793422001-06-28 01:54:48 +00006680}
drh5eddca62001-06-30 21:53:53 +00006681
drhb7f91642004-10-31 02:22:47 +00006682#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006683/*
6684** Append a message to the error message string.
6685*/
drh2e38c322004-09-03 18:38:44 +00006686static void checkAppendMsg(
6687 IntegrityCk *pCheck,
6688 char *zMsg1,
6689 const char *zFormat,
6690 ...
6691){
6692 va_list ap;
drh1dcdbc02007-01-27 02:24:54 +00006693 if( !pCheck->mxErr ) return;
6694 pCheck->mxErr--;
6695 pCheck->nErr++;
drh2e38c322004-09-03 18:38:44 +00006696 va_start(ap, zFormat);
drhf089aa42008-07-08 19:34:06 +00006697 if( pCheck->errMsg.nChar ){
6698 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
drh5eddca62001-06-30 21:53:53 +00006699 }
drhf089aa42008-07-08 19:34:06 +00006700 if( zMsg1 ){
6701 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6702 }
6703 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6704 va_end(ap);
drhc890fec2008-08-01 20:10:08 +00006705 if( pCheck->errMsg.mallocFailed ){
6706 pCheck->mallocFailed = 1;
6707 }
drh5eddca62001-06-30 21:53:53 +00006708}
drhb7f91642004-10-31 02:22:47 +00006709#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006710
drhb7f91642004-10-31 02:22:47 +00006711#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006712/*
6713** Add 1 to the reference count for page iPage. If this is the second
6714** reference to the page, add an error message to pCheck->zErrMsg.
6715** Return 1 if there are 2 ore more references to the page and 0 if
6716** if this is the first reference to the page.
6717**
6718** Also check that the page number is in bounds.
6719*/
danielk197789d40042008-11-17 14:20:56 +00006720static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
drh5eddca62001-06-30 21:53:53 +00006721 if( iPage==0 ) return 1;
danielk197789d40042008-11-17 14:20:56 +00006722 if( iPage>pCheck->nPage ){
drh2e38c322004-09-03 18:38:44 +00006723 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006724 return 1;
6725 }
6726 if( pCheck->anRef[iPage]==1 ){
drh2e38c322004-09-03 18:38:44 +00006727 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006728 return 1;
6729 }
6730 return (pCheck->anRef[iPage]++)>1;
6731}
6732
danielk1977afcdd022004-10-31 16:25:42 +00006733#ifndef SQLITE_OMIT_AUTOVACUUM
6734/*
6735** Check that the entry in the pointer-map for page iChild maps to
6736** page iParent, pointer type ptrType. If not, append an error message
6737** to pCheck.
6738*/
6739static void checkPtrmap(
6740 IntegrityCk *pCheck, /* Integrity check context */
6741 Pgno iChild, /* Child page number */
6742 u8 eType, /* Expected pointer map type */
6743 Pgno iParent, /* Expected pointer map parent page number */
6744 char *zContext /* Context description (used for error msg) */
6745){
6746 int rc;
6747 u8 ePtrmapType;
6748 Pgno iPtrmapParent;
6749
6750 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6751 if( rc!=SQLITE_OK ){
drhe43ba702008-12-05 22:40:08 +00006752 if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1;
danielk1977afcdd022004-10-31 16:25:42 +00006753 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6754 return;
6755 }
6756
6757 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6758 checkAppendMsg(pCheck, zContext,
6759 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6760 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6761 }
6762}
6763#endif
6764
drh5eddca62001-06-30 21:53:53 +00006765/*
6766** Check the integrity of the freelist or of an overflow page list.
6767** Verify that the number of pages on the list is N.
6768*/
drh30e58752002-03-02 20:41:57 +00006769static void checkList(
6770 IntegrityCk *pCheck, /* Integrity checking context */
6771 int isFreeList, /* True for a freelist. False for overflow page list */
6772 int iPage, /* Page number for first page in the list */
6773 int N, /* Expected number of pages in the list */
6774 char *zContext /* Context for error messages */
6775){
6776 int i;
drh3a4c1412004-05-09 20:40:11 +00006777 int expected = N;
6778 int iFirst = iPage;
drh1dcdbc02007-01-27 02:24:54 +00006779 while( N-- > 0 && pCheck->mxErr ){
danielk19773b8a05f2007-03-19 17:44:26 +00006780 DbPage *pOvflPage;
6781 unsigned char *pOvflData;
drh5eddca62001-06-30 21:53:53 +00006782 if( iPage<1 ){
drh2e38c322004-09-03 18:38:44 +00006783 checkAppendMsg(pCheck, zContext,
6784 "%d of %d pages missing from overflow list starting at %d",
drh3a4c1412004-05-09 20:40:11 +00006785 N+1, expected, iFirst);
drh5eddca62001-06-30 21:53:53 +00006786 break;
6787 }
6788 if( checkRef(pCheck, iPage, zContext) ) break;
danielk19773b8a05f2007-03-19 17:44:26 +00006789 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
drh2e38c322004-09-03 18:38:44 +00006790 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
drh5eddca62001-06-30 21:53:53 +00006791 break;
6792 }
danielk19773b8a05f2007-03-19 17:44:26 +00006793 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
drh30e58752002-03-02 20:41:57 +00006794 if( isFreeList ){
danielk19773b8a05f2007-03-19 17:44:26 +00006795 int n = get4byte(&pOvflData[4]);
danielk1977687566d2004-11-02 12:56:41 +00006796#ifndef SQLITE_OMIT_AUTOVACUUM
6797 if( pCheck->pBt->autoVacuum ){
6798 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6799 }
6800#endif
drh45b1fac2008-07-04 17:52:42 +00006801 if( n>pCheck->pBt->usableSize/4-2 ){
drh2e38c322004-09-03 18:38:44 +00006802 checkAppendMsg(pCheck, zContext,
6803 "freelist leaf count too big on page %d", iPage);
drhee696e22004-08-30 16:52:17 +00006804 N--;
6805 }else{
6806 for(i=0; i<n; i++){
danielk19773b8a05f2007-03-19 17:44:26 +00006807 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
danielk1977687566d2004-11-02 12:56:41 +00006808#ifndef SQLITE_OMIT_AUTOVACUUM
6809 if( pCheck->pBt->autoVacuum ){
6810 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6811 }
6812#endif
6813 checkRef(pCheck, iFreePage, zContext);
drhee696e22004-08-30 16:52:17 +00006814 }
6815 N -= n;
drh30e58752002-03-02 20:41:57 +00006816 }
drh30e58752002-03-02 20:41:57 +00006817 }
danielk1977afcdd022004-10-31 16:25:42 +00006818#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00006819 else{
6820 /* If this database supports auto-vacuum and iPage is not the last
6821 ** page in this overflow list, check that the pointer-map entry for
6822 ** the following page matches iPage.
6823 */
6824 if( pCheck->pBt->autoVacuum && N>0 ){
danielk19773b8a05f2007-03-19 17:44:26 +00006825 i = get4byte(pOvflData);
danielk1977687566d2004-11-02 12:56:41 +00006826 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6827 }
danielk1977afcdd022004-10-31 16:25:42 +00006828 }
6829#endif
danielk19773b8a05f2007-03-19 17:44:26 +00006830 iPage = get4byte(pOvflData);
6831 sqlite3PagerUnref(pOvflPage);
drh5eddca62001-06-30 21:53:53 +00006832 }
6833}
drhb7f91642004-10-31 02:22:47 +00006834#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00006835
drhb7f91642004-10-31 02:22:47 +00006836#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00006837/*
6838** Do various sanity checks on a single page of a tree. Return
6839** the tree depth. Root pages return 0. Parents of root pages
6840** return 1, and so forth.
6841**
6842** These checks are done:
6843**
6844** 1. Make sure that cells and freeblocks do not overlap
6845** but combine to completely cover the page.
drhda200cc2004-05-09 11:51:38 +00006846** NO 2. Make sure cell keys are in order.
6847** NO 3. Make sure no key is less than or equal to zLowerBound.
6848** NO 4. Make sure no key is greater than or equal to zUpperBound.
drh5eddca62001-06-30 21:53:53 +00006849** 5. Check the integrity of overflow pages.
6850** 6. Recursively call checkTreePage on all children.
6851** 7. Verify that the depth of all children is the same.
drh6019e162001-07-02 17:51:45 +00006852** 8. Make sure this page is at least 33% full or else it is
drh5eddca62001-06-30 21:53:53 +00006853** the root of the tree.
6854*/
6855static int checkTreePage(
drhaaab5722002-02-19 13:39:21 +00006856 IntegrityCk *pCheck, /* Context for the sanity check */
drh5eddca62001-06-30 21:53:53 +00006857 int iPage, /* Page number of the page to check */
drh74161702006-02-24 02:53:49 +00006858 char *zParentContext /* Parent context */
drh5eddca62001-06-30 21:53:53 +00006859){
6860 MemPage *pPage;
drhda200cc2004-05-09 11:51:38 +00006861 int i, rc, depth, d2, pgno, cnt;
drh43605152004-05-29 21:46:49 +00006862 int hdr, cellStart;
6863 int nCell;
drhda200cc2004-05-09 11:51:38 +00006864 u8 *data;
danielk1977aef0bf62005-12-30 16:28:01 +00006865 BtShared *pBt;
drh4f26bb62005-09-08 14:17:20 +00006866 int usableSize;
drh5eddca62001-06-30 21:53:53 +00006867 char zContext[100];
shane0af3f892008-11-12 04:55:34 +00006868 char *hit = 0;
drh5eddca62001-06-30 21:53:53 +00006869
drh5bb3eb92007-05-04 13:15:55 +00006870 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
danielk1977ef73ee92004-11-06 12:26:07 +00006871
drh5eddca62001-06-30 21:53:53 +00006872 /* Check that the page exists
6873 */
drhd9cb6ac2005-10-20 07:28:17 +00006874 pBt = pCheck->pBt;
drhb6f41482004-05-14 01:58:11 +00006875 usableSize = pBt->usableSize;
drh5eddca62001-06-30 21:53:53 +00006876 if( iPage==0 ) return 0;
6877 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
drh16a9b832007-05-05 18:39:25 +00006878 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
drhe43ba702008-12-05 22:40:08 +00006879 if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1;
drh2e38c322004-09-03 18:38:44 +00006880 checkAppendMsg(pCheck, zContext,
6881 "unable to get the page. error code=%d", rc);
drh5eddca62001-06-30 21:53:53 +00006882 return 0;
6883 }
danielk197771d5d2c2008-09-29 11:49:47 +00006884 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
drhe43ba702008-12-05 22:40:08 +00006885 if( rc==SQLITE_NOMEM ) pCheck->mallocFailed = 1;
drh16a9b832007-05-05 18:39:25 +00006886 checkAppendMsg(pCheck, zContext,
6887 "sqlite3BtreeInitPage() returns error code %d", rc);
drh91025292004-05-03 19:49:32 +00006888 releasePage(pPage);
drh5eddca62001-06-30 21:53:53 +00006889 return 0;
6890 }
6891
6892 /* Check out all the cells.
6893 */
6894 depth = 0;
drh1dcdbc02007-01-27 02:24:54 +00006895 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
drh6f11bef2004-05-13 01:12:56 +00006896 u8 *pCell;
danielk197789d40042008-11-17 14:20:56 +00006897 u32 sz;
drh6f11bef2004-05-13 01:12:56 +00006898 CellInfo info;
drh5eddca62001-06-30 21:53:53 +00006899
6900 /* Check payload overflow pages
6901 */
drh5bb3eb92007-05-04 13:15:55 +00006902 sqlite3_snprintf(sizeof(zContext), zContext,
6903 "On tree page %d cell %d: ", iPage, i);
danielk19771cc5ed82007-05-16 17:28:43 +00006904 pCell = findCell(pPage,i);
drh16a9b832007-05-05 18:39:25 +00006905 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
drh6f11bef2004-05-13 01:12:56 +00006906 sz = info.nData;
drhf49661a2008-12-10 16:45:50 +00006907 if( !pPage->intKey ) sz += (int)info.nKey;
drh72365832007-03-06 15:53:44 +00006908 assert( sz==info.nPayload );
drh6f11bef2004-05-13 01:12:56 +00006909 if( sz>info.nLocal ){
drhb6f41482004-05-14 01:58:11 +00006910 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
danielk1977afcdd022004-10-31 16:25:42 +00006911 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6912#ifndef SQLITE_OMIT_AUTOVACUUM
6913 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006914 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
danielk1977afcdd022004-10-31 16:25:42 +00006915 }
6916#endif
6917 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
drh5eddca62001-06-30 21:53:53 +00006918 }
6919
6920 /* Check sanity of left child page.
6921 */
drhda200cc2004-05-09 11:51:38 +00006922 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006923 pgno = get4byte(pCell);
danielk1977afcdd022004-10-31 16:25:42 +00006924#ifndef SQLITE_OMIT_AUTOVACUUM
6925 if( pBt->autoVacuum ){
6926 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6927 }
6928#endif
danielk197762c14b32008-11-19 09:05:26 +00006929 d2 = checkTreePage(pCheck, pgno, zContext);
drhda200cc2004-05-09 11:51:38 +00006930 if( i>0 && d2!=depth ){
6931 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6932 }
6933 depth = d2;
drh5eddca62001-06-30 21:53:53 +00006934 }
drh5eddca62001-06-30 21:53:53 +00006935 }
drhda200cc2004-05-09 11:51:38 +00006936 if( !pPage->leaf ){
drh43605152004-05-29 21:46:49 +00006937 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
drh5bb3eb92007-05-04 13:15:55 +00006938 sqlite3_snprintf(sizeof(zContext), zContext,
6939 "On page %d at right child: ", iPage);
danielk1977afcdd022004-10-31 16:25:42 +00006940#ifndef SQLITE_OMIT_AUTOVACUUM
6941 if( pBt->autoVacuum ){
danielk1977687566d2004-11-02 12:56:41 +00006942 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
danielk1977afcdd022004-10-31 16:25:42 +00006943 }
6944#endif
danielk197762c14b32008-11-19 09:05:26 +00006945 checkTreePage(pCheck, pgno, zContext);
drhda200cc2004-05-09 11:51:38 +00006946 }
drh5eddca62001-06-30 21:53:53 +00006947
6948 /* Check for complete coverage of the page
6949 */
drhda200cc2004-05-09 11:51:38 +00006950 data = pPage->aData;
6951 hdr = pPage->hdrOffset;
drhf7141992008-06-19 00:16:08 +00006952 hit = sqlite3PageMalloc( pBt->pageSize );
drhc890fec2008-08-01 20:10:08 +00006953 if( hit==0 ){
6954 pCheck->mallocFailed = 1;
6955 }else{
shane5780ebd2008-11-11 17:36:30 +00006956 u16 contentOffset = get2byte(&data[hdr+5]);
6957 if (contentOffset > usableSize) {
6958 checkAppendMsg(pCheck, 0,
6959 "Corruption detected in header on page %d",iPage,0);
shane0af3f892008-11-12 04:55:34 +00006960 goto check_page_abort;
shane5780ebd2008-11-11 17:36:30 +00006961 }
6962 memset(hit+contentOffset, 0, usableSize-contentOffset);
6963 memset(hit, 1, contentOffset);
drh2e38c322004-09-03 18:38:44 +00006964 nCell = get2byte(&data[hdr+3]);
6965 cellStart = hdr + 12 - 4*pPage->leaf;
6966 for(i=0; i<nCell; i++){
6967 int pc = get2byte(&data[cellStart+i*2]);
danielk1977daca5432008-08-25 11:57:16 +00006968 u16 size = 1024;
drh2e38c322004-09-03 18:38:44 +00006969 int j;
danielk1977daca5432008-08-25 11:57:16 +00006970 if( pc<=usableSize ){
6971 size = cellSizePtr(pPage, &data[pc]);
6972 }
danielk19777701e812005-01-10 12:59:51 +00006973 if( (pc+size-1)>=usableSize || pc<0 ){
6974 checkAppendMsg(pCheck, 0,
6975 "Corruption detected in cell %d on page %d",i,iPage,0);
6976 }else{
6977 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6978 }
drh2e38c322004-09-03 18:38:44 +00006979 }
6980 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6981 cnt++){
6982 int size = get2byte(&data[i+2]);
6983 int j;
danielk19777701e812005-01-10 12:59:51 +00006984 if( (i+size-1)>=usableSize || i<0 ){
6985 checkAppendMsg(pCheck, 0,
6986 "Corruption detected in cell %d on page %d",i,iPage,0);
6987 }else{
6988 for(j=i+size-1; j>=i; j--) hit[j]++;
6989 }
drh2e38c322004-09-03 18:38:44 +00006990 i = get2byte(&data[i]);
6991 }
6992 for(i=cnt=0; i<usableSize; i++){
6993 if( hit[i]==0 ){
6994 cnt++;
6995 }else if( hit[i]>1 ){
6996 checkAppendMsg(pCheck, 0,
6997 "Multiple uses for byte %d of page %d", i, iPage);
6998 break;
6999 }
7000 }
7001 if( cnt!=data[hdr+7] ){
7002 checkAppendMsg(pCheck, 0,
7003 "Fragmented space is %d byte reported as %d on page %d",
7004 cnt, data[hdr+7], iPage);
drh5eddca62001-06-30 21:53:53 +00007005 }
7006 }
shane0af3f892008-11-12 04:55:34 +00007007check_page_abort:
7008 if (hit) sqlite3PageFree(hit);
drh6019e162001-07-02 17:51:45 +00007009
drh4b70f112004-05-02 21:12:19 +00007010 releasePage(pPage);
drhda200cc2004-05-09 11:51:38 +00007011 return depth+1;
drh5eddca62001-06-30 21:53:53 +00007012}
drhb7f91642004-10-31 02:22:47 +00007013#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
drh5eddca62001-06-30 21:53:53 +00007014
drhb7f91642004-10-31 02:22:47 +00007015#ifndef SQLITE_OMIT_INTEGRITY_CHECK
drh5eddca62001-06-30 21:53:53 +00007016/*
7017** This routine does a complete check of the given BTree file. aRoot[] is
7018** an array of pages numbers were each page number is the root page of
7019** a table. nRoot is the number of entries in aRoot.
7020**
drhc890fec2008-08-01 20:10:08 +00007021** Write the number of error seen in *pnErr. Except for some memory
drhe43ba702008-12-05 22:40:08 +00007022** allocation errors, an error message held in memory obtained from
drhc890fec2008-08-01 20:10:08 +00007023** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
drhe43ba702008-12-05 22:40:08 +00007024** returned. If a memory allocation error occurs, NULL is returned.
drh5eddca62001-06-30 21:53:53 +00007025*/
drh1dcdbc02007-01-27 02:24:54 +00007026char *sqlite3BtreeIntegrityCheck(
7027 Btree *p, /* The btree to be checked */
7028 int *aRoot, /* An array of root pages numbers for individual trees */
7029 int nRoot, /* Number of entries in aRoot[] */
7030 int mxErr, /* Stop reporting errors after this many */
7031 int *pnErr /* Write number of errors seen to this variable */
7032){
danielk197789d40042008-11-17 14:20:56 +00007033 Pgno i;
drh5eddca62001-06-30 21:53:53 +00007034 int nRef;
drhaaab5722002-02-19 13:39:21 +00007035 IntegrityCk sCheck;
danielk1977aef0bf62005-12-30 16:28:01 +00007036 BtShared *pBt = p->pBt;
drhf089aa42008-07-08 19:34:06 +00007037 char zErr[100];
drh5eddca62001-06-30 21:53:53 +00007038
drhd677b3d2007-08-20 22:48:41 +00007039 sqlite3BtreeEnter(p);
drhe5fe6902007-12-07 18:55:28 +00007040 pBt->db = p->db;
danielk19773b8a05f2007-03-19 17:44:26 +00007041 nRef = sqlite3PagerRefcount(pBt->pPager);
danielk1977aef0bf62005-12-30 16:28:01 +00007042 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
drhc890fec2008-08-01 20:10:08 +00007043 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00007044 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00007045 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
drhefc251d2001-07-01 22:12:01 +00007046 }
drh5eddca62001-06-30 21:53:53 +00007047 sCheck.pBt = pBt;
7048 sCheck.pPager = pBt->pPager;
danielk197789d40042008-11-17 14:20:56 +00007049 sCheck.nPage = pagerPagecount(sCheck.pBt);
drh1dcdbc02007-01-27 02:24:54 +00007050 sCheck.mxErr = mxErr;
7051 sCheck.nErr = 0;
drhc890fec2008-08-01 20:10:08 +00007052 sCheck.mallocFailed = 0;
drh1dcdbc02007-01-27 02:24:54 +00007053 *pnErr = 0;
danielk1977e5321f02007-04-27 07:05:44 +00007054#ifndef SQLITE_OMIT_AUTOVACUUM
7055 if( pBt->nTrunc!=0 ){
7056 sCheck.nPage = pBt->nTrunc;
7057 }
7058#endif
drh0de8c112002-07-06 16:32:14 +00007059 if( sCheck.nPage==0 ){
7060 unlockBtreeIfUnused(pBt);
drhd677b3d2007-08-20 22:48:41 +00007061 sqlite3BtreeLeave(p);
drh0de8c112002-07-06 16:32:14 +00007062 return 0;
7063 }
drhe5ae5732008-06-15 02:51:47 +00007064 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
danielk1977ac245ec2005-01-14 13:50:11 +00007065 if( !sCheck.anRef ){
7066 unlockBtreeIfUnused(pBt);
drh1dcdbc02007-01-27 02:24:54 +00007067 *pnErr = 1;
drhd677b3d2007-08-20 22:48:41 +00007068 sqlite3BtreeLeave(p);
drhc890fec2008-08-01 20:10:08 +00007069 return 0;
danielk1977ac245ec2005-01-14 13:50:11 +00007070 }
drhda200cc2004-05-09 11:51:38 +00007071 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
drh42cac6d2004-11-20 20:31:11 +00007072 i = PENDING_BYTE_PAGE(pBt);
drh1f595712004-06-15 01:40:29 +00007073 if( i<=sCheck.nPage ){
7074 sCheck.anRef[i] = 1;
7075 }
drhf089aa42008-07-08 19:34:06 +00007076 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
drh5eddca62001-06-30 21:53:53 +00007077
7078 /* Check the integrity of the freelist
7079 */
drha34b6762004-05-07 13:30:42 +00007080 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7081 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
drh5eddca62001-06-30 21:53:53 +00007082
7083 /* Check all the tables.
7084 */
danielk197789d40042008-11-17 14:20:56 +00007085 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
drh4ff6dfa2002-03-03 23:06:00 +00007086 if( aRoot[i]==0 ) continue;
danielk1977687566d2004-11-02 12:56:41 +00007087#ifndef SQLITE_OMIT_AUTOVACUUM
danielk1977687566d2004-11-02 12:56:41 +00007088 if( pBt->autoVacuum && aRoot[i]>1 ){
7089 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7090 }
7091#endif
danielk197762c14b32008-11-19 09:05:26 +00007092 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ");
drh5eddca62001-06-30 21:53:53 +00007093 }
7094
7095 /* Make sure every page in the file is referenced
7096 */
drh1dcdbc02007-01-27 02:24:54 +00007097 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
danielk1977afcdd022004-10-31 16:25:42 +00007098#ifdef SQLITE_OMIT_AUTOVACUUM
drh5eddca62001-06-30 21:53:53 +00007099 if( sCheck.anRef[i]==0 ){
drh2e38c322004-09-03 18:38:44 +00007100 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
drh5eddca62001-06-30 21:53:53 +00007101 }
danielk1977afcdd022004-10-31 16:25:42 +00007102#else
7103 /* If the database supports auto-vacuum, make sure no tables contain
7104 ** references to pointer-map pages.
7105 */
7106 if( sCheck.anRef[i]==0 &&
danielk1977266664d2006-02-10 08:24:21 +00007107 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00007108 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7109 }
7110 if( sCheck.anRef[i]!=0 &&
danielk1977266664d2006-02-10 08:24:21 +00007111 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
danielk1977afcdd022004-10-31 16:25:42 +00007112 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7113 }
7114#endif
drh5eddca62001-06-30 21:53:53 +00007115 }
7116
7117 /* Make sure this analysis did not leave any unref() pages
7118 */
drh5e00f6c2001-09-13 13:46:56 +00007119 unlockBtreeIfUnused(pBt);
danielk19773b8a05f2007-03-19 17:44:26 +00007120 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
drh2e38c322004-09-03 18:38:44 +00007121 checkAppendMsg(&sCheck, 0,
drh5eddca62001-06-30 21:53:53 +00007122 "Outstanding page count goes from %d to %d during this analysis",
danielk19773b8a05f2007-03-19 17:44:26 +00007123 nRef, sqlite3PagerRefcount(pBt->pPager)
drh5eddca62001-06-30 21:53:53 +00007124 );
drh5eddca62001-06-30 21:53:53 +00007125 }
7126
7127 /* Clean up and report errors.
7128 */
drhd677b3d2007-08-20 22:48:41 +00007129 sqlite3BtreeLeave(p);
drh17435752007-08-16 04:30:38 +00007130 sqlite3_free(sCheck.anRef);
drhc890fec2008-08-01 20:10:08 +00007131 if( sCheck.mallocFailed ){
7132 sqlite3StrAccumReset(&sCheck.errMsg);
7133 *pnErr = sCheck.nErr+1;
7134 return 0;
7135 }
drh1dcdbc02007-01-27 02:24:54 +00007136 *pnErr = sCheck.nErr;
drhf089aa42008-07-08 19:34:06 +00007137 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7138 return sqlite3StrAccumFinish(&sCheck.errMsg);
drh5eddca62001-06-30 21:53:53 +00007139}
drhb7f91642004-10-31 02:22:47 +00007140#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
paulb95a8862003-04-01 21:16:41 +00007141
drh73509ee2003-04-06 20:44:45 +00007142/*
7143** Return the full pathname of the underlying database file.
drhd0679ed2007-08-28 22:24:34 +00007144**
7145** The pager filename is invariant as long as the pager is
7146** open so it is safe to access without the BtShared mutex.
drh73509ee2003-04-06 20:44:45 +00007147*/
danielk1977aef0bf62005-12-30 16:28:01 +00007148const char *sqlite3BtreeGetFilename(Btree *p){
7149 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007150 return sqlite3PagerFilename(p->pBt->pPager);
drh73509ee2003-04-06 20:44:45 +00007151}
7152
7153/*
danielk19775865e3d2004-06-14 06:03:57 +00007154** Return the pathname of the directory that contains the database file.
drhd0679ed2007-08-28 22:24:34 +00007155**
7156** The pager directory name is invariant as long as the pager is
7157** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007158*/
danielk1977aef0bf62005-12-30 16:28:01 +00007159const char *sqlite3BtreeGetDirname(Btree *p){
7160 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007161 return sqlite3PagerDirname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007162}
7163
7164/*
7165** Return the pathname of the journal file for this database. The return
7166** value of this routine is the same regardless of whether the journal file
7167** has been created or not.
drhd0679ed2007-08-28 22:24:34 +00007168**
7169** The pager journal filename is invariant as long as the pager is
7170** open so it is safe to access without the BtShared mutex.
danielk19775865e3d2004-06-14 06:03:57 +00007171*/
danielk1977aef0bf62005-12-30 16:28:01 +00007172const char *sqlite3BtreeGetJournalname(Btree *p){
7173 assert( p->pBt->pPager!=0 );
danielk19773b8a05f2007-03-19 17:44:26 +00007174 return sqlite3PagerJournalname(p->pBt->pPager);
danielk19775865e3d2004-06-14 06:03:57 +00007175}
7176
drhb7f91642004-10-31 02:22:47 +00007177#ifndef SQLITE_OMIT_VACUUM
danielk19775865e3d2004-06-14 06:03:57 +00007178/*
drhf7c57532003-04-25 13:22:51 +00007179** Copy the complete content of pBtFrom into pBtTo. A transaction
7180** must be active for both files.
7181**
danielk1977f653d782008-03-20 11:04:21 +00007182** The size of file pTo may be reduced by this operation.
7183** If anything goes wrong, the transaction on pTo is rolled back.
7184**
7185** If successful, CommitPhaseOne() may be called on pTo before returning.
7186** The caller should finish committing the transaction on pTo by calling
7187** sqlite3BtreeCommit().
drh73509ee2003-04-06 20:44:45 +00007188*/
drhd677b3d2007-08-20 22:48:41 +00007189static int btreeCopyFile(Btree *pTo, Btree *pFrom){
drhf7c57532003-04-25 13:22:51 +00007190 int rc = SQLITE_OK;
danielk1977f653d782008-03-20 11:04:21 +00007191 Pgno i;
7192
7193 Pgno nFromPage; /* Number of pages in pFrom */
7194 Pgno nToPage; /* Number of pages in pTo */
7195 Pgno nNewPage; /* Number of pages in pTo after the copy */
7196
7197 Pgno iSkip; /* Pending byte page in pTo */
7198 int nToPageSize; /* Page size of pTo in bytes */
7199 int nFromPageSize; /* Page size of pFrom in bytes */
drhf7c57532003-04-25 13:22:51 +00007200
danielk1977aef0bf62005-12-30 16:28:01 +00007201 BtShared *pBtTo = pTo->pBt;
7202 BtShared *pBtFrom = pFrom->pBt;
drhe5fe6902007-12-07 18:55:28 +00007203 pBtTo->db = pTo->db;
7204 pBtFrom->db = pFrom->db;
danielk1977f653d782008-03-20 11:04:21 +00007205
7206 nToPageSize = pBtTo->pageSize;
7207 nFromPageSize = pBtFrom->pageSize;
danielk1977aef0bf62005-12-30 16:28:01 +00007208
7209 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
danielk1977ee5741e2004-05-31 10:01:34 +00007210 return SQLITE_ERROR;
7211 }
danielk1977f653d782008-03-20 11:04:21 +00007212 if( pBtTo->pCursor ){
7213 return SQLITE_BUSY;
drhf7c57532003-04-25 13:22:51 +00007214 }
drh538f5702007-04-13 02:14:30 +00007215
danielk197789d40042008-11-17 14:20:56 +00007216 nToPage = pagerPagecount(pBtTo);
7217 nFromPage = pagerPagecount(pBtFrom);
danielk1977f653d782008-03-20 11:04:21 +00007218 iSkip = PENDING_BYTE_PAGE(pBtTo);
7219
7220 /* Variable nNewPage is the number of pages required to store the
7221 ** contents of pFrom using the current page-size of pTo.
drh538f5702007-04-13 02:14:30 +00007222 */
drhf49661a2008-12-10 16:45:50 +00007223 nNewPage = (Pgno)
7224 (((i64)nFromPage*(i64)nFromPageSize+(i64)nToPageSize-1)/(i64)nToPageSize);
danielk1977f653d782008-03-20 11:04:21 +00007225
7226 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7227
7228 /* Journal the original page.
7229 **
7230 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7231 ** in database *pTo (before the copy). This page is never written
7232 ** into the journal file. Unless i==iSkip or the page was not
7233 ** present in pTo before the copy operation, journal page i from pTo.
7234 */
7235 if( i!=iSkip && i<=nToPage ){
danielk19774abd5442008-05-05 15:26:50 +00007236 DbPage *pDbPage = 0;
danielk1977f653d782008-03-20 11:04:21 +00007237 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
danielk19774abd5442008-05-05 15:26:50 +00007238 if( rc==SQLITE_OK ){
7239 rc = sqlite3PagerWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007240 if( rc==SQLITE_OK && i>nFromPage ){
7241 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7242 ** that is because the names of those procedures do not exactly
7243 ** represent what they do. Write() really means "put this page in the
7244 ** rollback journal and mark it as dirty so that it will be written
7245 ** to the database file later." DontWrite() undoes the second part of
7246 ** that and prevents the page from being written to the database. The
7247 ** page is still on the rollback journal, though. And that is the
7248 ** whole point of this block: to put pages on the rollback journal.
7249 */
danielk1977a1fa00d2008-08-27 15:16:33 +00007250 rc = sqlite3PagerDontWrite(pDbPage);
danielk1977df2566a2008-05-07 19:11:03 +00007251 }
7252 sqlite3PagerUnref(pDbPage);
danielk1977f653d782008-03-20 11:04:21 +00007253 }
danielk1977f653d782008-03-20 11:04:21 +00007254 }
7255
7256 /* Overwrite the data in page i of the target database */
7257 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7258
7259 DbPage *pToPage = 0;
7260 sqlite3_int64 iOff;
7261
7262 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7263 if( rc==SQLITE_OK ){
7264 rc = sqlite3PagerWrite(pToPage);
7265 }
7266
7267 for(
7268 iOff=(i-1)*nToPageSize;
7269 rc==SQLITE_OK && iOff<i*nToPageSize;
7270 iOff += nFromPageSize
7271 ){
7272 DbPage *pFromPage = 0;
drhf49661a2008-12-10 16:45:50 +00007273 Pgno iFrom = (Pgno)(iOff/nFromPageSize)+1;
danielk1977f653d782008-03-20 11:04:21 +00007274
7275 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7276 continue;
7277 }
7278
7279 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7280 if( rc==SQLITE_OK ){
7281 char *zTo = sqlite3PagerGetData(pToPage);
7282 char *zFrom = sqlite3PagerGetData(pFromPage);
7283 int nCopy;
7284
7285 if( nFromPageSize>=nToPageSize ){
7286 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7287 nCopy = nToPageSize;
7288 }else{
7289 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7290 nCopy = nFromPageSize;
7291 }
7292
7293 memcpy(zTo, zFrom, nCopy);
danielk19772f78fc62008-09-30 09:31:45 +00007294 sqlite3PagerUnref(pFromPage);
danielk1977f653d782008-03-20 11:04:21 +00007295 }
7296 }
7297
danielk1977eaa06f62008-09-18 17:34:44 +00007298 if( pToPage ){
7299 MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
7300 p->isInit = 0;
7301 sqlite3PagerUnref(pToPage);
7302 }
danielk1977f653d782008-03-20 11:04:21 +00007303 }
drh2e6d11b2003-04-25 15:37:57 +00007304 }
danielk1977f653d782008-03-20 11:04:21 +00007305
7306 /* If things have worked so far, the database file may need to be
7307 ** truncated. The complex part is that it may need to be truncated to
7308 ** a size that is not an integer multiple of nToPageSize - the current
7309 ** page size used by the pager associated with B-Tree pTo.
7310 **
7311 ** For example, say the page-size of pTo is 2048 bytes and the original
7312 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7313 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7314 */
7315 if( rc==SQLITE_OK ){
7316 if( nFromPageSize!=nToPageSize ){
7317 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7318 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7319 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7320 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7321
7322 assert( iSize<=iNow );
7323
7324 /* Commit phase one syncs the journal file associated with pTo
7325 ** containing the original data. It does not sync the database file
7326 ** itself. After doing this it is safe to use OsTruncate() and other
7327 ** file APIs on the database file directly.
7328 */
7329 pBtTo->db = pTo->db;
7330 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7331 if( iSize<iNow && rc==SQLITE_OK ){
7332 rc = sqlite3OsTruncate(pFile, iSize);
7333 }
7334
7335 /* The loop that copied data from database pFrom to pTo did not
7336 ** populate the locking page of database pTo. If the page-size of
7337 ** pFrom is smaller than that of pTo, this means some data will
7338 ** not have been copied.
7339 **
7340 ** This block copies the missing data from database pFrom to pTo
7341 ** using file APIs. This is safe because at this point we know that
7342 ** all of the original data from pTo has been synced into the
7343 ** journal file. At this point it would be safe to do anything at
7344 ** all to the database file except truncate it to zero bytes.
7345 */
7346 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7347 i64 iOff;
7348 for(
7349 iOff=iPending;
7350 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7351 iOff += nFromPageSize
7352 ){
7353 DbPage *pFromPage = 0;
drhf49661a2008-12-10 16:45:50 +00007354 Pgno iFrom = (Pgno)(iOff/nFromPageSize)+1;
danielk1977f653d782008-03-20 11:04:21 +00007355
7356 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7357 continue;
7358 }
7359
7360 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7361 if( rc==SQLITE_OK ){
7362 char *zFrom = sqlite3PagerGetData(pFromPage);
danielk197706249db2008-08-23 16:17:55 +00007363 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
danielk1977f653d782008-03-20 11:04:21 +00007364 sqlite3PagerUnref(pFromPage);
7365 }
7366 }
7367 }
7368
7369 /* Sync the database file */
7370 if( rc==SQLITE_OK ){
7371 rc = sqlite3PagerSync(pBtTo->pPager);
7372 }
7373 }else{
7374 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7375 }
7376 if( rc==SQLITE_OK ){
7377 pBtTo->pageSizeFixed = 0;
7378 }
drh2e6d11b2003-04-25 15:37:57 +00007379 }
drh538f5702007-04-13 02:14:30 +00007380
drhf7c57532003-04-25 13:22:51 +00007381 if( rc ){
danielk1977aef0bf62005-12-30 16:28:01 +00007382 sqlite3BtreeRollback(pTo);
drhf7c57532003-04-25 13:22:51 +00007383 }
danielk1977f653d782008-03-20 11:04:21 +00007384
drhf7c57532003-04-25 13:22:51 +00007385 return rc;
drh73509ee2003-04-06 20:44:45 +00007386}
drhd677b3d2007-08-20 22:48:41 +00007387int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7388 int rc;
7389 sqlite3BtreeEnter(pTo);
7390 sqlite3BtreeEnter(pFrom);
7391 rc = btreeCopyFile(pTo, pFrom);
7392 sqlite3BtreeLeave(pFrom);
7393 sqlite3BtreeLeave(pTo);
7394 return rc;
7395}
7396
drhb7f91642004-10-31 02:22:47 +00007397#endif /* SQLITE_OMIT_VACUUM */
danielk19771d850a72004-05-31 08:26:49 +00007398
7399/*
7400** Return non-zero if a transaction is active.
7401*/
danielk1977aef0bf62005-12-30 16:28:01 +00007402int sqlite3BtreeIsInTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007403 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
danielk1977aef0bf62005-12-30 16:28:01 +00007404 return (p && (p->inTrans==TRANS_WRITE));
danielk19771d850a72004-05-31 08:26:49 +00007405}
7406
7407/*
7408** Return non-zero if a statement transaction is active.
7409*/
danielk1977aef0bf62005-12-30 16:28:01 +00007410int sqlite3BtreeIsInStmt(Btree *p){
drh1fee73e2007-08-29 04:00:57 +00007411 assert( sqlite3BtreeHoldsMutex(p) );
danielk1977aef0bf62005-12-30 16:28:01 +00007412 return (p->pBt && p->pBt->inStmt);
danielk19771d850a72004-05-31 08:26:49 +00007413}
danielk197713adf8a2004-06-03 16:08:41 +00007414
7415/*
danielk19772372c2b2006-06-27 16:34:56 +00007416** Return non-zero if a read (or write) transaction is active.
7417*/
7418int sqlite3BtreeIsInReadTrans(Btree *p){
drhe5fe6902007-12-07 18:55:28 +00007419 assert( sqlite3_mutex_held(p->db->mutex) );
danielk19772372c2b2006-06-27 16:34:56 +00007420 return (p && (p->inTrans!=TRANS_NONE));
7421}
7422
7423/*
danielk1977da184232006-01-05 11:34:32 +00007424** This function returns a pointer to a blob of memory associated with
drh85b623f2007-12-13 21:54:09 +00007425** a single shared-btree. The memory is used by client code for its own
danielk1977da184232006-01-05 11:34:32 +00007426** purposes (for example, to store a high-level schema associated with
7427** the shared-btree). The btree layer manages reference counting issues.
7428**
7429** The first time this is called on a shared-btree, nBytes bytes of memory
7430** are allocated, zeroed, and returned to the caller. For each subsequent
7431** call the nBytes parameter is ignored and a pointer to the same blob
7432** of memory returned.
7433**
danielk1977171bfed2008-06-23 09:50:50 +00007434** If the nBytes parameter is 0 and the blob of memory has not yet been
7435** allocated, a null pointer is returned. If the blob has already been
7436** allocated, it is returned as normal.
7437**
danielk1977da184232006-01-05 11:34:32 +00007438** Just before the shared-btree is closed, the function passed as the
7439** xFree argument when the memory allocation was made is invoked on the
drh17435752007-08-16 04:30:38 +00007440** blob of allocated memory. This function should not call sqlite3_free()
danielk1977da184232006-01-05 11:34:32 +00007441** on the memory, the btree layer does that.
7442*/
7443void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7444 BtShared *pBt = p->pBt;
drh27641702007-08-22 02:56:42 +00007445 sqlite3BtreeEnter(p);
danielk1977171bfed2008-06-23 09:50:50 +00007446 if( !pBt->pSchema && nBytes ){
drh17435752007-08-16 04:30:38 +00007447 pBt->pSchema = sqlite3MallocZero(nBytes);
danielk1977da184232006-01-05 11:34:32 +00007448 pBt->xFreeSchema = xFree;
7449 }
drh27641702007-08-22 02:56:42 +00007450 sqlite3BtreeLeave(p);
danielk1977da184232006-01-05 11:34:32 +00007451 return pBt->pSchema;
7452}
7453
danielk1977c87d34d2006-01-06 13:00:28 +00007454/*
7455** Return true if another user of the same shared btree as the argument
7456** handle holds an exclusive lock on the sqlite_master table.
7457*/
7458int sqlite3BtreeSchemaLocked(Btree *p){
drh27641702007-08-22 02:56:42 +00007459 int rc;
drhe5fe6902007-12-07 18:55:28 +00007460 assert( sqlite3_mutex_held(p->db->mutex) );
drh27641702007-08-22 02:56:42 +00007461 sqlite3BtreeEnter(p);
7462 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7463 sqlite3BtreeLeave(p);
7464 return rc;
danielk1977c87d34d2006-01-06 13:00:28 +00007465}
7466
drha154dcd2006-03-22 22:10:07 +00007467
7468#ifndef SQLITE_OMIT_SHARED_CACHE
7469/*
7470** Obtain a lock on the table whose root page is iTab. The
7471** lock is a write lock if isWritelock is true or a read lock
7472** if it is false.
7473*/
danielk1977c00da102006-01-07 13:21:04 +00007474int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
danielk19772e94d4d2006-01-09 05:36:27 +00007475 int rc = SQLITE_OK;
drh6a9ad3d2008-04-02 16:29:30 +00007476 if( p->sharable ){
7477 u8 lockType = READ_LOCK + isWriteLock;
7478 assert( READ_LOCK+1==WRITE_LOCK );
7479 assert( isWriteLock==0 || isWriteLock==1 );
7480 sqlite3BtreeEnter(p);
7481 rc = queryTableLock(p, iTab, lockType);
7482 if( rc==SQLITE_OK ){
7483 rc = lockTable(p, iTab, lockType);
7484 }
7485 sqlite3BtreeLeave(p);
danielk1977c00da102006-01-07 13:21:04 +00007486 }
7487 return rc;
7488}
drha154dcd2006-03-22 22:10:07 +00007489#endif
danielk1977b82e7ed2006-01-11 14:09:31 +00007490
danielk1977b4e9af92007-05-01 17:49:49 +00007491#ifndef SQLITE_OMIT_INCRBLOB
7492/*
7493** Argument pCsr must be a cursor opened for writing on an
7494** INTKEY table currently pointing at a valid table entry.
7495** This function modifies the data stored as part of that entry.
7496** Only the data content may only be modified, it is not possible
7497** to change the length of the data stored.
7498*/
danielk1977dcbb5d32007-05-04 18:36:44 +00007499int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
drh1fee73e2007-08-29 04:00:57 +00007500 assert( cursorHoldsMutex(pCsr) );
drhe5fe6902007-12-07 18:55:28 +00007501 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007502 assert(pCsr->isIncrblobHandle);
danielk19773588ceb2008-06-10 17:30:26 +00007503
drha3460582008-07-11 21:02:53 +00007504 restoreCursorPosition(pCsr);
danielk19773588ceb2008-06-10 17:30:26 +00007505 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7506 if( pCsr->eState!=CURSOR_VALID ){
7507 return SQLITE_ABORT;
danielk1977dcbb5d32007-05-04 18:36:44 +00007508 }
7509
danielk1977d04417962007-05-02 13:16:30 +00007510 /* Check some preconditions:
danielk1977dcbb5d32007-05-04 18:36:44 +00007511 ** (a) the cursor is open for writing,
7512 ** (b) there is no read-lock on the table being modified and
7513 ** (c) the cursor points at a valid row of an intKey table.
danielk1977d04417962007-05-02 13:16:30 +00007514 */
danielk1977d04417962007-05-02 13:16:30 +00007515 if( !pCsr->wrFlag ){
danielk1977dcbb5d32007-05-04 18:36:44 +00007516 return SQLITE_READONLY;
danielk1977d04417962007-05-02 13:16:30 +00007517 }
drhd0679ed2007-08-28 22:24:34 +00007518 assert( !pCsr->pBt->readOnly
7519 && pCsr->pBt->inTransaction==TRANS_WRITE );
danielk19773588ceb2008-06-10 17:30:26 +00007520 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
danielk1977d04417962007-05-02 13:16:30 +00007521 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7522 }
danielk197771d5d2c2008-09-29 11:49:47 +00007523 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
danielk1977d04417962007-05-02 13:16:30 +00007524 return SQLITE_ERROR;
danielk1977b4e9af92007-05-01 17:49:49 +00007525 }
7526
danielk19779f8d6402007-05-02 17:48:45 +00007527 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
danielk1977b4e9af92007-05-01 17:49:49 +00007528}
danielk19772dec9702007-05-02 16:48:37 +00007529
7530/*
7531** Set a flag on this cursor to cache the locations of pages from the
danielk1977da107192007-05-04 08:32:13 +00007532** overflow list for the current row. This is used by cursors opened
7533** for incremental blob IO only.
7534**
7535** This function sets a flag only. The actual page location cache
7536** (stored in BtCursor.aOverflow[]) is allocated and used by function
7537** accessPayload() (the worker function for sqlite3BtreeData() and
7538** sqlite3BtreePutData()).
danielk19772dec9702007-05-02 16:48:37 +00007539*/
7540void sqlite3BtreeCacheOverflow(BtCursor *pCur){
drh1fee73e2007-08-29 04:00:57 +00007541 assert( cursorHoldsMutex(pCur) );
drhe5fe6902007-12-07 18:55:28 +00007542 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
danielk1977dcbb5d32007-05-04 18:36:44 +00007543 assert(!pCur->isIncrblobHandle);
danielk19772dec9702007-05-02 16:48:37 +00007544 assert(!pCur->aOverflow);
danielk1977dcbb5d32007-05-04 18:36:44 +00007545 pCur->isIncrblobHandle = 1;
danielk19772dec9702007-05-02 16:48:37 +00007546}
danielk1977b4e9af92007-05-01 17:49:49 +00007547#endif