jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| 4 | // |
| 5 | // The representation of a DBImpl consists of a set of Versions. The |
| 6 | // newest version is called "current". Older versions may be kept |
| 7 | // around to provide a consistent view to live iterators. |
| 8 | // |
| 9 | // Each Version keeps track of a set of Table files per level. The |
| 10 | // entire set of versions is maintained in a VersionSet. |
| 11 | // |
| 12 | // Version and VersionSet are thread-compatible, but require external |
| 13 | // synchronization on all accesses. |
| 14 | |
| 15 | #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ |
| 16 | #define STORAGE_LEVELDB_DB_VERSION_SET_H_ |
| 17 | |
| 18 | #include <map> |
| 19 | #include <set> |
| 20 | #include <vector> |
| 21 | #include "db/dbformat.h" |
| 22 | #include "db/version_edit.h" |
| 23 | #include "port/port.h" |
| 24 | |
| 25 | namespace leveldb { |
| 26 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 27 | namespace log { class Writer; } |
| 28 | |
| 29 | class Compaction; |
| 30 | class Iterator; |
| 31 | class MemTable; |
| 32 | class TableBuilder; |
| 33 | class TableCache; |
| 34 | class Version; |
| 35 | class VersionSet; |
| 36 | class WritableFile; |
| 37 | |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 38 | // Return the smallest index i such that files[i]->largest >= key. |
| 39 | // Return files.size() if there is no such file. |
| 40 | // REQUIRES: "files" contains a sorted list of non-overlapping files. |
| 41 | extern int FindFile(const InternalKeyComparator& icmp, |
| 42 | const std::vector<FileMetaData*>& files, |
| 43 | const Slice& key); |
| 44 | |
gabor@google.com | 6699c7e | 2011-07-15 00:20:57 +0000 | [diff] [blame] | 45 | // Returns true iff some file in "files" overlaps the user key range |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 46 | // [smallest,largest]. |
| 47 | extern bool SomeFileOverlapsRange( |
| 48 | const InternalKeyComparator& icmp, |
| 49 | const std::vector<FileMetaData*>& files, |
gabor@google.com | 6699c7e | 2011-07-15 00:20:57 +0000 | [diff] [blame] | 50 | const Slice& smallest_user_key, |
| 51 | const Slice& largest_user_key); |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 52 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 53 | class Version { |
| 54 | public: |
| 55 | // Append to *iters a sequence of iterators that will |
| 56 | // yield the contents of this Version when merged together. |
| 57 | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
| 58 | void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); |
| 59 | |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 60 | // Lookup the value for key. If found, store it in *val and |
| 61 | // return OK. Else return a non-OK status. Fills *stats. |
| 62 | // REQUIRES: lock is not held |
| 63 | struct GetStats { |
| 64 | FileMetaData* seek_file; |
| 65 | int seek_file_level; |
| 66 | }; |
| 67 | Status Get(const ReadOptions&, const LookupKey& key, std::string* val, |
| 68 | GetStats* stats); |
| 69 | |
| 70 | // Adds "stats" into the current state. Returns true if a new |
| 71 | // compaction may need to be triggered, false otherwise. |
| 72 | // REQUIRES: lock is held |
| 73 | bool UpdateStats(const GetStats& stats); |
| 74 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 75 | // Reference count management (so Versions do not disappear out from |
| 76 | // under live iterators) |
| 77 | void Ref(); |
| 78 | void Unref(); |
| 79 | |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 80 | // Returns true iff some file in the specified level overlaps |
gabor@google.com | 6699c7e | 2011-07-15 00:20:57 +0000 | [diff] [blame] | 81 | // some part of [smallest_user_key,largest_user_key]. |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 82 | bool OverlapInLevel(int level, |
gabor@google.com | 6699c7e | 2011-07-15 00:20:57 +0000 | [diff] [blame] | 83 | const Slice& smallest_user_key, |
| 84 | const Slice& largest_user_key); |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 85 | |
| 86 | int NumFiles(int level) const { return files_[level].size(); } |
| 87 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 88 | // Return a human readable string that describes this version's contents. |
| 89 | std::string DebugString() const; |
| 90 | |
| 91 | private: |
| 92 | friend class Compaction; |
| 93 | friend class VersionSet; |
| 94 | |
| 95 | class LevelFileNumIterator; |
| 96 | Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; |
| 97 | |
| 98 | VersionSet* vset_; // VersionSet to which this Version belongs |
| 99 | Version* next_; // Next version in linked list |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 100 | Version* prev_; // Previous version in linked list |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 101 | int refs_; // Number of live refs to this version |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 102 | |
| 103 | // List of files per level |
| 104 | std::vector<FileMetaData*> files_[config::kNumLevels]; |
| 105 | |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 106 | // Next file to compact based on seek stats. |
| 107 | FileMetaData* file_to_compact_; |
| 108 | int file_to_compact_level_; |
| 109 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 110 | // Level that should be compacted next and its compaction score. |
| 111 | // Score < 1 means compaction is not strictly needed. These fields |
| 112 | // are initialized by Finalize(). |
| 113 | double compaction_score_; |
| 114 | int compaction_level_; |
| 115 | |
| 116 | explicit Version(VersionSet* vset) |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 117 | : vset_(vset), next_(this), prev_(this), refs_(0), |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 118 | file_to_compact_(NULL), |
| 119 | file_to_compact_level_(-1), |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 120 | compaction_score_(-1), |
| 121 | compaction_level_(-1) { |
| 122 | } |
| 123 | |
| 124 | ~Version(); |
| 125 | |
| 126 | // No copying allowed |
| 127 | Version(const Version&); |
| 128 | void operator=(const Version&); |
| 129 | }; |
| 130 | |
| 131 | class VersionSet { |
| 132 | public: |
| 133 | VersionSet(const std::string& dbname, |
| 134 | const Options* options, |
| 135 | TableCache* table_cache, |
| 136 | const InternalKeyComparator*); |
| 137 | ~VersionSet(); |
| 138 | |
| 139 | // Apply *edit to the current version to form a new descriptor that |
| 140 | // is both saved to persistent state and installed as the new |
gabor@google.com | 7263023 | 2011-09-01 19:08:02 +0000 | [diff] [blame^] | 141 | // current version. Will release *mu while actually writing to the file. |
| 142 | // REQUIRES: *mu is held on entry. |
| 143 | // REQUIRES: no other thread concurrently calls LogAndApply() |
| 144 | Status LogAndApply(VersionEdit* edit, port::Mutex* mu); |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 145 | |
| 146 | // Recover the last saved descriptor from persistent storage. |
dgrogan@chromium.org | f779e7a | 2011-04-12 19:38:58 +0000 | [diff] [blame] | 147 | Status Recover(); |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 148 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 149 | // Return the current version. |
| 150 | Version* current() const { return current_; } |
| 151 | |
| 152 | // Return the current manifest file number |
| 153 | uint64_t ManifestFileNumber() const { return manifest_file_number_; } |
| 154 | |
| 155 | // Allocate and return a new file number |
| 156 | uint64_t NewFileNumber() { return next_file_number_++; } |
| 157 | |
| 158 | // Return the number of Table files at the specified level. |
| 159 | int NumLevelFiles(int level) const; |
| 160 | |
dgrogan@chromium.org | f779e7a | 2011-04-12 19:38:58 +0000 | [diff] [blame] | 161 | // Return the combined file size of all files at the specified level. |
| 162 | int64_t NumLevelBytes(int level) const; |
| 163 | |
| 164 | // Return the last sequence number. |
| 165 | uint64_t LastSequence() const { return last_sequence_; } |
| 166 | |
| 167 | // Set the last sequence number to s. |
| 168 | void SetLastSequence(uint64_t s) { |
| 169 | assert(s >= last_sequence_); |
| 170 | last_sequence_ = s; |
| 171 | } |
| 172 | |
gabor@google.com | 7263023 | 2011-09-01 19:08:02 +0000 | [diff] [blame^] | 173 | // Mark the specified file number as used. |
| 174 | void MarkFileNumberUsed(uint64_t number); |
| 175 | |
dgrogan@chromium.org | f779e7a | 2011-04-12 19:38:58 +0000 | [diff] [blame] | 176 | // Return the current log file number. |
| 177 | uint64_t LogNumber() const { return log_number_; } |
| 178 | |
| 179 | // Return the log file number for the log file that is currently |
| 180 | // being compacted, or zero if there is no such log file. |
| 181 | uint64_t PrevLogNumber() const { return prev_log_number_; } |
| 182 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 183 | // Pick level and inputs for a new compaction. |
| 184 | // Returns NULL if there is no compaction to be done. |
| 185 | // Otherwise returns a pointer to a heap-allocated object that |
| 186 | // describes the compaction. Caller should delete the result. |
| 187 | Compaction* PickCompaction(); |
| 188 | |
| 189 | // Return a compaction object for compacting the range [begin,end] in |
| 190 | // the specified level. Returns NULL if there is nothing in that |
| 191 | // level that overlaps the specified range. Caller should delete |
| 192 | // the result. |
| 193 | Compaction* CompactRange( |
| 194 | int level, |
| 195 | const InternalKey& begin, |
| 196 | const InternalKey& end); |
| 197 | |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 198 | // Return the maximum overlapping data (in bytes) at next level for any |
| 199 | // file at a level >= 1. |
jorlow@chromium.org | 8303bb1 | 2011-03-22 23:24:02 +0000 | [diff] [blame] | 200 | int64_t MaxNextLevelOverlappingBytes(); |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 201 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 202 | // Create an iterator that reads over the compaction inputs for "*c". |
| 203 | // The caller should delete the iterator when no longer needed. |
| 204 | Iterator* MakeInputIterator(Compaction* c); |
| 205 | |
| 206 | // Returns true iff some level needs a compaction. |
gabor@google.com | ccf0fcd | 2011-06-22 02:36:45 +0000 | [diff] [blame] | 207 | bool NeedsCompaction() const { |
| 208 | Version* v = current_; |
| 209 | return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); |
| 210 | } |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 211 | |
| 212 | // Add all files listed in any live version to *live. |
| 213 | // May also mutate some internal state. |
| 214 | void AddLiveFiles(std::set<uint64_t>* live); |
| 215 | |
| 216 | // Return the approximate offset in the database of the data for |
| 217 | // "key" as of version "v". |
| 218 | uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); |
| 219 | |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 220 | // Return a human-readable short (single-line) summary of the number |
| 221 | // of files per level. Uses *scratch as backing store. |
| 222 | struct LevelSummaryStorage { |
| 223 | char buffer[100]; |
| 224 | }; |
| 225 | const char* LevelSummary(LevelSummaryStorage* scratch) const; |
| 226 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 227 | private: |
| 228 | class Builder; |
| 229 | |
| 230 | friend class Compaction; |
| 231 | friend class Version; |
| 232 | |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 233 | void Finalize(Version* v); |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 234 | |
| 235 | void GetOverlappingInputs( |
| 236 | int level, |
| 237 | const InternalKey& begin, |
| 238 | const InternalKey& end, |
| 239 | std::vector<FileMetaData*>* inputs); |
| 240 | |
| 241 | void GetRange(const std::vector<FileMetaData*>& inputs, |
| 242 | InternalKey* smallest, |
| 243 | InternalKey* largest); |
| 244 | |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 245 | void GetRange2(const std::vector<FileMetaData*>& inputs1, |
| 246 | const std::vector<FileMetaData*>& inputs2, |
| 247 | InternalKey* smallest, |
| 248 | InternalKey* largest); |
| 249 | |
| 250 | void SetupOtherInputs(Compaction* c); |
| 251 | |
gabor@google.com | 7263023 | 2011-09-01 19:08:02 +0000 | [diff] [blame^] | 252 | // Save current contents to *log |
| 253 | Status WriteSnapshot(log::Writer* log); |
| 254 | |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 255 | void AppendVersion(Version* v); |
| 256 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 257 | Env* const env_; |
| 258 | const std::string dbname_; |
| 259 | const Options* const options_; |
| 260 | TableCache* const table_cache_; |
| 261 | const InternalKeyComparator icmp_; |
| 262 | uint64_t next_file_number_; |
| 263 | uint64_t manifest_file_number_; |
dgrogan@chromium.org | f779e7a | 2011-04-12 19:38:58 +0000 | [diff] [blame] | 264 | uint64_t last_sequence_; |
| 265 | uint64_t log_number_; |
| 266 | uint64_t prev_log_number_; // 0 or backing store for memtable being compacted |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 267 | |
| 268 | // Opened lazily |
| 269 | WritableFile* descriptor_file_; |
| 270 | log::Writer* descriptor_log_; |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 271 | Version dummy_versions_; // Head of circular doubly-linked list of versions. |
| 272 | Version* current_; // == dummy_versions_.prev_ |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 273 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 274 | // Per-level key at which the next compaction at that level should start. |
| 275 | // Either an empty string, or a valid InternalKey. |
| 276 | std::string compact_pointer_[config::kNumLevels]; |
| 277 | |
| 278 | // No copying allowed |
| 279 | VersionSet(const VersionSet&); |
| 280 | void operator=(const VersionSet&); |
| 281 | }; |
| 282 | |
| 283 | // A Compaction encapsulates information about a compaction. |
| 284 | class Compaction { |
| 285 | public: |
| 286 | ~Compaction(); |
| 287 | |
| 288 | // Return the level that is being compacted. Inputs from "level" |
| 289 | // and "level+1" will be merged to produce a set of "level+1" files. |
| 290 | int level() const { return level_; } |
| 291 | |
| 292 | // Return the object that holds the edits to the descriptor done |
| 293 | // by this compaction. |
| 294 | VersionEdit* edit() { return &edit_; } |
| 295 | |
| 296 | // "which" must be either 0 or 1 |
| 297 | int num_input_files(int which) const { return inputs_[which].size(); } |
| 298 | |
| 299 | // Return the ith input file at "level()+which" ("which" must be 0 or 1). |
| 300 | FileMetaData* input(int which, int i) const { return inputs_[which][i]; } |
| 301 | |
| 302 | // Maximum size of files to build during this compaction. |
| 303 | uint64_t MaxOutputFileSize() const { return max_output_file_size_; } |
| 304 | |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 305 | // Is this a trivial compaction that can be implemented by just |
| 306 | // moving a single input file to the next level (no merging or splitting) |
| 307 | bool IsTrivialMove() const; |
| 308 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 309 | // Add all inputs to this compaction as delete operations to *edit. |
| 310 | void AddInputDeletions(VersionEdit* edit); |
| 311 | |
| 312 | // Returns true if the information we have available guarantees that |
| 313 | // the compaction is producing data in "level+1" for which no data exists |
| 314 | // in levels greater than "level+1". |
| 315 | bool IsBaseLevelForKey(const Slice& user_key); |
| 316 | |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 317 | // Returns true iff we should stop building the current output |
dgrogan@chromium.org | da79909 | 2011-05-21 02:17:43 +0000 | [diff] [blame] | 318 | // before processing "internal_key". |
| 319 | bool ShouldStopBefore(const Slice& internal_key); |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 320 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 321 | // Release the input version for the compaction, once the compaction |
| 322 | // is successful. |
| 323 | void ReleaseInputs(); |
| 324 | |
| 325 | private: |
| 326 | friend class Version; |
| 327 | friend class VersionSet; |
| 328 | |
| 329 | explicit Compaction(int level); |
| 330 | |
| 331 | int level_; |
| 332 | uint64_t max_output_file_size_; |
| 333 | Version* input_version_; |
| 334 | VersionEdit edit_; |
| 335 | |
| 336 | // Each compaction reads inputs from "level_" and "level_+1" |
| 337 | std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs |
| 338 | |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 339 | // State used to check for number of of overlapping grandparent files |
| 340 | // (parent == level_ + 1, grandparent == level_ + 2) |
| 341 | std::vector<FileMetaData*> grandparents_; |
dgrogan@chromium.org | ba6dac0 | 2011-04-20 22:48:11 +0000 | [diff] [blame] | 342 | size_t grandparent_index_; // Index in grandparent_starts_ |
jorlow@chromium.org | 8303bb1 | 2011-03-22 23:24:02 +0000 | [diff] [blame] | 343 | bool seen_key_; // Some output key has been seen |
| 344 | int64_t overlapped_bytes_; // Bytes of overlap between current output |
| 345 | // and grandparent files |
jorlow@chromium.org | 13b72af | 2011-03-22 18:32:49 +0000 | [diff] [blame] | 346 | |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 347 | // State for implementing IsBaseLevelForKey |
| 348 | |
| 349 | // level_ptrs_ holds indices into input_version_->levels_: our state |
| 350 | // is that we are positioned at one of the file ranges for each |
| 351 | // higher level than the ones involved in this compaction (i.e. for |
| 352 | // all L >= level_ + 2). |
dgrogan@chromium.org | ba6dac0 | 2011-04-20 22:48:11 +0000 | [diff] [blame] | 353 | size_t level_ptrs_[config::kNumLevels]; |
jorlow@chromium.org | f67e15e | 2011-03-18 22:37:00 +0000 | [diff] [blame] | 354 | }; |
| 355 | |
| 356 | } |
| 357 | |
| 358 | #endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ |