blob: 34688622c4da3eb518a9dd8b43f3b64272c38d7c [file] [log] [blame]
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5#include "db/db_impl.h"
6
costan0fa5a4f2018-03-16 10:06:35 -07007#include <stdint.h>
8#include <stdio.h>
9
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000010#include <algorithm>
costan7d8e41e2019-03-11 13:04:53 -070011#include <atomic>
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000012#include <set>
13#include <string>
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000014#include <vector>
costan0fa5a4f2018-03-16 10:06:35 -070015
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000016#include "db/builder.h"
17#include "db/db_iter.h"
18#include "db/dbformat.h"
19#include "db/filename.h"
20#include "db/log_reader.h"
21#include "db/log_writer.h"
22#include "db/memtable.h"
23#include "db/table_cache.h"
24#include "db/version_set.h"
25#include "db/write_batch_internal.h"
jorlow@chromium.org4671a692011-03-30 18:35:40 +000026#include "leveldb/db.h"
27#include "leveldb/env.h"
28#include "leveldb/status.h"
29#include "leveldb/table.h"
30#include "leveldb/table_builder.h"
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000031#include "port/port.h"
32#include "table/block.h"
33#include "table/merger.h"
34#include "table/two_level_iterator.h"
35#include "util/coding.h"
36#include "util/logging.h"
37#include "util/mutexlock.h"
38
39namespace leveldb {
40
David Grogan7b094f12013-06-13 16:14:06 -070041const int kNumNonTableCacheFiles = 10;
42
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -080043// Information kept for every waiting writer
44struct DBImpl::Writer {
45 Status status;
46 WriteBatch* batch;
47 bool sync;
48 bool done;
49 port::CondVar cv;
50
51 explicit Writer(port::Mutex* mu) : cv(mu) { }
52};
53
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000054struct DBImpl::CompactionState {
55 Compaction* const compaction;
56
57 // Sequence numbers < smallest_snapshot are not significant since we
58 // will never have to service a snapshot below smallest_snapshot.
59 // Therefore if we have seen a sequence number S <= smallest_snapshot,
60 // we can drop all entries for the same key with sequence numbers < S.
61 SequenceNumber smallest_snapshot;
62
63 // Files produced by compaction
64 struct Output {
65 uint64_t number;
66 uint64_t file_size;
67 InternalKey smallest, largest;
68 };
69 std::vector<Output> outputs;
70
71 // State kept for output being generated
72 WritableFile* outfile;
73 TableBuilder* builder;
74
75 uint64_t total_bytes;
76
77 Output* current_output() { return &outputs[outputs.size()-1]; }
78
79 explicit CompactionState(Compaction* c)
80 : compaction(c),
costan09217fd2018-04-10 16:18:06 -070081 outfile(nullptr),
82 builder(nullptr),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000083 total_bytes(0) {
84 }
85};
86
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000087// Fix user-supplied options to be reasonable
costan0fa5a4f2018-03-16 10:06:35 -070088template <class T, class V>
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000089static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +000090 if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
91 if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000092}
93Options SanitizeOptions(const std::string& dbname,
94 const InternalKeyComparator* icmp,
Sanjay Ghemawat85584d42012-04-17 08:36:46 -070095 const InternalFilterPolicy* ipolicy,
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +000096 const Options& src) {
97 Options result = src;
98 result.comparator = icmp;
costan09217fd2018-04-10 16:18:06 -070099 result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
David Grogan7b094f12013-06-13 16:14:06 -0700100 ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
101 ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
corradoa2fb0862016-09-27 04:50:38 -0700102 ClipToRange(&result.max_file_size, 1<<20, 1<<30);
David Grogan7b094f12013-06-13 16:14:06 -0700103 ClipToRange(&result.block_size, 1<<10, 4<<20);
costan09217fd2018-04-10 16:18:06 -0700104 if (result.info_log == nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000105 // Open a log file in the same directory as the db
106 src.env->CreateDir(dbname); // In case it does not exist
107 src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
gabor@google.com60bd8012011-07-21 02:40:18 +0000108 Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000109 if (!s.ok()) {
110 // No place suitable for logging
costan09217fd2018-04-10 16:18:06 -0700111 result.info_log = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000112 }
113 }
costan09217fd2018-04-10 16:18:06 -0700114 if (result.block_cache == nullptr) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000115 result.block_cache = NewLRUCache(8 << 20);
116 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000117 return result;
118}
119
costan0fa5a4f2018-03-16 10:06:35 -0700120static int TableCacheSize(const Options& sanitized_options) {
121 // Reserve ten files or so for other uses and give the rest to TableCache.
122 return sanitized_options.max_open_files - kNumNonTableCacheFiles;
123}
124
David Grogan748539c2013-08-21 11:12:47 -0700125DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
126 : env_(raw_options.env),
127 internal_comparator_(raw_options.comparator),
128 internal_filter_policy_(raw_options.filter_policy),
129 options_(SanitizeOptions(dbname, &internal_comparator_,
130 &internal_filter_policy_, raw_options)),
131 owns_info_log_(options_.info_log != raw_options.info_log),
132 owns_cache_(options_.block_cache != raw_options.block_cache),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000133 dbname_(dbname),
costan0fa5a4f2018-03-16 10:06:35 -0700134 table_cache_(new TableCache(dbname_, options_, TableCacheSize(options_))),
costan09217fd2018-04-10 16:18:06 -0700135 db_lock_(nullptr),
costan7d8e41e2019-03-11 13:04:53 -0700136 shutting_down_(false),
costan0fa5a4f2018-03-16 10:06:35 -0700137 background_work_finished_signal_(&mutex_),
costan09217fd2018-04-10 16:18:06 -0700138 mem_(nullptr),
139 imm_(nullptr),
costan7d8e41e2019-03-11 13:04:53 -0700140 has_imm_(false),
costan09217fd2018-04-10 16:18:06 -0700141 logfile_(nullptr),
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000142 logfile_number_(0),
costan09217fd2018-04-10 16:18:06 -0700143 log_(nullptr),
David Grogan748539c2013-08-21 11:12:47 -0700144 seed_(0),
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -0800145 tmp_batch_(new WriteBatch),
costan0fa5a4f2018-03-16 10:06:35 -0700146 background_compaction_scheduled_(false),
costan09217fd2018-04-10 16:18:06 -0700147 manual_compaction_(nullptr),
costan0fa5a4f2018-03-16 10:06:35 -0700148 versions_(new VersionSet(dbname_, &options_, table_cache_,
costan7d8e41e2019-03-11 13:04:53 -0700149 &internal_comparator_)) {}
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000150
151DBImpl::~DBImpl() {
costan7d8e41e2019-03-11 13:04:53 -0700152 // Wait for background work to finish.
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000153 mutex_.Lock();
costan7d8e41e2019-03-11 13:04:53 -0700154 shutting_down_.store(true, std::memory_order_release);
costan0fa5a4f2018-03-16 10:06:35 -0700155 while (background_compaction_scheduled_) {
156 background_work_finished_signal_.Wait();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000157 }
158 mutex_.Unlock();
159
costan09217fd2018-04-10 16:18:06 -0700160 if (db_lock_ != nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000161 env_->UnlockFile(db_lock_);
162 }
163
164 delete versions_;
costan09217fd2018-04-10 16:18:06 -0700165 if (mem_ != nullptr) mem_->Unref();
166 if (imm_ != nullptr) imm_->Unref();
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -0800167 delete tmp_batch_;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000168 delete log_;
169 delete logfile_;
170 delete table_cache_;
171
172 if (owns_info_log_) {
173 delete options_.info_log;
174 }
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000175 if (owns_cache_) {
176 delete options_.block_cache;
177 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000178}
179
180Status DBImpl::NewDB() {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000181 VersionEdit new_db;
182 new_db.SetComparatorName(user_comparator()->Name());
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000183 new_db.SetLogNumber(0);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000184 new_db.SetNextFile(2);
185 new_db.SetLastSequence(0);
186
187 const std::string manifest = DescriptorFileName(dbname_, 1);
188 WritableFile* file;
189 Status s = env_->NewWritableFile(manifest, &file);
190 if (!s.ok()) {
191 return s;
192 }
193 {
194 log::Writer log(file);
195 std::string record;
196 new_db.EncodeTo(&record);
197 s = log.AddRecord(record);
198 if (s.ok()) {
199 s = file->Close();
200 }
201 }
202 delete file;
203 if (s.ok()) {
204 // Make "CURRENT" file that points to the new manifest file.
205 s = SetCurrentFile(env_, dbname_, 1);
206 } else {
207 env_->DeleteFile(manifest);
208 }
209 return s;
210}
211
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000212void DBImpl::MaybeIgnoreError(Status* s) const {
213 if (s->ok() || options_.paranoid_checks) {
214 // No change needed
215 } else {
gabor@google.com60bd8012011-07-21 02:40:18 +0000216 Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000217 *s = Status::OK();
218 }
219}
220
221void DBImpl::DeleteObsoleteFiles() {
costan0fa5a4f2018-03-16 10:06:35 -0700222 mutex_.AssertHeld();
223
David Grogan0cfb9902013-12-10 10:36:31 -0800224 if (!bg_error_.ok()) {
225 // After a background error, we don't know whether a new version may
226 // or may not have been committed, so we cannot safely garbage collect.
227 return;
228 }
229
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000230 // Make a set of all of the live files
231 std::set<uint64_t> live = pending_outputs_;
232 versions_->AddLiveFiles(&live);
233
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000234 std::vector<std::string> filenames;
costan0fa5a4f2018-03-16 10:06:35 -0700235 env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000236 uint64_t number;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000237 FileType type;
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +0000238 for (size_t i = 0; i < filenames.size(); i++) {
239 if (ParseFileName(filenames[i], &number, &type)) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000240 bool keep = true;
241 switch (type) {
242 case kLogFile:
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000243 keep = ((number >= versions_->LogNumber()) ||
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000244 (number == versions_->PrevLogNumber()));
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000245 break;
246 case kDescriptorFile:
247 // Keep my manifest file, and any newer incarnations'
248 // (in case there is a race that allows other incarnations)
249 keep = (number >= versions_->ManifestFileNumber());
250 break;
251 case kTableFile:
252 keep = (live.find(number) != live.end());
253 break;
254 case kTempFile:
255 // Any temp files that are currently being written to must
256 // be recorded in pending_outputs_, which is inserted into "live"
257 keep = (live.find(number) != live.end());
258 break;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000259 case kCurrentFile:
260 case kDBLockFile:
261 case kInfoLogFile:
262 keep = true;
263 break;
264 }
265
266 if (!keep) {
267 if (type == kTableFile) {
268 table_cache_->Evict(number);
269 }
gabor@google.com60bd8012011-07-21 02:40:18 +0000270 Log(options_.info_log, "Delete type=%d #%lld\n",
costan0fa5a4f2018-03-16 10:06:35 -0700271 static_cast<int>(type),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000272 static_cast<unsigned long long>(number));
273 env_->DeleteFile(dbname_ + "/" + filenames[i]);
274 }
275 }
276 }
277}
278
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800279Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000280 mutex_.AssertHeld();
281
282 // Ignore error from CreateDir since the creation of the DB is
283 // committed only when the descriptor is created, and this directory
284 // may already exist from a previous failed creation attempt.
285 env_->CreateDir(dbname_);
costan09217fd2018-04-10 16:18:06 -0700286 assert(db_lock_ == nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000287 Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
288 if (!s.ok()) {
289 return s;
290 }
291
292 if (!env_->FileExists(CurrentFileName(dbname_))) {
293 if (options_.create_if_missing) {
294 s = NewDB();
295 if (!s.ok()) {
296 return s;
297 }
298 } else {
299 return Status::InvalidArgument(
300 dbname_, "does not exist (create_if_missing is false)");
301 }
302 } else {
303 if (options_.error_if_exists) {
304 return Status::InvalidArgument(
305 dbname_, "exists (error_if_exists is true)");
306 }
307 }
308
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800309 s = versions_->Recover(save_manifest);
310 if (!s.ok()) {
311 return s;
312 }
313 SequenceNumber max_sequence(0);
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000314
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800315 // Recover from all newer log files than the ones named in the
316 // descriptor (new log files may have been added by the previous
317 // incarnation without registering them in the descriptor).
318 //
319 // Note that PrevLogNumber() is no longer used, but we pay
320 // attention to it in case we are recovering a database
321 // produced by an older version of leveldb.
322 const uint64_t min_log = versions_->LogNumber();
323 const uint64_t prev_log = versions_->PrevLogNumber();
324 std::vector<std::string> filenames;
325 s = env_->GetChildren(dbname_, &filenames);
326 if (!s.ok()) {
327 return s;
328 }
329 std::set<uint64_t> expected;
330 versions_->AddLiveFiles(&expected);
331 uint64_t number;
332 FileType type;
333 std::vector<uint64_t> logs;
334 for (size_t i = 0; i < filenames.size(); i++) {
335 if (ParseFileName(filenames[i], &number, &type)) {
336 expected.erase(number);
337 if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
338 logs.push_back(number);
339 }
340 }
341 if (!expected.empty()) {
342 char buf[50];
343 snprintf(buf, sizeof(buf), "%d missing files; e.g.",
344 static_cast<int>(expected.size()));
345 return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
346 }
347
348 // Recover in the order in which the logs were generated
349 std::sort(logs.begin(), logs.end());
350 for (size_t i = 0; i < logs.size(); i++) {
351 s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
352 &max_sequence);
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000353 if (!s.ok()) {
354 return s;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000355 }
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000356
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800357 // The previous incarnation may not have written any MANIFEST
358 // records after allocating this log number. So we manually
359 // update the file number allocation counter in VersionSet.
360 versions_->MarkFileNumberUsed(logs[i]);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000361 }
362
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800363 if (versions_->LastSequence() < max_sequence) {
364 versions_->SetLastSequence(max_sequence);
365 }
366
367 return Status::OK();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000368}
369
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800370Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
371 bool* save_manifest, VersionEdit* edit,
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000372 SequenceNumber* max_sequence) {
373 struct LogReporter : public log::Reader::Reporter {
374 Env* env;
gabor@google.com60bd8012011-07-21 02:40:18 +0000375 Logger* info_log;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000376 const char* fname;
costan09217fd2018-04-10 16:18:06 -0700377 Status* status; // null if options_.paranoid_checks==false
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000378 virtual void Corruption(size_t bytes, const Status& s) {
gabor@google.com60bd8012011-07-21 02:40:18 +0000379 Log(info_log, "%s%s: dropping %d bytes; %s",
costan09217fd2018-04-10 16:18:06 -0700380 (this->status == nullptr ? "(ignoring error) " : ""),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000381 fname, static_cast<int>(bytes), s.ToString().c_str());
costan09217fd2018-04-10 16:18:06 -0700382 if (this->status != nullptr && this->status->ok()) *this->status = s;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000383 }
384 };
385
386 mutex_.AssertHeld();
387
388 // Open the log file
389 std::string fname = LogFileName(dbname_, log_number);
390 SequentialFile* file;
391 Status status = env_->NewSequentialFile(fname, &file);
392 if (!status.ok()) {
393 MaybeIgnoreError(&status);
394 return status;
395 }
396
397 // Create the log reader.
398 LogReporter reporter;
399 reporter.env = env_;
400 reporter.info_log = options_.info_log;
401 reporter.fname = fname.c_str();
costan09217fd2018-04-10 16:18:06 -0700402 reporter.status = (options_.paranoid_checks ? &status : nullptr);
Chris Mumford803d6922014-09-16 14:19:52 -0700403 // We intentionally make log::Reader do checksumming even if
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000404 // paranoid_checks==false so that corruptions cause entire commits
405 // to be skipped instead of propagating bad information (like overly
406 // large sequence numbers).
dgrogan@chromium.orgda799092011-05-21 02:17:43 +0000407 log::Reader reader(file, &reporter, true/*checksum*/,
408 0/*initial_offset*/);
gabor@google.com60bd8012011-07-21 02:40:18 +0000409 Log(options_.info_log, "Recovering log #%llu",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000410 (unsigned long long) log_number);
411
412 // Read all the records and add to a memtable
413 std::string scratch;
414 Slice record;
415 WriteBatch batch;
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800416 int compactions = 0;
costan09217fd2018-04-10 16:18:06 -0700417 MemTable* mem = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000418 while (reader.ReadRecord(&record, &scratch) &&
419 status.ok()) {
420 if (record.size() < 12) {
421 reporter.Corruption(
422 record.size(), Status::Corruption("log record too small"));
423 continue;
424 }
425 WriteBatchInternal::SetContents(&batch, record);
426
costan09217fd2018-04-10 16:18:06 -0700427 if (mem == nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000428 mem = new MemTable(internal_comparator_);
dgrogan@chromium.orgda799092011-05-21 02:17:43 +0000429 mem->Ref();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000430 }
431 status = WriteBatchInternal::InsertInto(&batch, mem);
432 MaybeIgnoreError(&status);
433 if (!status.ok()) {
434 break;
435 }
436 const SequenceNumber last_seq =
437 WriteBatchInternal::Sequence(&batch) +
438 WriteBatchInternal::Count(&batch) - 1;
439 if (last_seq > *max_sequence) {
440 *max_sequence = last_seq;
441 }
442
443 if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800444 compactions++;
445 *save_manifest = true;
costan09217fd2018-04-10 16:18:06 -0700446 status = WriteLevel0Table(mem, edit, nullptr);
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800447 mem->Unref();
costan09217fd2018-04-10 16:18:06 -0700448 mem = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000449 if (!status.ok()) {
450 // Reflect errors immediately so that conditions like full
451 // file-systems cause the DB::Open() to fail.
452 break;
453 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000454 }
455 }
456
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800457 delete file;
458
459 // See if we should keep reusing the last log file.
460 if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
costan09217fd2018-04-10 16:18:06 -0700461 assert(logfile_ == nullptr);
462 assert(log_ == nullptr);
463 assert(mem_ == nullptr);
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800464 uint64_t lfile_size;
465 if (env_->GetFileSize(fname, &lfile_size).ok() &&
466 env_->NewAppendableFile(fname, &logfile_).ok()) {
467 Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
468 log_ = new log::Writer(logfile_, lfile_size);
469 logfile_number_ = log_number;
costan09217fd2018-04-10 16:18:06 -0700470 if (mem != nullptr) {
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800471 mem_ = mem;
costan09217fd2018-04-10 16:18:06 -0700472 mem = nullptr;
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800473 } else {
costan09217fd2018-04-10 16:18:06 -0700474 // mem can be nullptr if lognum exists but was empty.
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800475 mem_ = new MemTable(internal_comparator_);
476 mem_->Ref();
477 }
478 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000479 }
480
costan09217fd2018-04-10 16:18:06 -0700481 if (mem != nullptr) {
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800482 // mem did not get reused; compact it.
483 if (status.ok()) {
484 *save_manifest = true;
costan09217fd2018-04-10 16:18:06 -0700485 status = WriteLevel0Table(mem, edit, nullptr);
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -0800486 }
487 mem->Unref();
488 }
489
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000490 return status;
491}
492
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000493Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
494 Version* base) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000495 mutex_.AssertHeld();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000496 const uint64_t start_micros = env_->NowMicros();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000497 FileMetaData meta;
498 meta.number = versions_->NewFileNumber();
499 pending_outputs_.insert(meta.number);
500 Iterator* iter = mem->NewIterator();
gabor@google.com60bd8012011-07-21 02:40:18 +0000501 Log(options_.info_log, "Level-0 table #%llu: started",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000502 (unsigned long long) meta.number);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000503
504 Status s;
505 {
506 mutex_.Unlock();
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000507 s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000508 mutex_.Lock();
509 }
510
gabor@google.com60bd8012011-07-21 02:40:18 +0000511 Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000512 (unsigned long long) meta.number,
513 (unsigned long long) meta.file_size,
514 s.ToString().c_str());
515 delete iter;
516 pending_outputs_.erase(meta.number);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000517
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000518
519 // Note that if file_size is zero, the file has been deleted and
520 // should not be added to the manifest.
521 int level = 0;
522 if (s.ok() && meta.file_size > 0) {
gabor@google.com6699c7e2011-07-15 00:20:57 +0000523 const Slice min_user_key = meta.smallest.user_key();
524 const Slice max_user_key = meta.largest.user_key();
costan09217fd2018-04-10 16:18:06 -0700525 if (base != nullptr) {
Gabor Cselle299cced2011-10-05 16:30:28 -0700526 level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000527 }
528 edit->AddFile(level, meta.number, meta.file_size,
529 meta.smallest, meta.largest);
530 }
531
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000532 CompactionStats stats;
533 stats.micros = env_->NowMicros() - start_micros;
534 stats.bytes_written = meta.file_size;
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000535 stats_[level].Add(stats);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000536 return s;
537}
538
David Grogan0cfb9902013-12-10 10:36:31 -0800539void DBImpl::CompactMemTable() {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000540 mutex_.AssertHeld();
costan09217fd2018-04-10 16:18:06 -0700541 assert(imm_ != nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000542
543 // Save the contents of the memtable as a new Table
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000544 VersionEdit edit;
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000545 Version* base = versions_->current();
546 base->Ref();
547 Status s = WriteLevel0Table(imm_, &edit, base);
548 base->Unref();
549
costan7d8e41e2019-03-11 13:04:53 -0700550 if (s.ok() && shutting_down_.load(std::memory_order_acquire)) {
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000551 s = Status::IOError("Deleting DB during memtable compaction");
552 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000553
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000554 // Replace immutable memtable with the generated Table
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000555 if (s.ok()) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000556 edit.SetPrevLogNumber(0);
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000557 edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
gabor@google.com72630232011-09-01 19:08:02 +0000558 s = versions_->LogAndApply(&edit, &mutex_);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000559 }
560
561 if (s.ok()) {
562 // Commit to the new state
dgrogan@chromium.orgda799092011-05-21 02:17:43 +0000563 imm_->Unref();
costan09217fd2018-04-10 16:18:06 -0700564 imm_ = nullptr;
costan7d8e41e2019-03-11 13:04:53 -0700565 has_imm_.store(false, std::memory_order_release);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000566 DeleteObsoleteFiles();
David Grogan0cfb9902013-12-10 10:36:31 -0800567 } else {
568 RecordBackgroundError(s);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000569 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000570}
571
Gabor Cselle299cced2011-10-05 16:30:28 -0700572void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
573 int max_level_with_files = 1;
574 {
575 MutexLock l(&mutex_);
576 Version* base = versions_->current();
577 for (int level = 1; level < config::kNumLevels; level++) {
578 if (base->OverlapInLevel(level, begin, end)) {
579 max_level_with_files = level;
580 }
581 }
582 }
costan0fa5a4f2018-03-16 10:06:35 -0700583 TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
Gabor Cselle299cced2011-10-05 16:30:28 -0700584 for (int level = 0; level < max_level_with_files; level++) {
585 TEST_CompactRange(level, begin, end);
586 }
587}
588
costan0fa5a4f2018-03-16 10:06:35 -0700589void DBImpl::TEST_CompactRange(int level, const Slice* begin,
590 const Slice* end) {
gabor@google.comccf0fcd2011-06-22 02:36:45 +0000591 assert(level >= 0);
592 assert(level + 1 < config::kNumLevels);
593
Gabor Cselle299cced2011-10-05 16:30:28 -0700594 InternalKey begin_storage, end_storage;
595
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000596 ManualCompaction manual;
597 manual.level = level;
Gabor Cselle299cced2011-10-05 16:30:28 -0700598 manual.done = false;
costan09217fd2018-04-10 16:18:06 -0700599 if (begin == nullptr) {
600 manual.begin = nullptr;
Gabor Cselle299cced2011-10-05 16:30:28 -0700601 } else {
602 begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
603 manual.begin = &begin_storage;
604 }
costan09217fd2018-04-10 16:18:06 -0700605 if (end == nullptr) {
606 manual.end = nullptr;
Gabor Cselle299cced2011-10-05 16:30:28 -0700607 } else {
608 end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
609 manual.end = &end_storage;
610 }
611
612 MutexLock l(&mutex_);
costan7d8e41e2019-03-11 13:04:53 -0700613 while (!manual.done && !shutting_down_.load(std::memory_order_acquire) &&
614 bg_error_.ok()) {
costan09217fd2018-04-10 16:18:06 -0700615 if (manual_compaction_ == nullptr) { // Idle
David Grogan0cfb9902013-12-10 10:36:31 -0800616 manual_compaction_ = &manual;
617 MaybeScheduleCompaction();
618 } else { // Running either my compaction or another compaction.
costan0fa5a4f2018-03-16 10:06:35 -0700619 background_work_finished_signal_.Wait();
Gabor Cselle299cced2011-10-05 16:30:28 -0700620 }
David Grogan0cfb9902013-12-10 10:36:31 -0800621 }
622 if (manual_compaction_ == &manual) {
623 // Cancel my manual compaction since we aborted early for some reason.
costan09217fd2018-04-10 16:18:06 -0700624 manual_compaction_ = nullptr;
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000625 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000626}
627
628Status DBImpl::TEST_CompactMemTable() {
costan09217fd2018-04-10 16:18:06 -0700629 // nullptr batch means just wait for earlier writes to be done
630 Status s = Write(WriteOptions(), nullptr);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000631 if (s.ok()) {
632 // Wait until the compaction completes
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -0800633 MutexLock l(&mutex_);
costan09217fd2018-04-10 16:18:06 -0700634 while (imm_ != nullptr && bg_error_.ok()) {
costan0fa5a4f2018-03-16 10:06:35 -0700635 background_work_finished_signal_.Wait();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000636 }
costan09217fd2018-04-10 16:18:06 -0700637 if (imm_ != nullptr) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000638 s = bg_error_;
639 }
640 }
641 return s;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000642}
643
David Grogan0cfb9902013-12-10 10:36:31 -0800644void DBImpl::RecordBackgroundError(const Status& s) {
645 mutex_.AssertHeld();
646 if (bg_error_.ok()) {
647 bg_error_ = s;
costan0fa5a4f2018-03-16 10:06:35 -0700648 background_work_finished_signal_.SignalAll();
David Grogan0cfb9902013-12-10 10:36:31 -0800649 }
650}
651
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000652void DBImpl::MaybeScheduleCompaction() {
653 mutex_.AssertHeld();
costan0fa5a4f2018-03-16 10:06:35 -0700654 if (background_compaction_scheduled_) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000655 // Already scheduled
costan7d8e41e2019-03-11 13:04:53 -0700656 } else if (shutting_down_.load(std::memory_order_acquire)) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000657 // DB is being deleted; no more background compactions
David Grogan0cfb9902013-12-10 10:36:31 -0800658 } else if (!bg_error_.ok()) {
659 // Already got an error; no more changes
costan09217fd2018-04-10 16:18:06 -0700660 } else if (imm_ == nullptr &&
661 manual_compaction_ == nullptr &&
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000662 !versions_->NeedsCompaction()) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000663 // No work to be done
664 } else {
costan0fa5a4f2018-03-16 10:06:35 -0700665 background_compaction_scheduled_ = true;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000666 env_->Schedule(&DBImpl::BGWork, this);
667 }
668}
669
670void DBImpl::BGWork(void* db) {
671 reinterpret_cast<DBImpl*>(db)->BackgroundCall();
672}
673
674void DBImpl::BackgroundCall() {
675 MutexLock l(&mutex_);
costan0fa5a4f2018-03-16 10:06:35 -0700676 assert(background_compaction_scheduled_);
costan7d8e41e2019-03-11 13:04:53 -0700677 if (shutting_down_.load(std::memory_order_acquire)) {
David Grogan0cfb9902013-12-10 10:36:31 -0800678 // No more background work when shutting down.
679 } else if (!bg_error_.ok()) {
680 // No more background work after a background error.
681 } else {
682 BackgroundCompaction();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000683 }
Sanjay Ghemawat075a35a2012-05-30 09:45:46 -0700684
costan0fa5a4f2018-03-16 10:06:35 -0700685 background_compaction_scheduled_ = false;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000686
687 // Previous compaction may have produced too many files in a level,
688 // so reschedule another compaction if needed.
689 MaybeScheduleCompaction();
costan0fa5a4f2018-03-16 10:06:35 -0700690 background_work_finished_signal_.SignalAll();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000691}
692
David Grogan0cfb9902013-12-10 10:36:31 -0800693void DBImpl::BackgroundCompaction() {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000694 mutex_.AssertHeld();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000695
costan09217fd2018-04-10 16:18:06 -0700696 if (imm_ != nullptr) {
David Grogan0cfb9902013-12-10 10:36:31 -0800697 CompactMemTable();
698 return;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000699 }
700
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000701 Compaction* c;
costan09217fd2018-04-10 16:18:06 -0700702 bool is_manual = (manual_compaction_ != nullptr);
Gabor Cselle299cced2011-10-05 16:30:28 -0700703 InternalKey manual_end;
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000704 if (is_manual) {
Gabor Cselle299cced2011-10-05 16:30:28 -0700705 ManualCompaction* m = manual_compaction_;
706 c = versions_->CompactRange(m->level, m->begin, m->end);
costan09217fd2018-04-10 16:18:06 -0700707 m->done = (c == nullptr);
708 if (c != nullptr) {
Gabor Cselle299cced2011-10-05 16:30:28 -0700709 manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
710 }
711 Log(options_.info_log,
712 "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000713 m->level,
Gabor Cselle299cced2011-10-05 16:30:28 -0700714 (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
715 (m->end ? m->end->DebugString().c_str() : "(end)"),
716 (m->done ? "(end)" : manual_end.DebugString().c_str()));
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000717 } else {
718 c = versions_->PickCompaction();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000719 }
720
721 Status status;
costan09217fd2018-04-10 16:18:06 -0700722 if (c == nullptr) {
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000723 // Nothing to do
724 } else if (!is_manual && c->IsTrivialMove()) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000725 // Move file to next level
jorlow@chromium.org13b72af2011-03-22 18:32:49 +0000726 assert(c->num_input_files(0) == 1);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000727 FileMetaData* f = c->input(0, 0);
728 c->edit()->DeleteFile(c->level(), f->number);
729 c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
730 f->smallest, f->largest);
gabor@google.com72630232011-09-01 19:08:02 +0000731 status = versions_->LogAndApply(c->edit(), &mutex_);
David Grogan0cfb9902013-12-10 10:36:31 -0800732 if (!status.ok()) {
733 RecordBackgroundError(status);
734 }
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000735 VersionSet::LevelSummaryStorage tmp;
gabor@google.com60bd8012011-07-21 02:40:18 +0000736 Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000737 static_cast<unsigned long long>(f->number),
738 c->level() + 1,
739 static_cast<unsigned long long>(f->file_size),
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000740 status.ToString().c_str(),
741 versions_->LevelSummary(&tmp));
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000742 } else {
743 CompactionState* compact = new CompactionState(c);
744 status = DoCompactionWork(compact);
David Grogan0cfb9902013-12-10 10:36:31 -0800745 if (!status.ok()) {
746 RecordBackgroundError(status);
747 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000748 CleanupCompaction(compact);
Sanjay Ghemawat3c8be102012-01-25 14:56:52 -0800749 c->ReleaseInputs();
750 DeleteObsoleteFiles();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000751 }
752 delete c;
753
754 if (status.ok()) {
755 // Done
costan7d8e41e2019-03-11 13:04:53 -0700756 } else if (shutting_down_.load(std::memory_order_acquire)) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000757 // Ignore compaction errors found during shutting down
758 } else {
gabor@google.com60bd8012011-07-21 02:40:18 +0000759 Log(options_.info_log,
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000760 "Compaction error: %s", status.ToString().c_str());
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000761 }
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000762
763 if (is_manual) {
Gabor Cselle299cced2011-10-05 16:30:28 -0700764 ManualCompaction* m = manual_compaction_;
Sanjay Ghemawat3c8be102012-01-25 14:56:52 -0800765 if (!status.ok()) {
766 m->done = true;
767 }
Gabor Cselle299cced2011-10-05 16:30:28 -0700768 if (!m->done) {
769 // We only compacted part of the requested range. Update *m
770 // to the range that is left to be compacted.
771 m->tmp_storage = manual_end;
772 m->begin = &m->tmp_storage;
773 }
costan09217fd2018-04-10 16:18:06 -0700774 manual_compaction_ = nullptr;
hans@chromium.org80e5b0d2011-06-07 14:40:26 +0000775 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000776}
777
778void DBImpl::CleanupCompaction(CompactionState* compact) {
779 mutex_.AssertHeld();
costan09217fd2018-04-10 16:18:06 -0700780 if (compact->builder != nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000781 // May happen if we get a shutdown call in the middle of compaction
782 compact->builder->Abandon();
783 delete compact->builder;
784 } else {
costan09217fd2018-04-10 16:18:06 -0700785 assert(compact->outfile == nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000786 }
787 delete compact->outfile;
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +0000788 for (size_t i = 0; i < compact->outputs.size(); i++) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000789 const CompactionState::Output& out = compact->outputs[i];
790 pending_outputs_.erase(out.number);
791 }
792 delete compact;
793}
794
795Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
costan09217fd2018-04-10 16:18:06 -0700796 assert(compact != nullptr);
797 assert(compact->builder == nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000798 uint64_t file_number;
799 {
800 mutex_.Lock();
801 file_number = versions_->NewFileNumber();
802 pending_outputs_.insert(file_number);
803 CompactionState::Output out;
804 out.number = file_number;
805 out.smallest.Clear();
806 out.largest.Clear();
807 compact->outputs.push_back(out);
808 mutex_.Unlock();
809 }
810
811 // Make the output file
812 std::string fname = TableFileName(dbname_, file_number);
813 Status s = env_->NewWritableFile(fname, &compact->outfile);
814 if (s.ok()) {
815 compact->builder = new TableBuilder(options_, compact->outfile);
816 }
817 return s;
818}
819
820Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
821 Iterator* input) {
costan09217fd2018-04-10 16:18:06 -0700822 assert(compact != nullptr);
823 assert(compact->outfile != nullptr);
824 assert(compact->builder != nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000825
826 const uint64_t output_number = compact->current_output()->number;
827 assert(output_number != 0);
828
829 // Check for iterator errors
830 Status s = input->status();
831 const uint64_t current_entries = compact->builder->NumEntries();
832 if (s.ok()) {
833 s = compact->builder->Finish();
834 } else {
835 compact->builder->Abandon();
836 }
837 const uint64_t current_bytes = compact->builder->FileSize();
838 compact->current_output()->file_size = current_bytes;
839 compact->total_bytes += current_bytes;
840 delete compact->builder;
costan09217fd2018-04-10 16:18:06 -0700841 compact->builder = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000842
843 // Finish and check for file errors
844 if (s.ok()) {
845 s = compact->outfile->Sync();
846 }
847 if (s.ok()) {
848 s = compact->outfile->Close();
849 }
850 delete compact->outfile;
costan09217fd2018-04-10 16:18:06 -0700851 compact->outfile = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000852
853 if (s.ok() && current_entries > 0) {
854 // Verify that the table is usable
jorlow@chromium.orge2da7442011-03-28 20:43:44 +0000855 Iterator* iter = table_cache_->NewIterator(ReadOptions(),
856 output_number,
857 current_bytes);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000858 s = iter->status();
859 delete iter;
860 if (s.ok()) {
gabor@google.com60bd8012011-07-21 02:40:18 +0000861 Log(options_.info_log,
ideawu8fcceb22015-04-20 12:39:14 +0800862 "Generated table #%llu@%d: %lld keys, %lld bytes",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000863 (unsigned long long) output_number,
ideawu76bba132015-04-20 12:41:01 +0800864 compact->compaction->level(),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000865 (unsigned long long) current_entries,
866 (unsigned long long) current_bytes);
867 }
868 }
869 return s;
870}
871
872
873Status DBImpl::InstallCompactionResults(CompactionState* compact) {
874 mutex_.AssertHeld();
gabor@google.com60bd8012011-07-21 02:40:18 +0000875 Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000876 compact->compaction->num_input_files(0),
877 compact->compaction->level(),
878 compact->compaction->num_input_files(1),
879 compact->compaction->level() + 1,
880 static_cast<long long>(compact->total_bytes));
881
882 // Add compaction outputs
883 compact->compaction->AddInputDeletions(compact->compaction->edit());
884 const int level = compact->compaction->level();
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +0000885 for (size_t i = 0; i < compact->outputs.size(); i++) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000886 const CompactionState::Output& out = compact->outputs[i];
887 compact->compaction->edit()->AddFile(
888 level + 1,
889 out.number, out.file_size, out.smallest, out.largest);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000890 }
Sanjay Ghemawat3c8be102012-01-25 14:56:52 -0800891 return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000892}
893
894Status DBImpl::DoCompactionWork(CompactionState* compact) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000895 const uint64_t start_micros = env_->NowMicros();
896 int64_t imm_micros = 0; // Micros spent doing imm_ compactions
897
gabor@google.com60bd8012011-07-21 02:40:18 +0000898 Log(options_.info_log, "Compacting %d@%d + %d@%d files",
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000899 compact->compaction->num_input_files(0),
900 compact->compaction->level(),
901 compact->compaction->num_input_files(1),
902 compact->compaction->level() + 1);
903
904 assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
costan09217fd2018-04-10 16:18:06 -0700905 assert(compact->builder == nullptr);
906 assert(compact->outfile == nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000907 if (snapshots_.empty()) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000908 compact->smallest_snapshot = versions_->LastSequence();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000909 } else {
costan18683982018-04-30 15:11:03 -0700910 compact->smallest_snapshot = snapshots_.oldest()->sequence_number();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000911 }
912
913 // Release mutex while we're actually doing the compaction work
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000914 mutex_.Unlock();
915
916 Iterator* input = versions_->MakeInputIterator(compact->compaction);
917 input->SeekToFirst();
918 Status status;
919 ParsedInternalKey ikey;
920 std::string current_user_key;
921 bool has_current_user_key = false;
922 SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
costan7d8e41e2019-03-11 13:04:53 -0700923 for (; input->Valid() && !shutting_down_.load(std::memory_order_acquire); ) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000924 // Prioritize immutable compaction work
costan7d8e41e2019-03-11 13:04:53 -0700925 if (has_imm_.load(std::memory_order_relaxed)) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000926 const uint64_t imm_start = env_->NowMicros();
927 mutex_.Lock();
costan09217fd2018-04-10 16:18:06 -0700928 if (imm_ != nullptr) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000929 CompactMemTable();
costan0fa5a4f2018-03-16 10:06:35 -0700930 // Wake up MakeRoomForWrite() if necessary.
931 background_work_finished_signal_.SignalAll();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +0000932 }
933 mutex_.Unlock();
934 imm_micros += (env_->NowMicros() - imm_start);
935 }
936
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000937 Slice key = input->key();
dgrogan@chromium.orgda799092011-05-21 02:17:43 +0000938 if (compact->compaction->ShouldStopBefore(key) &&
costan09217fd2018-04-10 16:18:06 -0700939 compact->builder != nullptr) {
jorlow@chromium.org13b72af2011-03-22 18:32:49 +0000940 status = FinishCompactionOutputFile(compact, input);
941 if (!status.ok()) {
942 break;
943 }
944 }
945
946 // Handle key/value, add to state, etc.
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000947 bool drop = false;
948 if (!ParseInternalKey(key, &ikey)) {
949 // Do not hide error keys
950 current_user_key.clear();
951 has_current_user_key = false;
952 last_sequence_for_key = kMaxSequenceNumber;
953 } else {
954 if (!has_current_user_key ||
955 user_comparator()->Compare(ikey.user_key,
956 Slice(current_user_key)) != 0) {
957 // First occurrence of this user key
958 current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
959 has_current_user_key = true;
960 last_sequence_for_key = kMaxSequenceNumber;
961 }
962
963 if (last_sequence_for_key <= compact->smallest_snapshot) {
964 // Hidden by an newer entry for same user key
965 drop = true; // (A)
966 } else if (ikey.type == kTypeDeletion &&
967 ikey.sequence <= compact->smallest_snapshot &&
968 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
969 // For this user key:
970 // (1) there is no data in higher levels
971 // (2) data in lower levels will have larger sequence numbers
972 // (3) data in layers that are being compacted here and have
973 // smaller sequence numbers will be dropped in the next
974 // few iterations of this loop (by rule (A) above).
975 // Therefore this deletion marker is obsolete and can be dropped.
976 drop = true;
977 }
978
979 last_sequence_for_key = ikey.sequence;
980 }
981#if 0
gabor@google.com60bd8012011-07-21 02:40:18 +0000982 Log(options_.info_log,
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000983 " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
984 "%d smallest_snapshot: %d",
985 ikey.user_key.ToString().c_str(),
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +0000986 (int)ikey.sequence, ikey.type, kTypeValue, drop,
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000987 compact->compaction->IsBaseLevelForKey(ikey.user_key),
988 (int)last_sequence_for_key, (int)compact->smallest_snapshot);
989#endif
990
991 if (!drop) {
992 // Open output file if necessary
costan09217fd2018-04-10 16:18:06 -0700993 if (compact->builder == nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +0000994 status = OpenCompactionOutputFile(compact);
995 if (!status.ok()) {
996 break;
997 }
998 }
999 if (compact->builder->NumEntries() == 0) {
1000 compact->current_output()->smallest.DecodeFrom(key);
1001 }
1002 compact->current_output()->largest.DecodeFrom(key);
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001003 compact->builder->Add(key, input->value());
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001004
1005 // Close output file if it is big enough
1006 if (compact->builder->FileSize() >=
1007 compact->compaction->MaxOutputFileSize()) {
1008 status = FinishCompactionOutputFile(compact, input);
1009 if (!status.ok()) {
1010 break;
1011 }
1012 }
1013 }
1014
1015 input->Next();
1016 }
1017
costan7d8e41e2019-03-11 13:04:53 -07001018 if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001019 status = Status::IOError("Deleting DB during compaction");
1020 }
costan09217fd2018-04-10 16:18:06 -07001021 if (status.ok() && compact->builder != nullptr) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001022 status = FinishCompactionOutputFile(compact, input);
1023 }
1024 if (status.ok()) {
1025 status = input->status();
1026 }
1027 delete input;
costan09217fd2018-04-10 16:18:06 -07001028 input = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001029
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001030 CompactionStats stats;
1031 stats.micros = env_->NowMicros() - start_micros - imm_micros;
1032 for (int which = 0; which < 2; which++) {
1033 for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
1034 stats.bytes_read += compact->compaction->input(which, i)->file_size;
1035 }
1036 }
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001037 for (size_t i = 0; i < compact->outputs.size(); i++) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001038 stats.bytes_written += compact->outputs[i].file_size;
1039 }
1040
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001041 mutex_.Lock();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001042 stats_[compact->compaction->level() + 1].Add(stats);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001043
1044 if (status.ok()) {
1045 status = InstallCompactionResults(compact);
1046 }
David Grogan0cfb9902013-12-10 10:36:31 -08001047 if (!status.ok()) {
1048 RecordBackgroundError(status);
1049 }
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001050 VersionSet::LevelSummaryStorage tmp;
gabor@google.com60bd8012011-07-21 02:40:18 +00001051 Log(options_.info_log,
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001052 "compacted to: %s", versions_->LevelSummary(&tmp));
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001053 return status;
1054}
1055
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001056namespace {
costan0db30412018-03-23 12:50:14 -07001057
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001058struct IterState {
costan0db30412018-03-23 12:50:14 -07001059 port::Mutex* const mu;
1060 Version* const version GUARDED_BY(mu);
1061 MemTable* const mem GUARDED_BY(mu);
1062 MemTable* const imm GUARDED_BY(mu);
1063
1064 IterState(port::Mutex* mutex, MemTable* mem, MemTable* imm, Version* version)
1065 : mu(mutex), version(version), mem(mem), imm(imm) { }
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001066};
1067
1068static void CleanupIteratorState(void* arg1, void* arg2) {
1069 IterState* state = reinterpret_cast<IterState*>(arg1);
1070 state->mu->Lock();
1071 state->mem->Unref();
costan09217fd2018-04-10 16:18:06 -07001072 if (state->imm != nullptr) state->imm->Unref();
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001073 state->version->Unref();
1074 state->mu->Unlock();
1075 delete state;
1076}
costan0db30412018-03-23 12:50:14 -07001077
1078} // anonymous namespace
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001079
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001080Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
David Grogan748539c2013-08-21 11:12:47 -07001081 SequenceNumber* latest_snapshot,
1082 uint32_t* seed) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001083 mutex_.Lock();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001084 *latest_snapshot = versions_->LastSequence();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001085
1086 // Collect together all needed child iterators
1087 std::vector<Iterator*> list;
1088 list.push_back(mem_->NewIterator());
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001089 mem_->Ref();
costan09217fd2018-04-10 16:18:06 -07001090 if (imm_ != nullptr) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001091 list.push_back(imm_->NewIterator());
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001092 imm_->Ref();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001093 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001094 versions_->current()->AddIterators(options, &list);
1095 Iterator* internal_iter =
1096 NewMergingIterator(&internal_comparator_, &list[0], list.size());
1097 versions_->current()->Ref();
dgrogan@chromium.org740d8b32011-05-28 00:53:58 +00001098
costan0db30412018-03-23 12:50:14 -07001099 IterState* cleanup = new IterState(&mutex_, mem_, imm_, versions_->current());
costan09217fd2018-04-10 16:18:06 -07001100 internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001101
David Grogan748539c2013-08-21 11:12:47 -07001102 *seed = ++seed_;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001103 mutex_.Unlock();
1104 return internal_iter;
1105}
1106
1107Iterator* DBImpl::TEST_NewInternalIterator() {
1108 SequenceNumber ignored;
David Grogan748539c2013-08-21 11:12:47 -07001109 uint32_t ignored_seed;
1110 return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001111}
1112
jorlow@chromium.org8303bb12011-03-22 23:24:02 +00001113int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
jorlow@chromium.org13b72af2011-03-22 18:32:49 +00001114 MutexLock l(&mutex_);
1115 return versions_->MaxNextLevelOverlappingBytes();
1116}
1117
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001118Status DBImpl::Get(const ReadOptions& options,
1119 const Slice& key,
1120 std::string* value) {
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001121 Status s;
1122 MutexLock l(&mutex_);
1123 SequenceNumber snapshot;
costan09217fd2018-04-10 16:18:06 -07001124 if (options.snapshot != nullptr) {
costan18683982018-04-30 15:11:03 -07001125 snapshot =
1126 static_cast<const SnapshotImpl*>(options.snapshot)->sequence_number();
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001127 } else {
1128 snapshot = versions_->LastSequence();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001129 }
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001130
gabor@google.come3584f92011-08-22 21:08:51 +00001131 MemTable* mem = mem_;
1132 MemTable* imm = imm_;
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001133 Version* current = versions_->current();
gabor@google.come3584f92011-08-22 21:08:51 +00001134 mem->Ref();
costan09217fd2018-04-10 16:18:06 -07001135 if (imm != nullptr) imm->Ref();
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001136 current->Ref();
gabor@google.come3584f92011-08-22 21:08:51 +00001137
1138 bool have_stat_update = false;
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001139 Version::GetStats stats;
gabor@google.come3584f92011-08-22 21:08:51 +00001140
1141 // Unlock while reading from files and memtables
1142 {
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001143 mutex_.Unlock();
gabor@google.come3584f92011-08-22 21:08:51 +00001144 // First look in the memtable, then in the immutable memtable (if any).
1145 LookupKey lkey(key, snapshot);
gabor@google.com72630232011-09-01 19:08:02 +00001146 if (mem->Get(lkey, value, &s)) {
gabor@google.come3584f92011-08-22 21:08:51 +00001147 // Done
costan09217fd2018-04-10 16:18:06 -07001148 } else if (imm != nullptr && imm->Get(lkey, value, &s)) {
gabor@google.come3584f92011-08-22 21:08:51 +00001149 // Done
1150 } else {
1151 s = current->Get(options, lkey, value, &stats);
1152 have_stat_update = true;
1153 }
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001154 mutex_.Lock();
1155 }
gabor@google.come3584f92011-08-22 21:08:51 +00001156
1157 if (have_stat_update && current->UpdateStats(stats)) {
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001158 MaybeScheduleCompaction();
1159 }
gabor@google.come3584f92011-08-22 21:08:51 +00001160 mem->Unref();
costan09217fd2018-04-10 16:18:06 -07001161 if (imm != nullptr) imm->Unref();
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001162 current->Unref();
1163 return s;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001164}
1165
1166Iterator* DBImpl::NewIterator(const ReadOptions& options) {
1167 SequenceNumber latest_snapshot;
David Grogan748539c2013-08-21 11:12:47 -07001168 uint32_t seed;
1169 Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001170 return NewDBIterator(
David Grogan748539c2013-08-21 11:12:47 -07001171 this, user_comparator(), iter,
costan09217fd2018-04-10 16:18:06 -07001172 (options.snapshot != nullptr
costan18683982018-04-30 15:11:03 -07001173 ? static_cast<const SnapshotImpl*>(options.snapshot)->sequence_number()
David Grogan748539c2013-08-21 11:12:47 -07001174 : latest_snapshot),
1175 seed);
1176}
1177
1178void DBImpl::RecordReadSample(Slice key) {
1179 MutexLock l(&mutex_);
1180 if (versions_->current()->RecordReadSample(key)) {
1181 MaybeScheduleCompaction();
1182 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001183}
1184
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001185const Snapshot* DBImpl::GetSnapshot() {
1186 MutexLock l(&mutex_);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001187 return snapshots_.New(versions_->LastSequence());
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001188}
1189
costan18683982018-04-30 15:11:03 -07001190void DBImpl::ReleaseSnapshot(const Snapshot* snapshot) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001191 MutexLock l(&mutex_);
costan18683982018-04-30 15:11:03 -07001192 snapshots_.Delete(static_cast<const SnapshotImpl*>(snapshot));
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001193}
1194
1195// Convenience methods
1196Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
1197 return DB::Put(o, key, val);
1198}
1199
1200Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
1201 return DB::Delete(options, key);
1202}
1203
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001204Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
1205 Writer w(&mutex_);
1206 w.batch = my_batch;
1207 w.sync = options.sync;
1208 w.done = false;
gabor@google.com72630232011-09-01 19:08:02 +00001209
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001210 MutexLock l(&mutex_);
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001211 writers_.push_back(&w);
1212 while (!w.done && &w != writers_.front()) {
1213 w.cv.Wait();
1214 }
1215 if (w.done) {
1216 return w.status;
1217 }
1218
1219 // May temporarily unlock and wait.
costan09217fd2018-04-10 16:18:06 -07001220 Status status = MakeRoomForWrite(my_batch == nullptr);
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001221 uint64_t last_sequence = versions_->LastSequence();
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001222 Writer* last_writer = &w;
costan09217fd2018-04-10 16:18:06 -07001223 if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001224 WriteBatch* updates = BuildBatchGroup(&last_writer);
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001225 WriteBatchInternal::SetSequence(updates, last_sequence + 1);
1226 last_sequence += WriteBatchInternal::Count(updates);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001227
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001228 // Add to log and apply to memtable. We can release the lock
1229 // during this phase since &w is currently responsible for logging
1230 // and protects against concurrent loggers and concurrent writes
1231 // into mem_.
gabor@google.com72630232011-09-01 19:08:02 +00001232 {
gabor@google.com72630232011-09-01 19:08:02 +00001233 mutex_.Unlock();
1234 status = log_->AddRecord(WriteBatchInternal::Contents(updates));
David Grogan0cfb9902013-12-10 10:36:31 -08001235 bool sync_error = false;
gabor@google.com72630232011-09-01 19:08:02 +00001236 if (status.ok() && options.sync) {
1237 status = logfile_->Sync();
David Grogan0cfb9902013-12-10 10:36:31 -08001238 if (!status.ok()) {
1239 sync_error = true;
1240 }
gabor@google.com72630232011-09-01 19:08:02 +00001241 }
1242 if (status.ok()) {
1243 status = WriteBatchInternal::InsertInto(updates, mem_);
1244 }
1245 mutex_.Lock();
David Grogan0cfb9902013-12-10 10:36:31 -08001246 if (sync_error) {
1247 // The state of the log file is indeterminate: the log record we
1248 // just added may or may not show up when the DB is re-opened.
1249 // So we force the DB into a mode where all future writes fail.
1250 RecordBackgroundError(status);
1251 }
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001252 }
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001253 if (updates == tmp_batch_) tmp_batch_->Clear();
gabor@google.com72630232011-09-01 19:08:02 +00001254
1255 versions_->SetLastSequence(last_sequence);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001256 }
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001257
1258 while (true) {
1259 Writer* ready = writers_.front();
1260 writers_.pop_front();
1261 if (ready != &w) {
1262 ready->status = status;
1263 ready->done = true;
1264 ready->cv.Signal();
1265 }
1266 if (ready == last_writer) break;
1267 }
1268
1269 // Notify new head of write queue
1270 if (!writers_.empty()) {
1271 writers_.front()->cv.Signal();
1272 }
1273
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001274 return status;
1275}
1276
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001277// REQUIRES: Writer list must be non-empty
costan09217fd2018-04-10 16:18:06 -07001278// REQUIRES: First writer must have a non-null batch
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001279WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
costan0fa5a4f2018-03-16 10:06:35 -07001280 mutex_.AssertHeld();
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001281 assert(!writers_.empty());
1282 Writer* first = writers_.front();
1283 WriteBatch* result = first->batch;
costan09217fd2018-04-10 16:18:06 -07001284 assert(result != nullptr);
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001285
1286 size_t size = WriteBatchInternal::ByteSize(first->batch);
1287
1288 // Allow the group to grow up to a maximum size, but if the
1289 // original write is small, limit the growth so we do not slow
1290 // down the small write too much.
1291 size_t max_size = 1 << 20;
1292 if (size <= (128<<10)) {
1293 max_size = size + (128<<10);
1294 }
1295
1296 *last_writer = first;
1297 std::deque<Writer*>::iterator iter = writers_.begin();
1298 ++iter; // Advance past "first"
1299 for (; iter != writers_.end(); ++iter) {
1300 Writer* w = *iter;
1301 if (w->sync && !first->sync) {
1302 // Do not include a sync write into a batch handled by a non-sync write.
1303 break;
1304 }
1305
costan09217fd2018-04-10 16:18:06 -07001306 if (w->batch != nullptr) {
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001307 size += WriteBatchInternal::ByteSize(w->batch);
1308 if (size > max_size) {
1309 // Do not make batch too big
1310 break;
1311 }
1312
Chris Mumford803d6922014-09-16 14:19:52 -07001313 // Append to *result
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001314 if (result == first->batch) {
1315 // Switch to temporary batch instead of disturbing caller's batch
1316 result = tmp_batch_;
1317 assert(WriteBatchInternal::Count(result) == 0);
1318 WriteBatchInternal::Append(result, first->batch);
1319 }
1320 WriteBatchInternal::Append(result, w->batch);
1321 }
1322 *last_writer = w;
1323 }
1324 return result;
1325}
1326
gabor@google.com72630232011-09-01 19:08:02 +00001327// REQUIRES: mutex_ is held
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001328// REQUIRES: this thread is currently at the front of the writer queue
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001329Status DBImpl::MakeRoomForWrite(bool force) {
1330 mutex_.AssertHeld();
Sanjay Ghemawatd79762e2012-03-08 16:23:21 -08001331 assert(!writers_.empty());
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001332 bool allow_delay = !force;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001333 Status s;
1334 while (true) {
1335 if (!bg_error_.ok()) {
1336 // Yield previous error
1337 s = bg_error_;
1338 break;
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001339 } else if (
1340 allow_delay &&
1341 versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
1342 // We are getting close to hitting a hard limit on the number of
1343 // L0 files. Rather than delaying a single write by several
1344 // seconds when we hit the hard limit, start delaying each
1345 // individual write by 1ms to reduce latency variance. Also,
1346 // this delay hands over some CPU to the compaction thread in
1347 // case it is sharing the same core as the writer.
1348 mutex_.Unlock();
1349 env_->SleepForMicroseconds(1000);
1350 allow_delay = false; // Do not delay a single write more than once
1351 mutex_.Lock();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001352 } else if (!force &&
1353 (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
1354 // There is room in current memtable
1355 break;
costan09217fd2018-04-10 16:18:06 -07001356 } else if (imm_ != nullptr) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001357 // We have filled up the current memtable, but the previous
1358 // one is still being compacted, so we wait.
David Grogan28dad912013-05-14 16:52:56 -07001359 Log(options_.info_log, "Current memtable full; waiting...\n");
costan0fa5a4f2018-03-16 10:06:35 -07001360 background_work_finished_signal_.Wait();
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001361 } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
1362 // There are too many level-0 files.
David Grogan28dad912013-05-14 16:52:56 -07001363 Log(options_.info_log, "Too many L0 files; waiting...\n");
costan0fa5a4f2018-03-16 10:06:35 -07001364 background_work_finished_signal_.Wait();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001365 } else {
1366 // Attempt to switch to a new memtable and trigger compaction of old
1367 assert(versions_->PrevLogNumber() == 0);
1368 uint64_t new_log_number = versions_->NewFileNumber();
costan09217fd2018-04-10 16:18:06 -07001369 WritableFile* lfile = nullptr;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001370 s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
1371 if (!s.ok()) {
Sanjay Ghemawat075a35a2012-05-30 09:45:46 -07001372 // Avoid chewing through file number space in a tight loop.
1373 versions_->ReuseFileNumber(new_log_number);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001374 break;
1375 }
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001376 delete log_;
1377 delete logfile_;
1378 logfile_ = lfile;
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001379 logfile_number_ = new_log_number;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001380 log_ = new log::Writer(lfile);
1381 imm_ = mem_;
costan7d8e41e2019-03-11 13:04:53 -07001382 has_imm_.store(true, std::memory_order_release);
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001383 mem_ = new MemTable(internal_comparator_);
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001384 mem_->Ref();
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001385 force = false; // Do not force another compaction if have room
1386 MaybeScheduleCompaction();
1387 }
1388 }
1389 return s;
1390}
1391
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001392bool DBImpl::GetProperty(const Slice& property, std::string* value) {
1393 value->clear();
1394
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001395 MutexLock l(&mutex_);
1396 Slice in = property;
1397 Slice prefix("leveldb.");
1398 if (!in.starts_with(prefix)) return false;
1399 in.remove_prefix(prefix.size());
1400
1401 if (in.starts_with("num-files-at-level")) {
1402 in.remove_prefix(strlen("num-files-at-level"));
1403 uint64_t level;
1404 bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
dgrogan@chromium.orga05525d2011-08-06 00:19:37 +00001405 if (!ok || level >= config::kNumLevels) {
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001406 return false;
1407 } else {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001408 char buf[100];
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001409 snprintf(buf, sizeof(buf), "%d",
1410 versions_->NumLevelFiles(static_cast<int>(level)));
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001411 *value = buf;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001412 return true;
1413 }
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001414 } else if (in == "stats") {
1415 char buf[200];
1416 snprintf(buf, sizeof(buf),
1417 " Compactions\n"
1418 "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
1419 "--------------------------------------------------\n"
1420 );
1421 value->append(buf);
1422 for (int level = 0; level < config::kNumLevels; level++) {
1423 int files = versions_->NumLevelFiles(level);
1424 if (stats_[level].micros > 0 || files > 0) {
1425 snprintf(
1426 buf, sizeof(buf),
1427 "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
1428 level,
1429 files,
1430 versions_->NumLevelBytes(level) / 1048576.0,
1431 stats_[level].micros / 1e6,
1432 stats_[level].bytes_read / 1048576.0,
1433 stats_[level].bytes_written / 1048576.0);
1434 value->append(buf);
1435 }
1436 }
1437 return true;
Gabor Cselle299cced2011-10-05 16:30:28 -07001438 } else if (in == "sstables") {
1439 *value = versions_->current()->DebugString();
1440 return true;
ssid528c2bc2015-09-29 11:52:21 -07001441 } else if (in == "approximate-memory-usage") {
1442 size_t total_usage = options_.block_cache->TotalCharge();
1443 if (mem_) {
1444 total_usage += mem_->ApproximateMemoryUsage();
1445 }
1446 if (imm_) {
1447 total_usage += imm_->ApproximateMemoryUsage();
1448 }
1449 char buf[50];
1450 snprintf(buf, sizeof(buf), "%llu",
1451 static_cast<unsigned long long>(total_usage));
1452 value->append(buf);
1453 return true;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001454 }
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001455
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001456 return false;
1457}
1458
1459void DBImpl::GetApproximateSizes(
1460 const Range* range, int n,
1461 uint64_t* sizes) {
1462 // TODO(opt): better implementation
1463 Version* v;
1464 {
1465 MutexLock l(&mutex_);
1466 versions_->current()->Ref();
1467 v = versions_->current();
1468 }
1469
1470 for (int i = 0; i < n; i++) {
1471 // Convert user_key into a corresponding internal key.
1472 InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
1473 InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
1474 uint64_t start = versions_->ApproximateOffsetOf(v, k1);
1475 uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
1476 sizes[i] = (limit >= start ? limit - start : 0);
1477 }
1478
1479 {
1480 MutexLock l(&mutex_);
1481 v->Unref();
1482 }
1483}
1484
1485// Default implementations of convenience methods that subclasses of DB
1486// can call if they wish
1487Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
1488 WriteBatch batch;
1489 batch.Put(key, value);
1490 return Write(opt, &batch);
1491}
1492
1493Status DB::Delete(const WriteOptions& opt, const Slice& key) {
1494 WriteBatch batch;
1495 batch.Delete(key);
1496 return Write(opt, &batch);
1497}
1498
1499DB::~DB() { }
1500
1501Status DB::Open(const Options& options, const std::string& dbname,
1502 DB** dbptr) {
costan09217fd2018-04-10 16:18:06 -07001503 *dbptr = nullptr;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001504
1505 DBImpl* impl = new DBImpl(options, dbname);
1506 impl->mutex_.Lock();
1507 VersionEdit edit;
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -08001508 // Recover handles create_if_missing, error_if_exists
1509 bool save_manifest = false;
1510 Status s = impl->Recover(&edit, &save_manifest);
costan09217fd2018-04-10 16:18:06 -07001511 if (s.ok() && impl->mem_ == nullptr) {
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -08001512 // Create new log and a corresponding memtable.
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001513 uint64_t new_log_number = impl->versions_->NewFileNumber();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001514 WritableFile* lfile;
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001515 s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001516 &lfile);
1517 if (s.ok()) {
dgrogan@chromium.orgf779e7a2011-04-12 19:38:58 +00001518 edit.SetLogNumber(new_log_number);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001519 impl->logfile_ = lfile;
gabor@google.comccf0fcd2011-06-22 02:36:45 +00001520 impl->logfile_number_ = new_log_number;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001521 impl->log_ = new log::Writer(lfile);
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -08001522 impl->mem_ = new MemTable(impl->internal_comparator_);
1523 impl->mem_->Ref();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001524 }
Sanjay Ghemawatac1d69d2014-12-11 08:13:18 -08001525 }
1526 if (s.ok() && save_manifest) {
1527 edit.SetPrevLogNumber(0); // No older logs needed after recovery.
1528 edit.SetLogNumber(impl->logfile_number_);
1529 s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
1530 }
1531 if (s.ok()) {
1532 impl->DeleteObsoleteFiles();
1533 impl->MaybeScheduleCompaction();
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001534 }
1535 impl->mutex_.Unlock();
1536 if (s.ok()) {
costan09217fd2018-04-10 16:18:06 -07001537 assert(impl->mem_ != nullptr);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001538 *dbptr = impl;
1539 } else {
1540 delete impl;
1541 }
1542 return s;
1543}
1544
dgrogan@chromium.orgda799092011-05-21 02:17:43 +00001545Snapshot::~Snapshot() {
1546}
1547
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001548Status DestroyDB(const std::string& dbname, const Options& options) {
1549 Env* env = options.env;
1550 std::vector<std::string> filenames;
cmumford05094142017-10-17 13:05:47 -07001551 Status result = env->GetChildren(dbname, &filenames);
1552 if (!result.ok()) {
1553 // Ignore error in case directory does not exist
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001554 return Status::OK();
1555 }
1556
1557 FileLock* lock;
gabor@google.com6699c7e2011-07-15 00:20:57 +00001558 const std::string lockname = LockFileName(dbname);
cmumford05094142017-10-17 13:05:47 -07001559 result = env->LockFile(lockname, &lock);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001560 if (result.ok()) {
1561 uint64_t number;
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001562 FileType type;
dgrogan@chromium.orgba6dac02011-04-20 22:48:11 +00001563 for (size_t i = 0; i < filenames.size(); i++) {
gabor@google.com6699c7e2011-07-15 00:20:57 +00001564 if (ParseFileName(filenames[i], &number, &type) &&
Sanjay Ghemawat583f1492012-03-09 07:51:04 -08001565 type != kDBLockFile) { // Lock file will be deleted at end
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001566 Status del = env->DeleteFile(dbname + "/" + filenames[i]);
1567 if (result.ok() && !del.ok()) {
1568 result = del;
1569 }
1570 }
1571 }
1572 env->UnlockFile(lock); // Ignore error since state is already gone
gabor@google.com6699c7e2011-07-15 00:20:57 +00001573 env->DeleteFile(lockname);
jorlow@chromium.orgf67e15e2011-03-18 22:37:00 +00001574 env->DeleteDir(dbname); // Ignore error in case dir contains other files
1575 }
1576 return result;
1577}
1578
Hans Wennborg36a5f8e2011-10-31 17:22:06 +00001579} // namespace leveldb