//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/builder.h"

#include <algorithm>
#include <deque>
#include <vector>

#include "db/compaction/compaction_iterator.h"
#include "db/dbformat.h"
#include "db/event_helpers.h"
#include "db/internal_stats.h"
#include "db/merge_helper.h"
#include "db/range_del_aggregator.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "file/filename.h"
#include "file/read_write_util.h"
#include "file/writable_file_writer.h"
#include "monitoring/iostats_context_imp.h"
#include "monitoring/thread_status_util.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based/block_based_table_builder.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {

class TableFactory;

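// Creates a TableBuilder for the new table file through the column family's
// configured table factory. column_family_name must be empty if and only if
// column_family_id is kUnknownColumnFamily (see the assertion below).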
TableBuilder* NewTableBuilder(
    const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    WritableFileWriter* file, const CompressionType compression_type,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    int level, const bool skip_filters, const uint64_t creation_time,
    const uint64_t oldest_key_time, const uint64_t target_file_size,
    const uint64_t file_creation_time) {
  assert((column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
         column_family_name.empty());
  return ioptions.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, internal_comparator,
                          int_tbl_prop_collector_factories, compression_type,
                          sample_for_compression, compression_opts,
                          skip_filters, column_family_name, level,
                          creation_time, oldest_key_time, target_file_size,
                          file_creation_time),
      column_family_id, file);
}

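// Builds a table file from the contents of *iter and the supplied range
// tombstone iterators. The file is named according to meta->fd, and on
// success the rest of *meta is filled in with metadata about the generated
// table. If the input is empty or an error occurs, any partially written
// file is deleted and meta->fd.file_size is left at zero.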
Status BuildTable(
    const std::string& dbname, Env* env, FileSystem* fs,
    const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
    TableCache* table_cache, InternalIterator* iter,
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters,
    FileMetaData* meta, const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    std::vector<SequenceNumber> snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, const CompressionType compression,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    bool paranoid_file_checks, InternalStats* internal_stats,
    TableFileCreationReason reason, IOStatus* io_status,
    EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
    TableProperties* table_properties, int level, const uint64_t creation_time,
    const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint,
    const uint64_t file_creation_time) {
  assert((column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
         column_family_name.empty());
  // Report flush IO stats to the thread status once at least this many bytes
  // have been written.
  const size_t kReportFlushIOStatsEvery = 1048576;
  Status s;
  meta->fd.file_size = 0;
  iter->SeekToFirst();
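  // Collect all memtable range tombstones into a single aggregator so they
  // can be consulted while iterating over point keys and later written out
  // to the new table file.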
  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
      new CompactionRangeDelAggregator(&internal_comparator, snapshots));
  for (auto& range_del_iter : range_del_iters) {
    range_del_agg->AddTombstones(std::move(range_del_iter));
  }

  std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
                                    meta->fd.GetPathId());
#ifndef ROCKSDB_LITE
  EventHelpers::NotifyTableFileCreationStarted(
      ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
#endif  // !ROCKSDB_LITE
  TableProperties tp;

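  // Only build a table file if there is at least one point key or range
  // tombstone to write.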
  if (iter->Valid() || !range_del_agg->IsEmpty()) {
    TableBuilder* builder;
    std::unique_ptr<WritableFileWriter> file_writer;
    // Currently we only enable dictionary compression during compaction to the
    // bottommost level.
    CompressionOptions compression_opts_for_flush(compression_opts);
    compression_opts_for_flush.max_dict_bytes = 0;
    compression_opts_for_flush.zstd_max_train_bytes = 0;
    {
      std::unique_ptr<FSWritableFile> file;
#ifndef NDEBUG
      bool use_direct_writes = file_options.use_direct_writes;
      TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
#endif  // !NDEBUG
      s = NewWritableFile(fs, fname, &file, file_options);
      if (!s.ok()) {
        EventHelpers::LogAndNotifyTableFileCreationFinished(
            event_logger, ioptions.listeners, dbname, column_family_name, fname,
            job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
        return s;
      }
      file->SetIOPriority(io_priority);
      file->SetWriteLifeTimeHint(write_hint);

      file_writer.reset(new WritableFileWriter(
          std::move(file), fname, file_options, env, ioptions.statistics,
          ioptions.listeners, ioptions.file_checksum_gen_factory));

      builder = NewTableBuilder(
          ioptions, mutable_cf_options, internal_comparator,
          int_tbl_prop_collector_factories, column_family_id,
          column_family_name, file_writer.get(), compression,
          sample_for_compression, compression_opts_for_flush, level,
          false /* skip_filters */, creation_time, oldest_key_time,
          0 /*target_file_size*/, file_creation_time);
    }

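    // Write all point keys to the builder through a CompactionIterator, which
    // resolves merge operands, honors snapshots, and drops keys covered by
    // the collected range tombstones. Key range and sequence number
    // boundaries in *meta are updated as keys are added.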
    MergeHelper merge(env, internal_comparator.user_comparator(),
                      ioptions.merge_operator, nullptr, ioptions.info_log,
                      true /* internal key corruption is not ok */,
                      snapshots.empty() ? 0 : snapshots.back(),
                      snapshot_checker);

    CompactionIterator c_iter(
        iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
        &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
        ShouldReportDetailedTime(env, ioptions.statistics),
        true /* internal key corruption is not ok */, range_del_agg.get());
    c_iter.SeekToFirst();
    for (; c_iter.Valid(); c_iter.Next()) {
      const Slice& key = c_iter.key();
      const Slice& value = c_iter.value();
      const ParsedInternalKey& ikey = c_iter.ikey();
      builder->Add(key, value);
      meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);

      // TODO(noetzli): Update stats after flush, too.
      if (io_priority == Env::IO_HIGH &&
          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
        ThreadStatusUtil::SetThreadOperationProperty(
            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
      }
    }

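    // Append the aggregated range tombstones to the output file and extend
    // the file's key and sequence number boundaries to cover them.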
    auto range_del_it = range_del_agg->NewIterator();
    for (range_del_it->SeekToFirst(); range_del_it->Valid();
         range_del_it->Next()) {
      auto tombstone = range_del_it->Tombstone();
      auto kv = tombstone.Serialize();
      builder->Add(kv.first.Encode(), kv.second);
      meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
                                     tombstone.seq_, internal_comparator);
    }

    // Finish and check for builder errors
    tp = builder->GetTableProperties();
    bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
    s = c_iter.status();
    TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
    if (!s.ok() || empty) {
      builder->Abandon();
    } else {
      s = builder->Finish();
    }
    *io_status = builder->io_status();

    if (s.ok() && !empty) {
      uint64_t file_size = builder->FileSize();
      meta->fd.file_size = file_size;
      meta->marked_for_compaction = builder->NeedCompact();
      assert(meta->fd.GetFileSize() > 0);
      // Refresh table properties now that the builder has finished.
      tp = builder->GetTableProperties();
      if (table_properties) {
        *table_properties = tp;
      }
    }
    delete builder;

    // Finish and check for file errors
    if (s.ok() && !empty) {
      StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
      *io_status = file_writer->Sync(ioptions.use_fsync);
    }
    if (io_status->ok() && !empty) {
      *io_status = file_writer->Close();
    }
    if (io_status->ok() && !empty) {
      // Add the checksum information to file metadata.
      meta->file_checksum = file_writer->GetFileChecksum();
      meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
    }

    if (!io_status->ok()) {
      s = *io_status;
    }

    // TODO: Also check the IO status when creating the iterator.

    if (s.ok() && !empty) {
      // Verify that the table is usable. We set for_compaction to false and
      // don't OptimizeForCompactionTableRead here because this is a special
      // case after we finish building the table. Regardless of whether
      // use_direct_io_for_flush_and_compaction is true, we treat this
      // verification as a user read, since the goal is to cache the table
      // here for subsequent user reads.
      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
          ReadOptions(), file_options, internal_comparator, *meta,
          nullptr /* range_del_agg */,
          mutable_cf_options.prefix_extractor.get(), nullptr,
          (internal_stats == nullptr) ? nullptr
                                      : internal_stats->GetFileReadHist(0),
          TableReaderCaller::kFlush, /*arena=*/nullptr,
          /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr,
          /*largest_compaction_key*/ nullptr));
      s = it->status();
      if (s.ok() && paranoid_file_checks) {
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
        }
        s = it->status();
      }
    }
  }

  // Check for input iterator errors
  if (!iter->status().ok()) {
    s = iter->status();
  }

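  // Delete the output file if anything went wrong or nothing was written.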
  if (!s.ok() || meta->fd.GetFileSize() == 0) {
    fs->DeleteFile(fname, IOOptions(), nullptr);
  }

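  // If no file was produced, report the name as "(nil)" in the event log.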
  if (meta->fd.GetFileSize() == 0) {
    fname = "(nil)";
  }
  // Output to event logger and fire events.
  EventHelpers::LogAndNotifyTableFileCreationFinished(
      event_logger, ioptions.listeners, dbname, column_family_name, fname,
      job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);

  return s;
}

}  // namespace ROCKSDB_NAMESPACE