LevelDB源碼分析之十六:.log文件
下面我們帶大家看看log文件的具體物理和邏輯布局是怎樣的。LevelDB對於一個log文件,會把它切割成以32K為單位的物理Block,每次讀取都以一個Block作為基本讀取單位。下圖展示的log文件由3個Block構成,所以從物理布局來講,一個log文件就是由連續的32K大小的Block構成的。
在應用的視野里是看不到這些Block的,應用看到的是一系列的Key/Value對。在LevelDB內部,會將一個Key/Value對看做一條記錄的數據,另外在這個數據前增加一個記錄頭,用來記載一些管理信息,以方便內部處理。下圖顯示了一條記錄在LevelDB內部是如何表示的。
一.log文件的格式
// Constants describing the on-disk layout of a .log file: the physical
// record types, the block size and the per-record header size.
namespace log {
// Type tag stored in the header of every physical record.
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
// The whole logical record is stored in one physical record
kFullType = 1,
// The logical record is split into fragments (first / middle / last)
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
};
static const int kMaxRecordType = kLastType;
// 32K
static const int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
// The checksum covers the type and data fields, type is the record type,
// and length is the length of the data field.
// NOTE(review): Writer::EmitPhysicalRecord stores the length in header bytes
// 4-5 and the type in byte 6, so the byte order on disk is checksum, length,
// type.
static const int kHeaderSize = 4 + 1 + 2;
}
二.log文件的寫
Writer類的頭文件很簡單,看下cpp文件
namespace log {
// Creates a writer that appends records to "*dest", starting at offset 0 of
// the first block.
//
// The CRC of every record-type byte is computed once here and cached in
// type_crc_, so that EmitPhysicalRecord only has to extend the cached value
// with the payload instead of recomputing the type's CRC for each record.
Writer::Writer(WritableFile* dest)
    : dest_(dest),
      block_offset_(0) {
  for (int i = 0; i <= kMaxRecordType; i++) {
    // i is in [0, kMaxRecordType], so narrowing it to char loses nothing.
    char t = static_cast<char>(i);
    type_crc_[i] = crc32c::Value(&t, 1);
  }
}
// Destructor. The body is empty: nothing (including dest_) is released here.
Writer::~Writer() {
}
// Appends the logical record "slice" to the log.
//
// The record is split into as many physical records as needed so that no
// physical record crosses a kBlockSize boundary. Returns the status of the
// last EmitPhysicalRecord call; emission stops at the first failure.
Status Writer::AddRecord(const Slice& slice) {
  const char* ptr = slice.data();
  size_t left = slice.size();

  // Fragment the record if necessary and emit it.  Note that if slice
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  bool begin = true;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    // If the current block cannot hold even a header, pad what is left with
    // zero bytes and switch to a new block. leftover < kHeaderSize here, so
    // at most six padding bytes are written; the Slice length argument
    // truncates the literal to exactly "leftover" bytes.
    if (leftover < kHeaderSize) {
      if (leftover > 0) {
        // Fill the trailer (literal below relies on kHeaderSize being 7)
        assert(kHeaderSize == 7);
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
      }
      // Switch to a new block
      block_offset_ = 0;
    }

    // Invariant: we never leave < kHeaderSize bytes in a block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    // Payload bytes that still fit in the current block after the header.
    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
    const size_t fragment_length = (left < avail) ? left : avail;

    // A record that fits entirely is kFullType. Otherwise the first piece is
    // kFirstType, the final piece is kLastType and anything in between is
    // kMiddleType.
    RecordType type;
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    // Frame the fragment (header + payload) and hand it to the file.
    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
}
// Writes one physical record (header followed by n payload bytes from ptr)
// of type t to dest_ and flushes it.
//
// Requires that n fits in two bytes and that the record fits in the current
// block. block_offset_ is advanced by the record size regardless of the
// returned status.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
  assert(n <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + n <= kBlockSize);

  // Format the header: bytes 0-3 hold the masked CRC, byte 4 the low byte of
  // the length, byte 5 the high byte of the length, byte 6 the record type.
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(n & 0xff);
  buf[5] = static_cast<char>(n >> 8);
  buf[6] = static_cast<char>(t);

  // Compute the crc of the record type and the payload.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
  crc = crc32c::Mask(crc);  // Adjust for storage
  EncodeFixed32(buf, crc);

  // Write the header and the payload
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, n));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }
  // Advance the in-block offset past this record.
  block_offset_ += kHeaderSize + n;
  return s;
}
}
三.log文件的讀
Reader類的頭文件
namespace log {
// Reads records back from a log file.
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "bytes" is the approximate number of bytes
// dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that returns log records from "*file".
// If "reporter" is non-NULL, it is notified whenever some data is dropped
// due to a detected corruption.
// If "checksum" is true, verify checksums if available.
// The Reader starts reading at the first record located at physical
// position >= initial_offset within the file.
Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset);
~Reader();
// Read the next record into *record. *scratch is used as temporary storage.
bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
// Undefined before the first call to ReadRecord, so only call this after
// ReadRecord.
uint64_t LastRecordOffset();
private:
SequentialFile* const file_;
// Receiver of corruption reports
Reporter* const reporter_;
// Whether to verify checksums
bool const checksum_;
// The reader fetches data from the file one block at a time; backing_store_
// is the read buffer those bytes are stored in.
char* const backing_store_;
// Slice over backing_store_, to make working with the buffer convenient
Slice buffer_;
// Whether the end of the file has been reached
bool eof_;
// Offset of the last record returned
uint64_t last_record_offset_;
// Offset of the end of the current block
uint64_t end_of_buffer_offset_;
// Initial offset; the search for the first record starts from here
uint64_t const initial_offset_;
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
bool SkipToInitialBlock();
// Reads the data field of one physical record into *result and returns the
// record type, or one of the special values above.
unsigned int ReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter.
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
}
Reader類的源文件
namespace log {
// Out-of-line definition of the Reporter destructor; the body is empty.
Reader::Reporter::~Reporter() {
}
// Stores the arguments, allocates a kBlockSize-byte read buffer
// (backing_store_) and starts with an empty buffer_, eof_ == false and both
// tracked offsets at 0. No file I/O happens in the constructor.
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
}
// Frees the block buffer that the constructor allocated with new char[].
// file_ and reporter_ are not deleted here.
Reader::~Reader() {
delete[] backing_store_;
}
// Positions the file at the start of the block in which reading should begin.
//
// The target is the start of the block containing initial_offset_, unless
// fewer than 6 bytes of that block lie at or after initial_offset_, in which
// case the target is the start of the following block.
// end_of_buffer_offset_ is set to the target. Returns false, after reporting
// a drop, when the file cannot be skipped forward; true otherwise.
bool Reader::SkipToInitialBlock() {
  const size_t within_block = initial_offset_ % kBlockSize;
  uint64_t target = initial_offset_ - within_block;

  // Too close to the end of the block: begin at the next block instead.
  if (within_block > kBlockSize - 6) {
    target += kBlockSize;
  }

  end_of_buffer_offset_ = target;

  // Nothing to skip when reading starts at the very beginning of the file.
  if (target == 0) {
    return true;
  }

  Status skip_status = file_->Skip(target);
  if (skip_status.ok()) {
    return true;
  }
  ReportDrop(target, skip_status);
  return false;
}
// Reads the next logical record into *record, using *scratch to reassemble
// records that were written as several fragments.
// Returns true when a complete record was produced. Returns false when
// ReadPhysicalRecord yields kEof or when SkipToInitialBlock fails.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
// No record at or past initial_offset_ has been returned yet: position the
// file at the initial block first.
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear();
record->clear();
// True while the fragments of a multi-part record are being collected
bool in_fragmented_record = false;
// Offset of the logical record that is currently being read
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true) {
// ReadPhysicalRecord advances buffer_ itself, so buffer_ holds the part of
// the current block that has not been parsed yet, while
// end_of_buffer_offset_ is the offset of the end of the current block.
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
// For kFullType the physical and the logical record map 1:1, so they
// share the same offset.
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset;
return true;
case kFirstType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
// This is the first fragment, so the physical record's offset is also the
// logical record's offset. Note that the first fragment is stored with
// assign(), which replaces whatever scratch held before.
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
// The logical record is complete: remember its offset as the most recent
// one.
last_record_offset_ = prospective_record_offset;
return true;
}
break;
case kEof:
// End of input; a partially collected record is reported and discarded.
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear();
}
return false;
case kBadRecord:
// A bad physical record invalidates the record being collected; keep
// reading afterwards.
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default: {
// Unknown type tag: report the fragment plus anything collected so far.
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}
}
}
return false;
}
// Returns last_record_offset_, which ReadRecord updates each time it returns
// a record.
uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
// Notifies the reporter that "bytes" bytes were dropped for "reason".
//
// Nothing is reported when no reporter was supplied, or when the dropped
// region starts before initial_offset_.
void Reader::ReportDrop(size_t bytes, const Status& reason) {
  if (reporter_ == NULL) {
    return;
  }
  // Start of the dropped region, measured back from the unparsed tail of the
  // current buffer.
  const uint64_t drop_start = end_of_buffer_offset_ - buffer_.size() - bytes;
  if (drop_start < initial_offset_) {
    return;
  }
  reporter_->Corruption(bytes, reason);
}
// Parses the next physical record out of buffer_, refilling buffer_ from the
// file one block at a time as needed.
//
// On success *result is set to the record's payload and the record type is
// returned. Returns kEof at end of input or after a read error, and
// kBadRecord for a record that must be skipped (bad length, zero-type
// padding, checksum mismatch, or a start before initial_offset_).
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    // Fewer than kHeaderSize bytes are left to parse. This happens
    //   1. on the first read, because buffer_ starts out empty, and
    //   2. when only the trailer padding of a block is left, i.e. the
    //      current block is finished and the next one must be fetched.
    if (buffer_.size() < kHeaderSize) {
      if (!eof_) {
        // Drop the leftover bytes and load the next block. Read() advances
        // the file position itself, so blocks arrive in order.
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        // Offset of the end of the block that was just read
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok()) {
          // Read failure: report it and stop parsing the log.
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          // A short read means the end of the file has been reached.
          eof_ = true;
        }
        // Re-check the size: the new buffer may still be too small to parse.
        continue;
      } else if (buffer_.size() == 0) {
        // End of file with nothing left over
        return kEof;
      } else {
        // End of file with a few leftover bytes that cannot form a header
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "truncated record at end of file");
        return kEof;
      }
    }

    // Parse the header: bytes 4-5 hold the little-endian payload length and
    // byte 6 the record type.
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);

    // The record claims more bytes than the buffer holds: bad record.
    if (kHeaderSize + length > buffer_.size()) {
      size_t drop_size = buffer_.size();
      buffer_.clear();
      ReportCorruption(drop_size, "bad record length");
      return kBadRecord;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc (it covers the type byte and the payload)
    if (checksum_) {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume this record from buffer_
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }

    // Hand back the payload
    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
}
參考鏈接:http://blog.csdn.net/tankles/article/details/7663873