Ceph MDS Source Code Analysis (MDCache.cc)
Data structure overview
- CInode (cache inode): a CInode holds the metadata of one file, and every file has exactly one CInode. It stores information such as who owns the file and how large it is.
- CDentry (cache dentry): a CDentry associates an inode with a file (or directory) name. A CDentry links to at most one CInode (it may link to none), while one CInode can be linked by multiple CDentries.
- CDir (cache dir): a CDir exists only under a directory inode; it holds the CDentries inside that directory. When a directory is fragmented, one CInode can have multiple CDirs.
Here is one possible way the data is organized:
```
CInode
  CDir
   |          \
   |           \
CDentry       CDentry
CInode        CInode
  CDir          CDir
   |             |       \
   |             |        \
CDentry       CDentry    CDentry
CInode        CInode     CInode
```
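To make the relationship concrete, here is a minimal sketch of resolving a single path component through the three structures. The helper name `step` is made up for illustration; the member functions it calls are the same ones used by path_traverse() later in this article.

```cpp
// Sketch only: resolve one path component via CInode -> CDir -> CDentry -> CInode.
CInode *step(CInode *cur, const std::string &name, snapid_t snapid)
{
  frag_t fg = cur->pick_dirfrag(name);      // which fragment of the directory holds `name`
  CDir *dir = cur->get_dirfrag(fg);         // the CDir for that fragment (may not be open yet)
  if (!dir)
    return nullptr;
  CDentry *dn = dir->lookup(name, snapid);  // the CDentry linking `name` to an inode
  if (!dn)
    return nullptr;
  return dn->get_linkage()->get_inode();    // the CInode the dentry points to (null dentry -> nullptr)
}
```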
MDS definitions and member variables
If a file system has more than one active MDS, the metadata cache is distributed across the MDS cluster. For each inode, at any given moment exactly one MDS is regarded as its authoritative (auth) MDS; only that MDS may modify the inode, although other MDSs can request to become the auth. Key MDCache members:
- ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino
- std::map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino
- CInode *root = nullptr; // root inode
- CInode *myin = nullptr; // .ceph/mds%d dir
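These two maps index CInode objects by different keys: head (live) inodes by ino, snapshot inodes by (ino, snapid). A simplified sketch of how an ino is resolved against them, paraphrasing what MDCache::get_inode() does rather than copying it:

```cpp
// Simplified sketch of an inode lookup in the cache (not verbatim MDS code).
CInode *lookup_ino(inodeno_t ino, snapid_t snap)
{
  if (snap == CEPH_NOSNAP) {
    auto it = inode_map.find(ino);                         // head inodes keyed by ino
    return it != inode_map.end() ? it->second : nullptr;
  }
  auto it = snap_inode_map.find(vinodeno_t(ino, snap));    // snap inodes keyed by (ino, snapid)
  return it != snap_inode_map.end() ? it->second : nullptr;
}
```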
MDS source code analysis and data I/O
- The MDCache.cc file
The MDCache constructor
```cpp
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
mds(m),
open_file_table(m),
filer(m->objecter, m->finisher),
stray_manager(m, purge_queue_),
recovery_queue(m),
trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
//create the Migrator; note it holds references to this MDCache and the MDSRank
migrator.reset(new Migrator(mds, this));
//maximum size of a single directory commit
max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
(g_conf()->mds_dir_max_commit_size << 20) :
(0.9 *(g_conf()->osd_max_write_size << 20));
//cache size limit, reservation, and the health warning threshold
cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
//initialize the LRU lists; see CDentry and include/lru.h for details
lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
bottom_lru.lru_set_midpoint(0);
decayrate.set_halflife(g_conf()->mds_decay_halflife);
upkeeper = std::thread(&MDCache::upkeep_main, this);
}
```
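One detail worth spelling out is the unit handling in max_dir_commit_size above: mds_dir_max_commit_size is configured in MiB, so the `<< 20` shift converts it to bytes, and when it is unset (0) the limit falls back to 90% of the OSD's maximum write size. A small self-contained illustration with assumed values (not Ceph defaults):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example values for illustration only.
  uint64_t mds_dir_max_commit_size = 10;   // MiB, as configured
  uint64_t osd_max_write_size = 90;        // MiB
  uint64_t max_dir_commit_size = mds_dir_max_commit_size
      ? (mds_dir_max_commit_size << 20)                      // 10 MiB -> 10485760 bytes
      : (uint64_t)(0.9 * (osd_max_write_size << 20));        // otherwise 90% of the OSD write limit
  std::printf("%llu\n", (unsigned long long)max_dir_commit_size);  // prints 10485760
  return 0;
}
```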
The path lookup function

```cpp
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                           const filepath& path, int flags,
                           vector<CDentry*> *pdnvec, CInode **pin)
{ //given a file path, fill pdnvec with the CDentry chain and *pin with the CInode the path resolves to
//decode the traversal flags (discover vs. forward, locking behaviour, etc.)
bool discover = (flags & MDS_TRAVERSE_DISCOVER);
bool forward = !discover;
bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
if (forward)
ceph_assert(mdr); // forward requires a request
snapid_t snapid = CEPH_NOSNAP; //assume for now the path is not inside a snapshot
if (mdr)
mdr->snapid = snapid;
client_t client = mdr ? mdr->get_client() : -1;
if (mds->logger) mds->logger->inc(l_mds_traverse);
dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
CInode *cur = get_inode(path.get_ino()); //look up the base inode of the path in this MDS's cache
if (!cur) { //the base inode is not in cache; two special cases are handled
if (MDS_INO_IS_MDSDIR(path.get_ino())) { //the base is another MDS's mdsdir; open it remotely
open_foreign_mdsdir(path.get_ino(), cf.build());
return 1;
}
if (MDS_INO_IS_STRAY(path.get_ino())) { //the base inode is a stray directory
mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
filepath path(strays[idx]->get_parent_dn()->get_name(),
MDS_INO_MDSDIR(rank));
MDRequestRef null_ref;
return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
}
return -CEPHFS_ESTALE;
}
//at this point we hold the CInode of the path's base
if (cur->state_test(CInode::STATE_PURGING)) //bail out if that inode is being purged
return -CEPHFS_ESTALE;
...
// start trace
if (pdnvec) //clear the vector that will collect the dentries along the path
pdnvec->clear();
if (pin)
*pin = cur; //*pin returns the deepest inode reached
MutationImpl::LockOpVec lov;
//walk the path components, starting from the base inode
for (unsigned depth = 0; depth < path.depth(); ) {
dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
<< "' snapid " << snapid << dendl;
if (!cur->is_dir()) {
dout(7) << "traverse: " << *cur << " not a dir " << dendl;
return -CEPHFS_ENOTDIR;
}
...
// open the directory fragment for this component
frag_t fg = cur->pick_dirfrag(path[depth]);
CDir *curdir = cur->get_dirfrag(fg); //get the CDir structure from the inode
if (!curdir) {
...
// use the CDir obtained above to look up the next-level CDentry
CDentry *dn = curdir->lookup(path[depth], snapid); //search this dirfrag for the name
if (dn) { //the dentry is already in cache
if (dn->state_test(CDentry::STATE_PURGING)) //reject dentries that are being purged
return -CEPHFS_ENOENT;
if (rdlock_path) { //take the requested path locks
...
if (pdnvec) //record the dentry we just found
pdnvec->push_back(dn);
//the linkage records whether this dentry points to an inode
CDentry::linkage_t *dnl = dn->get_projected_linkage();
// can we conclude CEPHFS_ENOENT?
if (dnl->is_null()) {
dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
if (depth == path.depth() - 1) {
if (want_dentry)
break;
} else {
if (pdnvec)
pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
}
return -CEPHFS_ENOENT;
}
// follow the linkage obtained above to reach the inode
CInode *in = dnl->get_inode();
//assume this succeeded
...
cur = in;
if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
lov.clear();
lov.add_rdlock(&cur->snaplock);
if (!mds->locker->acquire_locks(mdr, lov)) {
dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
return 1;
}
}
// remember this inode and descend one level deeper
touch_inode(cur);
if (pin)
*pin = cur;
depth++;
continue;
}
ceph_assert(!dn);
// the dentry is not in cache
dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
if (curdir->is_auth()) {
// dentry is mine.
if (curdir->is_complete() ||
(snapid == CEPH_NOSNAP &&
curdir->has_bloom() &&
!curdir->is_in_bloom(path[depth]))) {
// the name does not exist in this directory
if (pdnvec) { //several cases where the full path was not traversed
// instantiate a null dn?
if (depth < path.depth() - 1) {
dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
} else if (snapid < CEPH_MAXSNAP) {
dout(20) << " not adding null for snapid " << snapid << dendl;
} else if (curdir->is_frozen()) {
dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
return 1;
} else {
// instantiate a new null dentry for the missing name
dn = curdir->add_null_dentry(path[depth]);
dout(20) << " added null " << *dn << dendl;
if (rdlock_path) {
lov.clear();
if (xlock_dentry) {
if (depth > 0 || !mdr->lock_cache) {
lov.add_wrlock(&cur->filelock);
lov.add_wrlock(&cur->nestlock);
if (rdlock_authlock)
lov.add_rdlock(&cur->authlock);
}
lov.add_xlock(&dn->lock);
} else {
// force client to flush async dir operation if necessary
if (cur->filelock.is_cached())
lov.add_wrlock(&cur->filelock);
lov.add_rdlock(&dn->lock);
}
if (!mds->locker->acquire_locks(mdr, lov)) {
dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
return 1;
}
}
}
if (dn) {
pdnvec->push_back(dn);
if (want_dentry)
break;
} else {
pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
}
}
return -CEPHFS_ENOENT;
} else {
// Check DamageTable for missing fragments before trying to fetch
// this
if (mds->damage_table.is_dirfrag_damaged(curdir)) {
dout(4) << "traverse: damaged dirfrag " << *curdir
<< ", blocking fetch" << dendl;
return -CEPHFS_EIO;
}
// directory isn't complete; reload
dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
touch_inode(cur);
curdir->fetch(cf.build(), path[depth]);
if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
return 1;
}
} else {
// dirfrag/dentry is not mine.
mds_authority_t dauth = curdir->authority();
if (forward &&
mdr && mdr->client_request &&
(int)depth < mdr->client_request->get_num_fwd()){
dout(7) << "traverse: snap " << snapid << " and depth " << depth
<< " < fwd " << mdr->client_request->get_num_fwd()
<< ", discovering instead of forwarding" << dendl;
discover = true;
}
if ((discover)) {
dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
path_locked);
if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
return 1;
}
if (forward) {
// forward
dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
if (curdir->is_ambiguous_auth()) {
// wait
dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
return 1;
}
dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
request_forward(mdr, dauth.first);
if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
return 2;
}
}
ceph_abort(); // i shouldn't get here
}
//does the caller require us to be the auth MDS for the final inode?
if (want_auth && !want_dentry) {
if (cur->is_ambiguous_auth()) {
dout(10) << "waiting for single auth on " << *cur << dendl;
cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
return 1;
}
if (!cur->is_auth()) {
dout(10) << "fw to auth for " << *cur << dendl;
request_forward(mdr, cur->authority().first);
return 2;
}
}
// success
if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
dout(10) << "path_traverse finish on snapid " << snapid << dendl;
if (mdr)
ceph_assert(mdr->snapid == snapid);
if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
mdr->locking_state |= MutationImpl::SNAP_LOCKED;
else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
if (rdlock_path)
mdr->locking_state |= MutationImpl::PATH_LOCKED;
return 0;
}
```
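The return value convention is what callers key off: 0 means the traversal finished and *pin/pdnvec are filled in; a negative value is an error (-CEPHFS_ENOENT, -CEPHFS_ESTALE, ...); 1 means the caller must wait, since a context built from cf will be re-driven once the missing data arrives; 2 means the request was forwarded to the auth MDS. A hypothetical caller sketch (the names handle_lookup and reply_with_error are made up; in the real code Server::rdlock_path_pin_ref() plays this role):

```cpp
// Hypothetical caller illustrating only the return-value convention of path_traverse().
void handle_lookup(MDCache *mdcache, MDRequestRef &mdr, MDSContextFactory &cf,
                   const filepath &refpath)
{
  vector<CDentry*> trace;
  CInode *in = nullptr;
  int r = mdcache->path_traverse(mdr, cf, refpath, MDS_TRAVERSE_WANT_AUTH, &trace, &in);
  if (r > 0)
    return;                    // 1 = waiting on a fetch/discover/unfreeze, 2 = forwarded to another MDS
  if (r < 0) {
    reply_with_error(mdr, r);  // assumed helper; r is e.g. -CEPHFS_ENOENT or -CEPHFS_ESTALE
    return;
  }
  // r == 0: trace holds the dentries along the path, in points at the final inode
}
```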
- MDS file lookup walkthrough: open a client and run the command below, which creates a new file; the function calls that follow are listed afterwards:

```
touch /mnt/dir1/dir2/file2.txt
```
- The Server's client-request handler is invoked: the MDS message dispatcher receives the request from the client.
- Control enters the MDCache code, which obtains the inode number of the path's base (the root).
- The per-component traversal loop is entered; the first component is dir1: from the inode we pick the dirfrag, find the CDir, then the CDentry for the next level, and repeat until the loop finishes (the decomposition of the path into components is sketched after this list).
- The inode one level below dir1 is obtained (its dentry is recorded in the vector), and the traversal moves on to the next component, dir2.
- The same steps repeat until file2.txt is reached; its dentry is missing or null. For a create, MDS_TRAVERSE_WANT_DENTRY is set, so the null dentry for this last component is returned in the vector; otherwise -CEPHFS_ENOENT is returned.
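As referenced in the list above, the filepath argument decomposes the request path into a base inode number plus relative components. A small illustration, assuming the base resolved to the root inode:

```cpp
#include "include/filepath.h"   // Ceph's path helper used by path_traverse()

// Illustration only: how the request path looks inside path_traverse().
filepath path("dir1/dir2/file2.txt", CEPH_INO_ROOT);  // base ino + relative components
// path.get_ino() == CEPH_INO_ROOT  -> the inode the traversal starts from
// path.depth()   == 3              -> number of components to walk
// path[0] == "dir1", path[1] == "dir2", path[2] == "file2.txt"
```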
Adding and removing CInode entries in the cache
```cpp
void MDCache::add_inode(CInode *in)
{
// add to inode map
if (in->last == CEPH_NOSNAP) { //is this a head inode or a snapshot inode?
auto &p = inode_map[in->ino()];
ceph_assert(!p);
p = in;
} else {
auto &p = snap_inode_map[in->vino()];
ceph_assert(!p);
p = in;
}
if (in->ino() < MDS_INO_SYSTEM_BASE) {
if (in->ino() == CEPH_INO_ROOT)
root = in;
else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
myin = in;
else if (in->is_stray()) {
if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
}
}
if (in->is_base())
base_inodes.insert(in);
}
}
//remove an inode from the cache
void MDCache::remove_inode(CInode *o)
{
dout(14) << "remove_inode " << *o << dendl;
if (o->get_parent_dn()) {
// unlink it from its parent dentry, which must not be dirty at this point
CDentry *dn = o->get_parent_dn();
ceph_assert(!dn->is_dirty());
dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
}
if (o->is_dirty())
o->mark_clean();
if (o->is_dirty_parent())
o->clear_dirty_parent();
o->clear_scatter_dirty();
o->clear_clientwriteable();
o->item_open_file.remove_myself();
if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
export_pin_queue.erase(o);
if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
export_pin_delayed_queue.erase(o);
o->clear_ephemeral_pin(true, true);
// remove from inode map
if (o->last == CEPH_NOSNAP) {
inode_map.erase(o->ino());
} else {
o->item_caps.remove_myself();
snap_inode_map.erase(o->vino());
}
if (o->ino() < MDS_INO_SYSTEM_BASE) {
if (o == root) root = 0;
if (o == myin) myin = 0;
if (o->is_stray()) {
if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
}
}
if (o->is_base())
base_inodes.erase(o);
}
// delete it
ceph_assert(o->get_num_ref() == 0);
delete o;
}
```
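Both add_inode() and remove_inode() special-case a handful of well-known inode numbers. A standalone sketch of that classification (not MDS code; the function classify_ino is made up), using the same macros from mds/mdstypes.h that appear in the functions above:

```cpp
// Sketch: how an inode number is classified, mirroring the branches in add_inode().
const char *classify_ino(inodeno_t ino, mds_rank_t whoami)
{
  if (ino >= MDS_INO_SYSTEM_BASE)
    return "ordinary file/directory inode";
  if (ino == CEPH_INO_ROOT)
    return "root inode (cached in MDCache::root)";
  if (ino == MDS_INO_MDSDIR(whoami))
    return "this MDS's private directory (cached in MDCache::myin)";
  if (MDS_INO_IS_STRAY(ino) && MDS_INO_STRAY_OWNER(ino) == whoami)
    return "one of this MDS's stray directories (strays[MDS_INO_STRAY_INDEX(ino)])";
  return "other system inode";
}
```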
Printing the cache (show_cache reveals the cache's key data structures)

```cpp
void MDCache::show_cache()
{
if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
return;
dout(7) << "show_cache" << dendl;
auto show_func = [this](CInode *in) {
// unlinked?
if (!in->parent)
dout(7) << " unlinked " << *in << dendl;
// dirfrags?
auto&& dfs = in->get_dirfrags();
for (const auto& dir : dfs) {
dout(7) << " dirfrag " << *dir << dendl;
for (auto &p : dir->items) {
CDentry *dn = p.second;
dout(7) << " dentry " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_linkage();
if (dnl->is_primary() && dnl->get_inode())
dout(7) << " inode " << *dnl->get_inode() << dendl;
}
}
};
//dump every head inode
for (auto &p : inode_map)
show_func(p.second);
//dump every snapshot inode
for (auto &p : snap_inode_map)
show_func(p.second);
}
```