0%

ceph mds

  • ceph mds源码分析(MDCache.cc篇)

  • 数据结构介绍

    1. CInode(cache inode):CInode 包含某一文件的元数据,每个文件都有一个 CInode 。它存储着类似谁拥有这个文件、此文件有多大这样的信息。
    2. CDentry(cache dentry):CDentry 用于把索引节点和文件(或目录)名关联到一起。一个 CDentry 最多可链接到一个 CInode (也可以不链接任何 CInode ),一个 CInode 可被多个 CDentry 链接。
    3. CDir(cache dir):CDir 仅存在于目录索引节点下,它用于在目录下链接 CDentry 。目录被分片时,一个 CInode 可以有多个 CDir 。
      下面是一个可行的数据组织形式:
      CInode
      CDir
       |   \
       |      \
       |         \
      CDentry   CDentry
      CInode    CInode
      CDir      CDir
       |         |  \
       |         |     \
       |         |        \
      CDentry   CDentry  CDentry
      CInode    CInode   CInode
  • mds定义以及成员介绍
    如果一个文件系统同时拥有多个mds,所有的元数据缓存会分布在mds集群中。对于每一个inode节点,在某一时刻只有唯一的一个mds被视为该inode的权威(auth)mds节点,只有这个mds拥有对该inode的修改权限;当然,其他的mds节点也可以申请成为该inode的权威mds节点。

    • ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino
    • std::map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by vino (vinodeno_t)
    • CInode *root = nullptr; // root inode
    • CInode *myin = nullptr; // .ceph/mds%d dir
  • mds源码分析以及数据io

    • MDCache.cc文件
      1. 主函数

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        // Build the metadata cache for MDS rank `m`. Sub-components are wired up in
        // the initializer list; tunables are read from the global configuration in
        // the body; the upkeep thread is started last.
        MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
          mds(m),
          open_file_table(m),
          filer(m->objecter, m->finisher),
          stray_manager(m, purge_queue_),
          recovery_queue(m),
          trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
        {
          // Single handle on the global configuration for the reads below.
          auto&& conf = g_conf();

          // The migrator is given both the owning rank and this cache.
          migrator.reset(new Migrator(mds, this));

          // Upper bound for one directory commit: the configured value scaled by
          // 2^20 when it is non-zero, otherwise 90% of osd_max_write_size scaled
          // by 2^20.
          max_dir_commit_size = conf->mds_dir_max_commit_size ?
            (conf->mds_dir_max_commit_size << 20) :
            (0.9 * (conf->osd_max_write_size << 20));

          // Cache size limit, reservation and health-warning threshold.
          cache_memory_limit = conf.get_val<Option::size_t>("mds_cache_memory_limit");
          cache_reservation = conf.get_val<double>("mds_cache_reservation");
          cache_health_threshold = conf.get_val<double>("mds_health_cache_threshold");

          // Ephemeral export pin settings.
          export_ephemeral_distributed_config = conf.get_val<bool>("mds_export_ephemeral_distributed");
          export_ephemeral_random_config = conf.get_val<bool>("mds_export_ephemeral_random");
          export_ephemeral_random_max = conf.get_val<double>("mds_export_ephemeral_random_max");

          // LRU midpoints (the LRU itself lives in include/lru.h).
          lru.lru_set_midpoint(conf.get_val<double>("mds_cache_mid"));
          bottom_lru.lru_set_midpoint(0);

          decayrate.set_halflife(conf->mds_decay_halflife);

          // Started last so every member above is initialized before
          // upkeep_main() can run on the new thread.
          upkeeper = std::thread(&MDCache::upkeep_main, this);
        }
      2. 文件路径查找函数

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        38
        39
        40
        41
        42
        43
        44
        45
        46
        47
        48
        49
        50
        51
        52
        53
        54
        55
        56
        57
        58
        59
        60
        61
        62
        63
        64
        65
        66
        67
        68
        69
        70
        71
        72
        73
        74
        75
        76
        77
        78
        79
        80
        81
        82
        83
        84
        85
        86
        87
        88
        89
        90
        91
        92
        93
        94
        95
        96
        97
        98
        99
        100
        101
        102
        103
        104
        105
        106
        107
        108
        109
        110
        111
        112
        113
        114
        115
        116
        117
        118
        119
        120
        121
        122
        123
        124
        125
        126
        127
        128
        129
        130
        131
        132
        133
        134
        135
        136
        137
        138
        139
        140
        141
        142
        143
        144
        145
        146
        147
        148
        149
        150
        151
        152
        153
        154
        155
        156
        157
        158
        159
        160
        161
        162
        163
        164
        165
        166
        167
        168
        169
        170
        171
        172
        173
        174
        175
        176
        177
        178
        179
        180
        181
        182
        183
        184
        185
        186
        187
        188
        189
        190
        191
        192
        193
        194
        195
        196
        197
        198
        199
        200
        201
        202
        203
        204
        205
        206
        207
        208
        209
        210
        211
        212
        213
        214
        215
        216
        217
        218
        219
        220
        221
        222
        223
        224
        225
        226
        227
        228
        229
        230
        231
        232
        233
        234
        235
        236
        237
        238
        239
        240
        241
        242
        243
        244
        245
        246
        247
        248
        249
        250
        251
        252
        253
        254
        255
        256
        257
        258
        259
        260
        261
        262
        263
        264
        265
        266
        267
        268
        269
        270
        271
        // Resolve `path` one component at a time, starting from the inode named by
        // path.get_ino().
        //
        // Return values on the paths visible in this excerpt:
        //    0  traversal finished (or stopped at the wanted null dentry);
        //    1  the request has to wait: a waiter, fetch or discover was queued
        //       with cf.build(), or acquire_locks() did not succeed;
        //    2  the request was handed to request_forward();
        //   <0  -CEPHFS_ESTALE, -CEPHFS_ENOTDIR, -CEPHFS_ENOENT or -CEPHFS_EIO.
        // When pdnvec is non-null it receives the dentries that were visited; when
        // pin is non-null it receives the last inode that was reached.
        // NOTE: lines consisting only of `...` mark code omitted from this excerpt.
        int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,const filepath& path, int flags,vector<CDentry*> *pdnvec, CInode **pin){    // takes a file path; reports the traversed CDentry vector and the CInode reached
        // decode the option bits carried in `flags`
        bool discover = (flags & MDS_TRAVERSE_DISCOVER);
        bool forward = !discover;
        bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
        bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
        bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
        bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
        bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
        bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
        bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);

        if (forward)
        ceph_assert(mdr); // forward requires a request

        snapid_t snapid = CEPH_NOSNAP; // start out with CEPH_NOSNAP (path assumed not to be a snapshot path)
        if (mdr)
        mdr->snapid = snapid;

        client_t client = mdr ? mdr->get_client() : -1;

        if (mds->logger) mds->logger->inc(l_mds_traverse);

        dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
        CInode *cur = get_inode(path.get_ino()); // look up the inode the path is based on
        if (!cur) { // base inode not returned by get_inode(): two special cases follow
        if (MDS_INO_IS_MDSDIR(path.get_ino())) { // base ino is an mdsdir ino: open the foreign mdsdir and wait
        open_foreign_mdsdir(path.get_ino(), cf.build());
        return 1;
        }
        if (MDS_INO_IS_STRAY(path.get_ino())) { // base ino is a stray ino: re-run as a discover on a path rooted at the owner's mdsdir
        mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
        unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
        filepath path(strays[idx]->get_parent_dn()->get_name(),
        MDS_INO_MDSDIR(rank));
        MDRequestRef null_ref;
        return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
        }
        return -CEPHFS_ESTALE;
        }
        // at this point cur is the base inode of the path
        if (cur->state_test(CInode::STATE_PURGING)) // an inode in STATE_PURGING is reported as stale
        return -CEPHFS_ESTALE;
        ...

        // start trace
        if (pdnvec) // start with an empty dentry vector
        pdnvec->clear();
        if (pin)
        *pin = cur; // pin reports the inode reached so far

        MutationImpl::LockOpVec lov;
        // walk the path one component at a time, starting from the base inode
        for (unsigned depth = 0; depth < path.depth(); ) {
        dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
        << "' snapid " << snapid << dendl;

        if (!cur->is_dir()) {
        dout(7) << "traverse: " << *cur << " not a dir " << dendl;
        return -CEPHFS_ENOTDIR;
        }

        ...

        // open the directory: pick the dirfrag for this path component
        frag_t fg = cur->pick_dirfrag(path[depth]);
        CDir *curdir = cur->get_dirfrag(fg); // CDir of cur for that frag
        if (!curdir) {

        ...

        // use the CDir obtained above to reach the CDentry one level down
        CDentry *dn = curdir->lookup(path[depth], snapid); // lookup by component name and snapid
        if (dn) { // dentry found
        if (dn->state_test(CDentry::STATE_PURGING)) // a dentry in STATE_PURGING is reported as missing
        return -CEPHFS_ENOENT;

        if (rdlock_path) { // lock handling (elided in this excerpt)

        ...

        if (pdnvec) // record the dentry that was found
        pdnvec->push_back(dn);

        // the linkage records whether this dentry points at an inode
        CDentry::linkage_t *dnl = dn->get_projected_linkage();
        // can we conclude CEPHFS_ENOENT?
        if (dnl->is_null()) {
        dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
        if (depth == path.depth() - 1) {
        if (want_dentry)
        break;
        } else {
        if (pdnvec)
        pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
        }
        return -CEPHFS_ENOENT;
        }

        // follow the linkage obtained above to the inode
        CInode *in = dnl->get_inode();
        // assume that succeeded (remaining handling elided in this excerpt)
        ...

        cur = in;

        if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
        lov.clear();
        lov.add_rdlock(&cur->snaplock);
        if (!mds->locker->acquire_locks(mdr, lov)) {
        dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
        return 1;
        }
        }

        // step into this inode and continue with the next path component
        touch_inode(cur);
        if (pin)
        *pin = cur;
        depth++;
        continue;
        }

        ceph_assert(!dn);

        // no dentry was found for this component
        dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

        if (curdir->is_auth()) {
        // dentry is mine.
        if (curdir->is_complete() ||
        (snapid == CEPH_NOSNAP &&
        curdir->has_bloom() &&
        !curdir->is_in_bloom(path[depth]))) {
        // the name does not exist
        if (pdnvec) { // decide whether a null dentry can be handed back
        // instantiate a null dn?
        if (depth < path.depth() - 1) {
        dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
        } else if (snapid < CEPH_MAXSNAP) {
        dout(20) << " not adding null for snapid " << snapid << dendl;
        } else if (curdir->is_frozen()) {
        dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
        curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
        return 1;
        } else {
        // create a new null dentry
        dn = curdir->add_null_dentry(path[depth]);
        dout(20) << " added null " << *dn << dendl;

        if (rdlock_path) {
        lov.clear();
        if (xlock_dentry) {
        if (depth > 0 || !mdr->lock_cache) {
        lov.add_wrlock(&cur->filelock);
        lov.add_wrlock(&cur->nestlock);
        if (rdlock_authlock)
        lov.add_rdlock(&cur->authlock);
        }
        lov.add_xlock(&dn->lock);
        } else {
        // force client to flush async dir operation if necessary
        if (cur->filelock.is_cached())
        lov.add_wrlock(&cur->filelock);
        lov.add_rdlock(&dn->lock);
        }
        if (!mds->locker->acquire_locks(mdr, lov)) {
        dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
        return 1;
        }
        }
        }
        if (dn) {
        pdnvec->push_back(dn);
        if (want_dentry)
        break;
        } else {
        pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
        }
        }
        return -CEPHFS_ENOENT;
        } else {

        // Check DamageTable for missing fragments before trying to fetch
        // this
        if (mds->damage_table.is_dirfrag_damaged(curdir)) {
        dout(4) << "traverse: damaged dirfrag " << *curdir
        << ", blocking fetch" << dendl;
        return -CEPHFS_EIO;
        }

        // directory isn't complete; reload
        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
        touch_inode(cur);
        curdir->fetch(cf.build(), path[depth]);
        if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
        return 1;
        }
        } else {
        // dirfrag/dentry is not mine.
        mds_authority_t dauth = curdir->authority();

        if (forward &&
        mdr && mdr->client_request &&
        (int)depth < mdr->client_request->get_num_fwd()){
        dout(7) << "traverse: snap " << snapid << " and depth " << depth
        << " < fwd " << mdr->client_request->get_num_fwd()
        << ", discovering instead of forwarding" << dendl;
        discover = true;
        }

        if ((discover)) {
        dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
        discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
        path_locked);
        if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
        }
        if (forward) {
        // forward
        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

        if (curdir->is_ambiguous_auth()) {
        // wait
        dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
        curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
        return 1;
        }

        dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;

        request_forward(mdr, dauth.first);

        if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
        return 2;
        }
        }

        ceph_abort(); // i shouldn't get here
        }

        // the caller wants this MDS to be auth for the final inode
        if (want_auth && !want_dentry) {
        if (cur->is_ambiguous_auth()) {
        dout(10) << "waiting for single auth on " << *cur << dendl;
        cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
        return 1;
        }
        if (!cur->is_auth()) {
        dout(10) << "fw to auth for " << *cur << dendl;
        request_forward(mdr, cur->authority().first);
        return 2;
        }
        }

        // success
        if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
        dout(10) << "path_traverse finish on snapid " << snapid << dendl;
        if (mdr)
        ceph_assert(mdr->snapid == snapid);

        if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
        mdr->locking_state |= MutationImpl::SNAP_LOCKED;
        else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
        mdr->locking_state |= MutationImpl::SNAP2_LOCKED;

        if (rdlock_path)
        mdr->locking_state |= MutationImpl::PATH_LOCKED;

        return 0;
        }
      • mds文件查找流程整理
        1
        touch /mnt/dir1/dir2/file2.txt
        首先我们打开一个client端,输入以上的命令,代表创建一个新的文件,接下来会给出发生的函数调用:
        1. 调用server端的处理客户端请求的函数,调用mds的消息处理函数,接受来自client端的消息请求
        2. 进入mds的cache部分,获取到root的inode编号(ino)
        3. 进入遍历path的循环,这个时候打印path的值为dir1;先通过当前的inode找到对应的CDir,再在CDir中查找下一层的CDentry,如此往复直到循环结束。
        4. 客户端获取了dir1下一层的inode(存在vector里面),然后再次向mds发送下一层对应目录的inode和文件名dir2.
        5. 重复以上的操作直到执行到file2.txt,发现其对应的dentry是null(文件尚不存在),最后返回-CEPHFS_ENOENT,表示已经到达路径的最后一个组件。
    1. 给cache添加/删除cinode信息

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      // Register `in` with the cache: enter it into the matching inode map and,
      // for inode numbers below MDS_INO_SYSTEM_BASE, remember it in the
      // corresponding special slot (root, myin, strays, base_inodes).
      void MDCache::add_inode(CInode *in)
      {
        // Head inodes (last == CEPH_NOSNAP) are keyed by ino, everything else by
        // vino. The slot must be empty before it is filled.
        CInode *&p = (in->last == CEPH_NOSNAP)
          ? inode_map[in->ino()]
          : snap_inode_map[in->vino()];
        ceph_assert(!p);
        p = in;

        // Nothing more to track outside the system ino range.
        if (!(in->ino() < MDS_INO_SYSTEM_BASE))
          return;

        if (in->ino() == CEPH_INO_ROOT) {
          root = in;
        } else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) {
          myin = in;
        } else if (in->is_stray() &&
                   MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
          // Only stray dirs owned by this rank are remembered.
          strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
        }

        if (in->is_base())
          base_inodes.insert(in);
      }
      // Remove an inode from the cache and destroy it.
      void MDCache::remove_inode(CInode *o)
      {
        dout(14) << "remove_inode " << *o << dendl;

        // If the inode hangs off a parent dentry, that dentry must be clean;
        // unlink the inode from it.
        if (CDentry *dn = o->get_parent_dn()) {
          ceph_assert(!dn->is_dirty());
          dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
        }

        // Clear dirty / client state before the inode goes away.
        if (o->is_dirty()) {
          o->mark_clean();
        }
        if (o->is_dirty_parent()) {
          o->clear_dirty_parent();
        }
        o->clear_scatter_dirty();
        o->clear_clientwriteable();
        o->item_open_file.remove_myself();

        // Take it out of the export-pin queues it is flagged as being in.
        if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN)) {
          export_pin_queue.erase(o);
        }
        if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN)) {
          export_pin_delayed_queue.erase(o);
        }
        o->clear_ephemeral_pin(true, true);

        // remove from inode map
        if (o->last == CEPH_NOSNAP) {
          inode_map.erase(o->ino());
        } else {
          o->item_caps.remove_myself();
          snap_inode_map.erase(o->vino());
        }

        // Forget the special-slot references set up for system inodes.
        if (o->ino() < MDS_INO_SYSTEM_BASE) {
          if (root == o)
            root = nullptr;
          if (myin == o)
            myin = nullptr;
          if (o->is_stray() &&
              MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
            strays[MDS_INO_STRAY_INDEX(o->ino())] = nullptr;
          }
          if (o->is_base())
            base_inodes.erase(o);
        }

        // delete it
        ceph_assert(o->get_num_ref() == 0);
        delete o;
      }

    2. 打印cache信息(通过这里可以看到cache有哪些关键的数据结构)

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      void MDCache::show_cache()
      {
      if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
      return;
      dout(7) << "show_cache" << dendl;

      auto show_func = [this](CInode *in) {
      // unlinked?
      if (!in->parent)
      dout(7) << " unlinked " << *in << dendl;

      // dirfrags?
      auto&& dfs = in->get_dirfrags();
      for (const auto& dir : dfs) {
      dout(7) << " dirfrag " << *dir << dendl;

      for (auto &p : dir->items) {
      CDentry *dn = p.second;
      dout(7) << " dentry " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary() && dnl->get_inode())
      dout(7) << " inode " << *dnl->get_inode() << dendl;
      }
      }
      };
      //打印所有的inode
      for (auto &p : inode_map)
      show_func(p.second);
      //打印所有的snap_inode
      for (auto &p : snap_inode_map)
      show_func(p.second);
      }
  • 参考