運行陣列意味著陣列經歷從無到有,建立了作為一個raid應有的屬性(如同步重建),並為隨後的讀寫做好了鋪墊。那麼運行陣列的時候到底做了哪些事情,讓原來的磁盤像變形金剛一樣組成一個新的巨無霸呢?現在就來看陣列運行處理流程:
5158 static int do_md_run(struct mddev *mddev)
5159 {
5160 int err;
5161
5162 err = md_run(mddev);
5163 if (err)
5164 goto out;
5165 err = bitmap_load(mddev);
5166 if (err) {
5167 bitmap_destroy(mddev);
5168 goto out;
5169 }
5170
5171 md_wakeup_thread(mddev->thread);
5172 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5173
5174 set_capacity(mddev->gendisk, mddev->array_sectors);
5175 revalidate_disk(mddev->gendisk);
5176 mddev->changed = 1;
5177 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5178 out:
5179 return err;
5180 }
如果說運行陣列的過程是一本書,那麼這個函數就是這本書的目錄,每一個目錄中都隱含著一個深刻的故事。
5162行,md_run運行陣列,這個函數比較長,我們按一段一段來分析:
4956 int md_run(struct mddev *mddev)
4957 {
4958 int err;
4959 struct md_rdev *rdev;
4960 struct md_personality *pers;
4961
4962 if (list_empty(&mddev->disks))
4963 /* cannot run an array with no devices.. */
4964 return -EINVAL;
4965
4966 if (mddev->pers)
4967 return -EBUSY;
4968 /* Cannot run until previous stop completes properly */
4969 if (mddev->sysfs_active)
4970 return -EBUSY;
4971
4972 /*
4973 * Analyze all RAID superblock(s)
4974 */
4975 if (!mddev->raid_disks) {
4976 if (!mddev->persistent)
4977 return -EINVAL;
4978 analyze_sbs(mddev);
4979 }
4962-4969行檢查,陣列還沒運行,所以直接到4978行。
4978行,analyze_sbs,分析超級塊,依次分析每一個磁盤的超級塊,不符合陣列需求的磁盤將會被踢出陣列。
3310 static void analyze_sbs(struct mddev * mddev)
3311 {
3312 int i;
3313 struct md_rdev *rdev, *freshest, *tmp;
3314 char b[BDEVNAME_SIZE];
3315
3316 freshest = NULL;
3317 rdev_for_each_safe(rdev, tmp, mddev)
3318 switch (super_types[mddev->major_version].
3319 load_super(rdev, freshest, mddev->minor_version)) {
3320 case 1:
3321 freshest = rdev;
3322 break;
3323 case 0:
3324 break;
3325 default:
3326 printk( KERN_ERR \
3327 "md: fatal superblock inconsistency in %s"
3328 " -- removing from array\n",
3329 bdevname(rdev->bdev,b));
3330 kick_rdev_from_array(rdev);
3331 }
3332
3333
3334 super_types[mddev->major_version].
3335 validate_super(mddev, freshest);
3336
3337 i = 0;
3338 rdev_for_each_safe(rdev, tmp, mddev) {
3339 if (mddev->max_disks &&
3340 (rdev->desc_nr >= mddev->max_disks ||
3341 i > mddev->max_disks)) {
3342 printk(KERN_WARNING
3343 "md: %s: %s: only %d devices permitted\n",
3344 mdname(mddev), bdevname(rdev->bdev, b),
3345 mddev->max_disks);
3346 kick_rdev_from_array(rdev);
3347 continue;
3348 }
3349 if (rdev != freshest)
3350 if (super_types[mddev->major_version].
3351 validate_super(mddev, rdev)) {
3352 printk(KERN_WARNING "md: kicking non-fresh %s"
3353 " from array!\n",
3354 bdevname(rdev->bdev,b));
3355 kick_rdev_from_array(rdev);
3356 continue;
3357 }
3358 if (mddev->level == LEVEL_MULTIPATH) {
3359 rdev->desc_nr = i++;
3360 rdev->raid_disk = rdev->desc_nr;
3361 set_bit(In_sync, &rdev->flags);
3362 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3363 rdev->raid_disk = -1;
3364 clear_bit(In_sync, &rdev->flags);
3365 }
3366 }
3367 }
3316-3331行,依次對陣列中每一個磁盤加載超級塊,如果是最新的超級塊則保存對應的struct md_rdev在freshest指針中,如果是不符合條件的超級塊,將會踢出陣列。
3319行,我們用1.2版本的超級塊,那麼對應這裡load_super為super_1_load函數,這個函數就是把超級塊信息從磁盤讀出來,然後保存在md_rdev->sb_page中。然而這個函數還額外做了一件事情,就是比較哪個磁盤的超級塊最新,看函數原型:
1433 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
第一個參數就是要加載超級塊的磁盤,第二個參數是目前為止擁有最新超級塊的磁盤,第一次比較時為空。當返回值為1時表示rdev為最新,當返回值為0時表示refdev仍然為最新超級塊,小於0表示非法超級塊。
3330行,將非法超級塊的磁盤踢出陣列。
3334行,對應的validate_super函數為super_1_validate,這個函數根據最新超級塊信息初始化了陣列struct mddev信息,這裡代碼省略了不相關的if分支:
1600 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1601 {
1602 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1603 __u64 ev1 = le64_to_cpu(sb->events);
1604
1605 rdev->raid_disk = -1;
1606 clear_bit(Faulty, &rdev->flags);
1607 clear_bit(In_sync, &rdev->flags);
1608 clear_bit(WriteMostly, &rdev->flags);
1609
1610 if (mddev->raid_disks == 0) {
1611 mddev->major_version = 1;
1612 mddev->patch_version = 0;
1613 mddev->external = 0;
1614 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1615 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1616 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1617 mddev->level = le32_to_cpu(sb->level);
1618 mddev->clevel[0] = 0;
1619 mddev->layout = le32_to_cpu(sb->layout);
1620 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1621 mddev->dev_sectors = le64_to_cpu(sb->size);
1622 mddev->events = ev1;
1623 mddev->bitmap_info.offset = 0;
1624 mddev->bitmap_info.space = 0;
1625 /* Default location for bitmap is 1K after superblock
1626 * using 3K - total of 4K
1627 */
1628 mddev->bitmap_info.default_offset = 1024 >> 9;
1629 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1630 mddev->reshape_backwards = 0;
1631
1632 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1633 memcpy(mddev->uuid, sb->set_uuid, 16);
1634
1635 mddev->max_disks = (4096-256)/2;
1636
1637 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1638 mddev->bitmap_info.file == NULL) {
1639 mddev->bitmap_info.offset =
1640 (__s32)le32_to_cpu(sb->bitmap_offset);
1641 /* Metadata doesn't record how much space is available.
1642 * For 1.0, we assume we can use up to the superblock
1643 * if before, else to 4K beyond superblock.
1644 * For others, assume no change is possible.
1645 */
1646 if (mddev->minor_version > 0)
1647 mddev->bitmap_info.space = 0;
1648 else if (mddev->bitmap_info.offset > 0)
1649 mddev->bitmap_info.space =
1650 8 - mddev->bitmap_info.offset;
1651 else
1652 mddev->bitmap_info.space =
1653 -mddev->bitmap_info.offset;
1654 }
1655
1656 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1657 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1658 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1659 mddev->new_level = le32_to_cpu(sb->new_level);
1660 mddev->new_layout = le32_to_cpu(sb->new_layout);
1661 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1662 if (mddev->delta_disks < 0 ||
1663 (mddev->delta_disks == 0 &&
1664 (le32_to_cpu(sb->feature_map)
1665 & MD_FEATURE_RESHAPE_BACKWARDS)))
1666 mddev->reshape_backwards = 1;
1667 } else {
1668 mddev->reshape_position = MaxSector;
1669 mddev->delta_disks = 0;
1670 mddev->new_level = mddev->level;
1671 mddev->new_layout = mddev->layout;
1672 mddev->new_chunk_sectors = mddev->chunk_sectors;
1673 }
1674
1675 }
...
1695 if (mddev->level != LEVEL_MULTIPATH) {
1696 int role;
1697 if (rdev->desc_nr < 0 ||
1698 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1699 role = 0xffff;
1700 rdev->desc_nr = -1;
1701 } else
1702 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1703 switch(role) {
1704 case 0xffff: /* spare */
1705 break;
1706 case 0xfffe: /* faulty */
1707 set_bit(Faulty, &rdev->flags);
1708 break;
1709 default:
1710 if ((le32_to_cpu(sb->feature_map) &
1711 MD_FEATURE_RECOVERY_OFFSET))
1712 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1713 else
1714 set_bit(In_sync, &rdev->flags);
1715 rdev->raid_disk = role;
1716 break;
1717 }
1718 if (sb->devflags & WriteMostly1)
1719 set_bit(WriteMostly, &rdev->flags);
1720 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1721 set_bit(Replacement, &rdev->flags);
1722 } else /* MULTIPATH are always insync */
1723 set_bit(In_sync, &rdev->flags);
1724
1725 return 0;
1726 }
1602行,獲取磁盤對應的超級塊信息。
1610行,if分支成立,進入初始化struct mddev結構體,就是將陣列磁盤中最新超級塊信息賦給struct mddev。
1695行,設置rdev->raid_disk和rdev->recovery_offset信息,注意這裡的role有幾個特殊值,0xffff表示熱備盤,0xfffe表示faulty盤。recovery_offset顧名思義就是已重建偏移,In_sync表示磁盤在同步狀態,WriteMostly表示該盤以寫為主(讀請求優先發給其他盤),只用於raid1陣列。
又回到analyze_sbs函數中,
3338行,這個循環遍歷陣列所有磁盤,依次validate每一個磁盤。這裡的作用就是給每一個磁盤定一個身份,到底是數據盤啊還是熱備盤,當然還有些磁盤超級塊信息檢查不合格,要淘汰出陣列的。
3350行,再一次進入validate_super函數,不過上一次主要作用是初始化struct mddev信息,這一次主要鑒定磁盤身份信息。
1600 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1601 {
1602 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1603 __u64 ev1 = le64_to_cpu(sb->events);
1604
1605 rdev->raid_disk = -1;
1606 clear_bit(Faulty, &rdev->flags);
1607 clear_bit(In_sync, &rdev->flags);
1608 clear_bit(WriteMostly, &rdev->flags);
1609
1610 if (mddev->raid_disks == 0) {
...
1675 } else if (mddev->pers == NULL) {
1676 /* Insist of good event counter while assembling, except for
1677 * spares (which don't need an event count) */
1678 ++ev1;
1679 if (rdev->desc_nr >= 0 &&
1680 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1681 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1682 if (ev1 < mddev->events)
1683 return -EINVAL;
1684 }
1695 if (mddev->level != LEVEL_MULTIPATH) {
1696 int role;
1697 if (rdev->desc_nr < 0 ||
1698 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1699 role = 0xffff;
1700 rdev->desc_nr = -1;
1701 } else
1702 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1703 switch(role) {
1704 case 0xffff: /* spare */
1705 break;
1706 case 0xfffe: /* faulty */
1707 set_bit(Faulty, &rdev->flags);
1708 break;
1709 default:
1710 if ((le32_to_cpu(sb->feature_map) &
1711 MD_FEATURE_RECOVERY_OFFSET))
1712 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1713 else
1714 set_bit(In_sync, &rdev->flags);
1715 rdev->raid_disk = role;
1716 break;
1717 }
1718 if (sb->devflags & WriteMostly1)
1719 set_bit(WriteMostly, &rdev->flags);
1720 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1721 set_bit(Replacement, &rdev->flags);
1722 } else /* MULTIPATH are always insync */
1723 set_bit(In_sync, &rdev->flags);
1724
1725 return 0;
1726 }
1610行,經過上一次struct mddev的初始化,這時mddev->raid_disks已經不為0了,if分支不成立。
1675行,陣列還未運行起來,if成立進入分支。
1679行,先判斷rdev->desc_nr是否合法,再判斷是否為數據盤。
1682行,如果為數據盤,則判斷超級塊的事件計數(events)是否足夠新:1678行先把ev1加1,所以允許比最新值落後1;再舊的超級塊,數據也不是最新的,就不能繼續留在陣列中了。
1695行,設置rdev->raid_disk和rdev->recovery_offset信息。
analyze_sbs函數已經完成,返回到md_run函數中繼續往下看:
4981 if (mddev->level != LEVEL_NONE)
4982 request_module("md-level-%d", mddev->level);
4983 else if (mddev->clevel[0])
4984 request_module("md-%s", mddev->clevel);
4985
4986 /*
4987 * Drop all container device buffers, from now on
4988 * the only valid external interface is through the md
4989 * device.
4990 */
4991 rdev_for_each(rdev, mddev) {
4992 if (test_bit(Faulty, &rdev->flags))
4993 continue;
4994 sync_blockdev(rdev->bdev);
4995 invalidate_bdev(rdev->bdev);
4996
4997 /* perform some consistency tests on the device.
4998 * We don't want the data to overlap the metadata,
4999 * Internal Bitmap issues have been handled elsewhere.
5000 */
5001 if (rdev->meta_bdev) {
5002 /* Nothing to check */;
5003 } else if (rdev->data_offset < rdev->sb_start) {
5004 if (mddev->dev_sectors &&
5005 rdev->data_offset + mddev->dev_sectors
5006 > rdev->sb_start) {
5007 printk("md: %s: data overlaps metadata\n",
5008 mdname(mddev));
5009 return -EINVAL;
5010 }
5011 } else {
5012 if (rdev->sb_start + rdev->sb_size/512
5013 > rdev->data_offset) {
5014 printk("md: %s: metadata overlaps data\n",
5015 mdname(mddev));
5016 return -EINVAL;
5017 }
5018 }
5019 sysfs_notify_dirent_safe(rdev->sysfs_state);
5020 }
4981-4984行,用於請求內核模塊加載,因為linux內核模塊可以按需加載,只有在需要該模塊的時候再加載這樣比較節約資源。
4991行,首先看注釋,丟掉原磁盤設置的緩存,從現在開始這些磁盤只能由md訪問了。就好像一個人要去當兵了,進入部隊之後原來的身份證作廢,新發了一個軍人證,並且這個人以後只歸部隊管了,地方政府法庭不能管。
4992行,判斷為faulty盤,壞盤就不用多費心思了。
4994行,刷磁盤buffer。
4995行,注銷掉原來的身份證。
4997行,看注釋,基本檢查,看磁盤上數據部分與超級塊是否overlap。rdev->data_offset表示磁盤上數據區開始偏移,rdev->sb_start表示超級塊開始偏移,mddev->dev_sectors表示磁盤用於陣列的空間,rdev->sb_size表示超級塊大小。
5019行,更新sysfs文件中磁盤state狀態。
5022 if (mddev->bio_set == NULL)
5023 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5024
5025 spin_lock(&pers_lock);
5026 pers = find_pers(mddev->level, mddev->clevel);
5027 if (!pers || !try_module_get(pers->owner)) {
5028 spin_unlock(&pers_lock);
5029 if (mddev->level != LEVEL_NONE)
5030 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5031 mddev->level);
5032 else
5033 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5034 mddev->clevel);
5035 return -EINVAL;
5036 }
5037 mddev->pers = pers;
5038 spin_unlock(&pers_lock);
5039 if (mddev->level != pers->level) {
5040 mddev->level = pers->level;
5041 mddev->new_level = pers->level;
5042 }
5043 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5044
5045 if (mddev->reshape_position != MaxSector &&
5046 pers->start_reshape == NULL) {
5047 /* This personality cannot handle reshaping... */
5048 mddev->pers = NULL;
5049 module_put(pers->owner);
5050 return -EINVAL;
5051 }
5052
5053 if (pers->sync_request) {
5054 /* Warn if this is a potentially silly
5055 * configuration.
5056 */
5057 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5058 struct md_rdev *rdev2;
5059 int warned = 0;
5060
5061 rdev_for_each(rdev, mddev)
5062 rdev_for_each(rdev2, mddev) {
5063 if (rdev < rdev2 &&
5064 rdev->bdev->bd_contains ==
5065 rdev2->bdev->bd_contains) {
5066 printk(KERN_WARNING
5067 "%s: WARNING: %s appears to be"
5068 " on the same physical disk as"
5069 " %s.\n",
5070 mdname(mddev),
5071 bdevname(rdev->bdev,b),
5072 bdevname(rdev2->bdev,b2));
5073 warned = 1;
5074 }
5075 }
5076
5077 if (warned)
5078 printk(KERN_WARNING
5079 "True protection against single-disk"
5080 " failure might be compromised.\n");
5081 }
5082
5083 mddev->recovery = 0;
5084 /* may be over-ridden by personality */
5085 mddev->resync_max_sectors = mddev->dev_sectors;
5086
5087 mddev->ok_start_degraded = start_dirty_degraded;
5088
5089 if (start_readonly && mddev->ro == 0)
5090 mddev->ro = 2; /* read-only, but switch on first write */
5022行,創建bio內存池,用於讀寫時克隆保存原始bio。
5026行,查找對應陣列級別的struct md_personality是否存在,經過我們在4982行的request_module之後,新加載的模塊會調用register_md_personality函數注冊struct md_personality結構體,所以這裡可以找到需要的pers。
5037行,將找到的pers賦值給mddev->pers。
5053行,這個if分支用於檢查陣列中是否有兩個struct md_rdev位於同一物理磁盤上。因為創建陣列可以用分區來創建,所以這裡需要檢查一下。如果兩個struct md_rdev位於同一物理磁盤上,不僅會導致陣列性能很差,而且正如5079行的警告所說,對單盤故障的保護也會打折扣。既然要玩raid就沒有必要那麼小氣嘛,直接用整個磁盤,沒有必要用磁盤分區。
5083行,初始化陣列sync標記。
5085行,初始化陣列最大同步偏移。
5087行,是否自動運行降級的髒陣列。可別小看了簡簡單單的一行代碼,卻代表了一個raid5陣列很復雜的問題。當一個raid5/6為髒並且降級時,就可能有數據錯誤的風險。為髒就是校驗盤數據未經過同步,再加上降級就表示這一條帶數據無法通過重建來恢復。所以md就不直接去運行陣列,而是由系統管理員手動運行。然而如果根文件系統是建立在raid上的時候,就會導致系統無法啟動,所以就提供一個內核模塊參數start_dirty_degraded來控制強制運行這樣的陣列。
但實際上情況並沒有看起來那麼嚴重,例如在一個raid5陣列上建立一個ext4文件系統,為髒部分代表陣列還沒有同步,而沒有同步的條帶是沒有文件存儲在條帶上的(如果存儲代表已經寫過,寫過的條帶是同步的),所以這個時候強制運行降級的髒陣列是沒有問題的。
5089行,在很多用戶的環境裡,經常會遇到一個問題,就是系統重啟之後用cat /proc/mdstat查看時,陣列處於resync=pending狀態。解決這個問題有兩個方法,一是使用命令mdadm --read-write /dev/md*,另一個是將模塊參數/sys/module/md_mod/parameters/start_ro設置為0。那麼為什麼要設置這樣一個狀態呢?代碼作者Neil Brown是為了解決Debian系統啟動時要做的一件重要事情,所以讓陣列進入這個臨時狀態。還好正如5090行注釋所說,只要對陣列有第一次寫操作就會自動解除這個臨時狀態,對於正常使用沒有影響。
5092 err = mddev->pers->run(mddev);
5093 if (err)
5094 printk(KERN_ERR "md: pers->run() failed ...\n");
5095 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5096 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5097 " but 'external_size' not in effect?\n", __func__);
5098 printk(KERN_ERR
5099 "md: invalid array_size %llu > default size %llu\n",
5100 (unsigned long long)mddev->array_sectors / 2,
5101 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5102 err = -EINVAL;
5103 mddev->pers->stop(mddev);
5104 }
5105 if (err == 0 && mddev->pers->sync_request &&
5106 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5107 err = bitmap_create(mddev);
5108 if (err) {
5109 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5110 mdname(mddev), err);
5111 mddev->pers->stop(mddev);
5112 }
5113 }
5114 if (err) {
5115 module_put(mddev->pers->owner);
5116 mddev->pers = NULL;
5117 bitmap_destroy(mddev);
5118 return err;
5119 }
5092行,毫無疑問一看函數名就知道這一行是重中之重。這裡選擇raid1的run作示例,因為raid1是比較簡單的,raid5和raid10在後面小節單獨講解。在講run之前先簡要說明一下mddev->pers->run是怎麼調用到各個模塊的run函數的?
首先每個模塊初始化的時候都會調用到register_md_personality函數,向md模塊注冊各自的struct md_personality結構:
7158 int register_md_personality(struct md_personality *p)
7159 {
7160 spin_lock(&pers_lock);
7161 list_add_tail(&p->list, &pers_list);
7162 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
7163 spin_unlock(&pers_lock);
7164 return 0;
7165 }
在md_run函數中根據mddev->level初始化mddev->pers,如果level為1,這裡pers就指向raid1的struct md_personality raid1_personality,那麼這裡調用的run函數也就是raid1中的run函數。接著看raid1中的run函數:
2769 static int run(struct mddev *mddev)
2770 {
2771 struct r1conf *conf;
2772 int i;
2773 struct md_rdev *rdev;
2774 int ret;
2775 bool discard_supported = false;
2776
2777 if (mddev->level != 1) {
2778 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
2779 mdname(mddev), mddev->level);
2780 return -EIO;
2781 }
2782 if (mddev->reshape_position != MaxSector) {
2783 printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
2784 mdname(mddev));
2785 return -EIO;
2786 }
2787 /*
2788 * copy the already verified devices into our private RAID1
2789 * bookkeeping area. [whatever we allocate in run(),
2790 * should be freed in stop()]
2791 */
2792 if (mddev->private == NULL)
2793 conf = setup_conf(mddev);
2794 else
2795 conf = mddev->private;
2796
2797 if (IS_ERR(conf))
2798 return PTR_ERR(conf);
2799
2800 if (mddev->queue)
2801 blk_queue_max_write_same_sectors(mddev->queue, 0);
2802
2803 rdev_for_each(rdev, mddev) {
2804 if (!mddev->gendisk)
2805 continue;
2806 disk_stack_limits(mddev->gendisk, rdev->bdev,
2807 rdev->data_offset << 9);
2808 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
2809 discard_supported = true;
2810 }
2811
2812 mddev->degraded = 0;
2813 for (i=0; i < conf->raid_disks; i++)
2814 if (conf->mirrors[i].rdev == NULL ||
2815 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2816 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2817 mddev->degraded++;
2818
2819 if (conf->raid_disks - mddev->degraded == 1)
2820 mddev->recovery_cp = MaxSector;
2821
2822 if (mddev->recovery_cp != MaxSector)
2823 printk(KERN_NOTICE "md/raid1:%s: not clean"
2824 " -- starting background reconstruction\n",
2825 mdname(mddev));
2826 printk(KERN_INFO
2827 "md/raid1:%s: active with %d out of %d mirrors\n",
2828 mdname(mddev), mddev->raid_disks - mddev->degraded,
2829 mddev->raid_disks);
2830
2831 /*
2832 * Ok, everything is just fine now
2833 */
2834 mddev->thread = conf->thread;
2835 conf->thread = NULL;
2836 mddev->private = conf;
2837
2838 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2839
2840 if (mddev->queue) {
2841 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2842 mddev->queue->backing_dev_info.congested_data = mddev;
2843 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2844
2845 if (discard_supported)
2846 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
2847 mddev->queue);
2848 else
2849 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
2850 mddev->queue);
2851 }
2852
2853 ret = md_integrity_register(mddev);
2854 if (ret)
2855 stop(mddev);
2856 return ret;
2857 }
2777-2786行,基本檢查。
2792行,域private未賦值,進入if分支。
2793行,配置raid1環境。俗話說,國有國法,家有家規。如果說struct mddev是國法,那麼setup_conf要建立的struct r1conf就是家規了,同樣對於raid5和raid10都有自己的家規struct r5conf和struct r10conf。struct mddev存放的是所有陣列共同的屬性,而各自struct r*conf存放的是私有的屬性,而這些私有屬性就是為了管理好各自管轄的磁盤。進入setup_conf函數:
2648 static struct r1conf *setup_conf(struct mddev *mddev)
2649 {
2650 struct r1conf *conf;
2651 int i;
2652 struct raid1_info *disk;
2653 struct md_rdev *rdev;
2654 int err = -ENOMEM;
2655
2656 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
2657 if (!conf)
2658 goto abort;
2659
2660 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2661 * mddev->raid_disks * 2,
2662 GFP_KERNEL);
2663 if (!conf->mirrors)
2664 goto abort;
2665
2666 conf->tmppage = alloc_page(GFP_KERNEL);
2667 if (!conf->tmppage)
2668 goto abort;
2669
2670 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
2671 if (!conf->poolinfo)
2672 goto abort;
2673 conf->poolinfo->raid_disks = mddev->raid_disks * 2;
2674 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2675 r1bio_pool_free,
2676 conf->poolinfo);
2677 if (!conf->r1bio_pool)
2678 goto abort;
2679
2680 conf->poolinfo->mddev = mddev;
2681
2682 err = -EINVAL;
2683 spin_lock_init(&conf->device_lock);
2684 rdev_for_each(rdev, mddev) {
2685 struct request_queue *q;
2686 int disk_idx = rdev->raid_disk;
2687 if (disk_idx >= mddev->raid_disks
2688 || disk_idx < 0)
2689 continue;
2690 if (test_bit(Replacement, &rdev->flags))
2691 disk = conf->mirrors + mddev->raid_disks + disk_idx;
2692 else
2693 disk = conf->mirrors + disk_idx;
2694
2695 if (disk->rdev)
2696 goto abort;
2697 disk->rdev = rdev;
2698 q = bdev_get_queue(rdev->bdev);
2699 if (q->merge_bvec_fn)
2700 mddev->merge_check_needed = 1;
2701
2702 disk->head_position = 0;
2703 disk->seq_start = MaxSector;
2704 }
2705 conf->raid_disks = mddev->raid_disks;
2706 conf->mddev = mddev;
2707 INIT_LIST_HEAD(&conf->retry_list);
2708
2709 spin_lock_init(&conf->resync_lock);
2710 init_waitqueue_head(&conf->wait_barrier);
2711
2712 bio_list_init(&conf->pending_bio_list);
2713 conf->pending_count = 0;
2714 conf->recovery_disabled = mddev->recovery_disabled - 1;
2715
2716 err = -EIO;
2717 for (i = 0; i < conf->raid_disks * 2; i++) {
2718
2719 disk = conf->mirrors + i;
2720
2721 if (i < conf->raid_disks &&
2722 disk[conf->raid_disks].rdev) {
2723 /* This slot has a replacement. */
2724 if (!disk->rdev) {
2725 /* No original, just make the replacement
2726 * a recovering spare
2727 */
2728 disk->rdev =
2729 disk[conf->raid_disks].rdev;
2730 disk[conf->raid_disks].rdev = NULL;
2731 } else if (!test_bit(In_sync, &disk->rdev->flags))
2732 /* Original is not in_sync - bad */
2733 goto abort;
2734 }
2735
2736 if (!disk->rdev ||
2737 !test_bit(In_sync, &disk->rdev->flags)) {
2738 disk->head_position = 0;
2739 if (disk->rdev &&
2740 (disk->rdev->saved_raid_disk < 0))
2741 conf->fullsync = 1;
2742 }
2743 }
2744
2745 err = -ENOMEM;
2746 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2747 if (!conf->thread) {
2748 printk(KERN_ERR
2749 "md/raid1:%s: couldn't allocate thread\n",
2750 mdname(mddev));
2751 goto abort;
2752 }
2753
2754 return conf;
 
2656-2680行,申請與讀寫相關的資源,後面講讀寫的時候再深入。
2684行,對每個陣列中數據盤,在struct r1conf中建立關聯,讀寫時用到。
2697行,建立struct r1conf到struct md_rdev關聯。
2717行,磁盤replacement機制,這是陣列的高級特性,這裡先不關注。
2746行,注冊陣列處理線程。每個運行陣列都有這樣的一個主線程,主要負責檢查同步重建(只負責檢查,具體處理由另一線程負責)以及數據流處理。
小結一下,setup_conf函數主要作用是初始化struct r1conf,建立陣列數據流處理的上下文環境。
繼續回到raid1的run函數中。
2803行,遍歷陣列中每一個磁盤,通過disk_stack_limits把成員磁盤的struct queue_limits疊加到陣列上。每個塊設備都有一個struct queue_limits,表示塊設備隊列物理特性。這裡主要作用是讓陣列請求隊列根據各成員磁盤請求隊列的限制來調整請求塊大小和對齊。
2812-2817行,計算陣列降級磁盤數。
2834行,設置mddev->thread。
2836行,設置mddev->private為struct r1conf。
2838行,設置陣列大小。
2840-2851行,設置擁塞處理函數和請求合並函數。
2853行,塊設備integrity,有興趣可查看內核文檔的integrity說明。
run函數就結束了,小結一下,run函數的主要作用是建立陣列讀寫的上下文環境,包括struct r1conf,陣列主線程等等。
繼續回到md_run函數中。
5107行,創建陣列bitmap,具體過程在bitmap章節裡再詳細閱讀。
接下來就是一些sysfs的顯示和鏈接,最有欣賞價值的是mddev->safemode,什麼是安全模式呢?沒有寫(包括同步和重建寫)的時候就是安全模式,反之正在寫的時候就不安全。因為對於有數據冗余的陣列來說,每一份數據都至少要寫入兩個物理磁盤中,在寫的過程中程序異常或者系統掉電異常都會導致數據不一致,為了保證數據一致性,必須要在系統重啟之後做全盤同步。然而全盤同步需要花費很長時間,bitmap的出現在一定程度上解決了這個問題,但卻對陣列性能產生一定的消極作用。
經過了這麼長的跋山涉水,終於又回到do_md_run的溫暖懷抱了。這個函數不長,我們不厭其煩地再貼一次代碼:
5158 static int do_md_run(struct mddev *mddev)
5159 {
5160 int err;
5161
5162 err = md_run(mddev);
5163 if (err)
5164 goto out;
5165 err = bitmap_load(mddev);
5166 if (err) {
5167 bitmap_destroy(mddev);
5168 goto out;
5169 }
5170
5171 md_wakeup_thread(mddev->thread);
5172 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5173
5174 set_capacity(mddev->gendisk, mddev->array_sectors);
5175 revalidate_disk(mddev->gendisk);
5176 mddev->changed = 1;
5177 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5178 out:
5179 return err;
5180 }
5165行,加載bitmap,同樣留到bitmap章節再詳解。5171行,喚醒陣列主線程。5172行,喚醒陣列同步線程。5174行,設置虛擬gendisk磁盤大小。5175行,運行磁盤,讓磁盤為系統可見。5176行,設置md改變標志。5177行,上報磁盤信息到udev。do_md_run完成,RUN_ARRAY命令也就執行完成了。小結一下,do_md_run函數的作用就是向上虛擬一個塊設備,向下包裝磁盤,建立讀寫請求的通道,使對md設備的請求能夠轉發到磁盤上去。下一小節就介紹raid5陣列的運行。
出處:http://blog.csdn.net/liumangxiong