If you have understood the run function of a raid1 array, raid5's run is easy to follow, because the two do much the same work.
raid5's run function is long, but a large part of it has nothing to do with creating and running the array; in particular there is a block of reshape-related code that most systems never use, so we can skip straight past it. After trimming, run looks like this:
5307 static int run(struct mddev *mddev)
5308 {
5309         struct r5conf *conf;
5310         int working_disks = 0;
5311         int dirty_parity_disks = 0;
5312         struct md_rdev *rdev;
5313         sector_t reshape_offset = 0;
5314         int i;
5315         long long min_offset_diff = 0;
5316         int first = 1;
...
5426         if (mddev->private == NULL)
5427                 conf = setup_conf(mddev);
5428         else
5429                 conf = mddev->private;
5430
5431         if (IS_ERR(conf))
5432                 return PTR_ERR(conf);
5433
5434         conf->min_offset_diff = min_offset_diff;
5435         mddev->thread = conf->thread;
5436         conf->thread = NULL;
5437         mddev->private = conf;
...
5491         /*
5492          * 0 for a fully functional array, 1 or 2 for a degraded array.
5493          */
5494         mddev->degraded = calc_degraded(conf);
...
5503         /* device size must be a multiple of chunk size */
5504         mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
5505         mddev->resync_max_sectors = mddev->dev_sectors;
...
5556         md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5557
5558         if (mddev->queue) {
...
5628         }
5629
5630         return 0;
Doesn't that look surprisingly simple? Some things seem complicated on the surface, but a careful look shows there is a pattern to them. This run function does the same job as raid1's run: it builds the context in which reads and writes will be handled.
Line 5427 creates the struct r5conf. Following into the function:
5131 static struct r5conf *setup_conf(struct mddev *mddev)
5132 {
5133         struct r5conf *conf;
5134         int raid_disk, memory, max_disks;
5135         struct md_rdev *rdev;
5136         struct disk_info *disk;
5137         char pers_name[6];
5138
5139         if (mddev->new_level != 5
5140             && mddev->new_level != 4
5141             && mddev->new_level != 6) {
5142                 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
5143                        mdname(mddev), mddev->new_level);
5144                 return ERR_PTR(-EIO);
5145         }
5146         if ((mddev->new_level == 5
5147              && !algorithm_valid_raid5(mddev->new_layout)) ||
5148             (mddev->new_level == 6
5149              && !algorithm_valid_raid6(mddev->new_layout))) {
5150                 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
5151                        mdname(mddev), mddev->new_layout);
5152                 return ERR_PTR(-EIO);
5153         }
5154         if (mddev->new_level == 6 && mddev->raid_disks < 4) {
5155                 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
5156                        mdname(mddev), mddev->raid_disks);
5157                 return ERR_PTR(-EINVAL);
5158         }
5159
5160         if (!mddev->new_chunk_sectors ||
5161             (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
5162             !is_power_of_2(mddev->new_chunk_sectors)) {
5163                 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
5164                        mdname(mddev), mddev->new_chunk_sectors << 9);
5165                 return ERR_PTR(-EINVAL);
5166         }
5167
5168         conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
5169         if (conf == NULL)
5170                 goto abort;
5171         spin_lock_init(&conf->device_lock);
5172         init_waitqueue_head(&conf->wait_for_stripe);
5173         init_waitqueue_head(&conf->wait_for_overlap);
5174         INIT_LIST_HEAD(&conf->handle_list);
5175         INIT_LIST_HEAD(&conf->hold_list);
5176         INIT_LIST_HEAD(&conf->delayed_list);
5177         INIT_LIST_HEAD(&conf->bitmap_list);
5178         INIT_LIST_HEAD(&conf->inactive_list);
5179         atomic_set(&conf->active_stripes, 0);
5180         atomic_set(&conf->preread_active_stripes, 0);
5181         atomic_set(&conf->active_aligned_reads, 0);
5182         conf->bypass_threshold = BYPASS_THRESHOLD;
5183         conf->recovery_disabled = mddev->recovery_disabled - 1;
5184
5185         conf->raid_disks = mddev->raid_disks;
5186         if (mddev->reshape_position == MaxSector)
5187                 conf->previous_raid_disks = mddev->raid_disks;
5188         else
5189                 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
5190         max_disks = max(conf->raid_disks, conf->previous_raid_disks);
5191         conf->scribble_len = scribble_len(max_disks);
5192
5193         conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
5194                               GFP_KERNEL);
5195         if (!conf->disks)
5196                 goto abort;
5197
5198         conf->mddev = mddev;
5199
5200         if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5201                 goto abort;
5202
5203         conf->level = mddev->new_level;
5204         if (raid5_alloc_percpu(conf) != 0)
5205                 goto abort;
5206
5207         pr_debug("raid456: run(%s) called.\n", mdname(mddev));
5208
5209         rdev_for_each(rdev, mddev) {
5210                 raid_disk = rdev->raid_disk;
5211                 if (raid_disk >= max_disks
5212                     || raid_disk < 0)
5213                         continue;
5214                 disk = conf->disks + raid_disk;
5215
5216                 if (test_bit(Replacement, &rdev->flags)) {
5217                         if (disk->replacement)
5218                                 goto abort;
5219                         disk->replacement = rdev;
5220                 } else {
5221                         if (disk->rdev)
5222                                 goto abort;
5223                         disk->rdev = rdev;
5224                 }
5225
5226                 if (test_bit(In_sync, &rdev->flags)) {
5227                         char b[BDEVNAME_SIZE];
5228                         printk(KERN_INFO "md/raid:%s: device %s operational as raid"
5229                                " disk %d\n",
5230                                mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
5231                 } else if (rdev->saved_raid_disk != raid_disk)
5232                         /* Cannot rely on bitmap to complete recovery */
5233                         conf->fullsync = 1;
5234         }
5235
5236         conf->chunk_sectors = mddev->new_chunk_sectors;
5237         conf->level = mddev->new_level;
5238         if (conf->level == 6)
5239                 conf->max_degraded = 2;
5240         else
5241                 conf->max_degraded = 1;
5242         conf->algorithm = mddev->new_layout;
5243         conf->max_nr_stripes = NR_STRIPES;
5244         conf->reshape_progress = mddev->reshape_position;
5245         if (conf->reshape_progress != MaxSector) {
5246                 conf->prev_chunk_sectors = mddev->chunk_sectors;
5247                 conf->prev_algo = mddev->layout;
5248         }
5249
5250         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5251                  max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5252         if (grow_stripes(conf, conf->max_nr_stripes)) {
5253                 printk(KERN_ERR
5254                        "md/raid:%s: couldn't allocate %dkB for buffers\n",
5255                        mdname(mddev), memory);
5256                 goto abort;
5257         } else
5258                 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
5259                        mdname(mddev), memory);
5260
5261         sprintf(pers_name, "raid%d", mddev->new_level);
5262         conf->thread = md_register_thread(raid5d, mddev, pers_name);
5263         if (!conf->thread) {
5264                 printk(KERN_ERR
5265                        "md/raid:%s: couldn't allocate thread.\n",
5266                        mdname(mddev));
5267                 goto abort;
5268         }
5269
5270         return conf;
Again, this function closely resembles raid1's setup_conf.
Line 5139: check the array level; raid4, raid5 and raid6 are supported.
Line 5147: check that the layout is valid for the chosen level (raid5 or raid6).
Line 5160: check the array chunk size; it must be a whole multiple of the page size and a power of two.
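As a concrete illustration of that check, here is a minimal stand-alone sketch of the same validation; the 4KB PAGE_SIZE and the sample chunk sizes are assumptions made for the example, not values taken from the kernel:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096                  /* assume 4KB pages for the example */

static bool chunk_valid(unsigned int chunk_sectors)
{
        unsigned int chunk_bytes = chunk_sectors << 9;   /* a sector is 512 bytes */

        if (!chunk_sectors)
                return false;
        if (chunk_bytes % PAGE_SIZE)                     /* must be a whole number of pages */
                return false;
        if (chunk_sectors & (chunk_sectors - 1))         /* must be a power of two */
                return false;
        return true;
}

int main(void)
{
        /* 1024 sectors = 512KB: a multiple of 4KB and a power of two, so it passes */
        printf("%d\n", chunk_valid(1024));
        /* 96 sectors = 48KB: a multiple of 4KB but not a power of two, so it fails */
        printf("%d\n", chunk_valid(96));
        return 0;
}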
Line 5168: allocate the struct r5conf and initialize it.
Line 5185: record the number of member disks in the array (raid_disks counts every device in the array, parity included).
Line 5193: allocate the struct disk_info array, which keeps the association with the member disks.
Line 5200: allocate the hash table that holds struct stripe_head, so the stripe_head covering a given sector can be found quickly.
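The table is just one page worth of hash buckets, and a stripe_head is filed under the bucket derived from its starting sector. Below is a small userspace sketch of the indexing idea; the constants (STRIPE_SHIFT, NR_HASH) are chosen to mimic what raid5.c does with 4KB pages and 4KB stripes, but this is a simplified illustration, not the kernel code:

#include <stdio.h>

#define PAGE_SIZE    4096
#define STRIPE_SHIFT 3                                   /* 4KB stripe = 8 sectors of 512B */
#define NR_HASH      (PAGE_SIZE / sizeof(void *))        /* one page worth of bucket heads */
#define HASH_MASK    (NR_HASH - 1)

/* bucket index for the stripe that starts at 'sector' */
static unsigned long stripe_hash_idx(unsigned long long sector)
{
        return (sector >> STRIPE_SHIFT) & HASH_MASK;
}

int main(void)
{
        /* stripes whose start sectors differ by one whole stripe land in neighbouring buckets */
        printf("%lu\n", stripe_hash_idx(0));
        printf("%lu\n", stripe_hash_idx(8));
        printf("%lu\n", stripe_hash_idx(16));
        return 0;
}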
Lines 5209-5234: the key line is 5223, which links each struct disk_info slot to its struct md_rdev; a sketch of struct disk_info is shown below.
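For reference, the struct disk_info being filled in here is tiny; in this kernel generation it is essentially just two pointers. The definition below is reproduced from memory of drivers/md/raid5.h, so treat it as a sketch rather than the exact source:

struct md_rdev;                         /* declared in drivers/md/md.h */

/* one slot per member position in the array */
struct disk_info {
        struct md_rdev *rdev;           /* the disk currently serving this position */
        struct md_rdev *replacement;    /* a disk being rebuilt to replace it, if any */
};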
Line 5236: set the chunk size.
Line 5237: set the array level.
Line 5241: set the maximum number of failed disks the array can tolerate (max_degraded): 2 for raid6, otherwise 1.
Line 5252: allocate the struct stripe_head slab. Following into grow_stripes:
1501 static int grow_stripes(struct r5conf *conf, int num)
1502 {
1503         struct kmem_cache *sc;
1504         int devs = max(conf->raid_disks, conf->previous_raid_disks);
1505
1506         if (conf->mddev->gendisk)
1507                 sprintf(conf->cache_name[0],
1508                         "raid%d-%s", conf->level, mdname(conf->mddev));
1509         else
1510                 sprintf(conf->cache_name[0],
1511                         "raid%d-%p", conf->level, conf->mddev);
1512         sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1513
1514         conf->active_name = 0;
1515         sc = kmem_cache_create(conf->cache_name[conf->active_name],
1516                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1517                                0, 0, NULL);
1518         if (!sc)
1519                 return 1;
1520         conf->slab_cache = sc;
1521         conf->pool_size = devs;
1522         while (num--)
1523                 if (!grow_one_stripe(conf))
1524                         return 1;
1525         return 0;
1526 }
Line 1504: work out the number of devices per stripe.
Lines 1506-1512: build the slab cache name.
Line 1515: create the slab cache. The object size is sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev) because struct stripe_head ends with an array of devs struct r5dev entries; the structure itself declares only one, so room for the remaining devs-1 is added at allocation time (see the sketch below).
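This is the classic C trailing-array idiom: the last member is declared as a one-element array and the extra elements are allocated right behind the structure. A self-contained sketch of the pattern, using made-up stand-in types (demo_stripe, demo_dev) rather than the kernel ones:

#include <stdio.h>
#include <stdlib.h>

struct demo_dev {
        char page[8];
};

struct demo_stripe {
        int count;
        struct demo_dev dev[1];   /* really 'devs' entries, laid out right after the struct */
};

int main(void)
{
        int devs = 4;
        struct demo_stripe *sh;

        /* same arithmetic as the kmem_cache_create() size above */
        sh = calloc(1, sizeof(struct demo_stripe) + (devs - 1) * sizeof(struct demo_dev));
        if (!sh)
                return 1;

        /* dev[0] .. dev[devs-1] all fit inside the allocation */
        for (int i = 0; i < devs; i++)
                sh->dev[i].page[0] = (char)i;

        printf("object size for %d devs: %zu bytes\n", devs,
               sizeof(struct demo_stripe) + (devs - 1) * sizeof(struct demo_dev));
        free(sh);
        return 0;
}

In newer C code the same layout is usually expressed with a flexible array member (dev[]), which avoids the dev[1] trick; the arithmetic stays the same.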
Line 1523: create the free struct stripe_head entries. If this were only a simple allocation there would be no need to look inside, but the function hides release_stripe, one of the most frequently called functions in raid5, so it is worth following:
1477 static int grow_one_stripe(struct r5conf *conf)
1478 {
1479         struct stripe_head *sh;
1480         sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1481         if (!sh)
1482                 return 0;
1483
1484         sh->raid_conf = conf;
1485
1486         spin_lock_init(&sh->stripe_lock);
1487
1488         if (grow_buffers(sh)) {
1489                 shrink_buffers(sh);
1490                 kmem_cache_free(conf->slab_cache, sh);
1491                 return 0;
1492         }
1493         /* we just created an active stripe so... */
1494         atomic_set(&sh->count, 1);
1495         atomic_inc(&conf->active_stripes);
1496         INIT_LIST_HEAD(&sh->lru);
1497         release_stripe(sh);
1498         return 1;
1499 }
 
Line 1480: allocate a new struct stripe_head.
Line 1484: link it to the struct r5conf.
Line 1488: grow_buffers allocates one page for each struct r5dev; these pages hold the stripe's copies of the data and are used when computing parity. Each page pointer is stored in sh->dev[].page (a sketch of what grow_buffers does follows).
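grow_buffers is essentially just that allocation loop. The following is a sketch reconstructed from the description above, not the verbatim kernel source:

/* sketch: allocate one page per r5dev slot of the stripe */
static int grow_buffers(struct stripe_head *sh)
{
        int i;
        int num = sh->raid_conf->pool_size;     /* one r5dev per member disk */

        for (i = 0; i < num; i++) {
                struct page *page = alloc_page(GFP_KERNEL);

                if (!page)
                        return 1;               /* caller cleans up via shrink_buffers() */
                sh->dev[i].page = page;
        }
        return 0;
}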
Line 1494: set the struct stripe_head reference count; it is dropped again by release_stripe at line 1497.
Line 1495: increment the array's count of active stripes.
Line 1496: initialize the lru list head.
Line 1497: release the struct stripe_head, which puts it on the free stripe list. release_stripe eventually reaches do_release_stripe, which executes the following lines:
228                 list_add_tail(&sh->lru, &conf->inactive_list);
229                 wake_up(&conf->wait_for_stripe);
230                 if (conf->retry_read_aligned)
231                         md_wakeup_thread(conf->mddev->thread);
Line 228 adds the struct stripe_head to inactive_list, the free stripe list. Line 229 wakes up any request waiting for a free stripe: each array has only a limited number of struct stripe_head, so a request that cannot get one has to sleep on this wait queue. Line 231 wakes the array thread to retry any pending chunk-aligned reads.

Back in setup_conf, grow_stripes has now allocated NR_STRIPES struct stripe_head for the array, and line 5262 creates the raid5 main thread; with that, setup_conf is finished. Returning to run: lines 5434-5437 tie conf and mddev together and copy fields between them, lines 5494-5556 fill in the remaining mddev fields, and line 5558 onwards initializes the request queue, including its struct queue_limits.

To sum up, raid5's run does the same basic job as raid1's: it presents a virtual block device upwards, wraps the member disks downwards, and opens the path for read and write requests. The difference is that raid5 I/O is built on struct stripe_head, and the whole raid5 read/write path revolves around it. The next section looks at how a raid10 array is run.
Source: http://blog.csdn.net/liumangxiong