Whether the fast path or the slow path is taken, the function that actually hands out memory is buffered_rmqueue(); everything before it is only about choosing the most suitable place (zone) to allocate from.
As usual, let's start with the parameters:
struct zone *preferred_zone: the highest zone type the allocation is willing to accept (the preferred zone)
struct zone *zone: the zone the memory is actually allocated from
int order: the allocation order, i.e. 2^order pages
gfp_t gfp_flags: the allocation flags
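For orientation, here is a minimal sketch of the kind of request that ends up in this path: a caller asks the page allocator for 2^order contiguous pages, the zonelist walk in get_page_from_freelist() picks a zone, and buffered_rmqueue() does the final hand-out. The sketch only uses the public alloc_pages()/__free_pages() API; the module name and the order value are made up for illustration, it is not part of the code being analysed.

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages;

static int __init alloc_demo_init(void)
{
    /* Ask for 2^2 = 4 contiguous pages; internally this goes through
     * alloc_pages() -> the zonelist walk -> buffered_rmqueue() on the
     * chosen zone. */
    demo_pages = alloc_pages(GFP_KERNEL, 2);
    if (!demo_pages)
        return -ENOMEM;

    pr_info("got 4 pages starting at pfn %lu\n", page_to_pfn(demo_pages));
    return 0;
}

static void __exit alloc_demo_exit(void)
{
    __free_pages(demo_pages, 2);    /* give the pages back to the buddy system */
}

module_init(alloc_demo_init);
module_exit(alloc_demo_exit);
MODULE_LICENSE("GPL");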
page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype);

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD); /* was a cold page requested? */

again:
    if (likely(order == 0)) { /* single-page allocation: use the per-CPU page cache */
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags); /* disable local interrupts, saving the previous state first */
        pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's page cache for the zone */
        list = &pcp->lists[migratetype]; /* per-CPU freelist for the requested migrate type */
        if (list_empty(list)) { /* cache empty, e.g. earlier requests used a different migrate type */
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold); /* refill the per-CPU cache; analysed later in this article */
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_pageblock_migratetype(page));
    }

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}
struct zone contains a member struct per_cpu_pageset __percpu *pageset; it implements the hot/cold page allocator, where a hot page is one that is likely still resident in the CPU's hardware cache.
struct per_cpu_pageset {
    struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
    s8 expire;
#endif
#ifdef CONFIG_SMP
    s8 stat_threshold;
    s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
The per-CPU page cache structure:
struct per_cpu_pages {
    int count;  /* number of pages in the list */
    int high;   /* high watermark, emptying needed */
    int batch;  /* chunk size for buddy add/remove: pages are added to and
                 * removed from the cache in units of batch, not one by one */

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];   /* one free list per migrate type */
};
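A minimal user-space model of one pcp list may make the hot/cold behaviour easier to see. This is not kernel code: the list is an array of made-up pfns and the batch size is assumed to be 4. Hot requests take from the head, cold requests from the tail, and an empty list is refilled batch pages at a time, which is what rmqueue_bulk() below does for the real cache.

#include <stdio.h>

#define BATCH 4
#define CAP   64

static unsigned long list[CAP];         /* pretend these are page frame numbers */
static int count;
static unsigned long next_pfn = 1000;   /* stand-in for the buddy system */

static void refill(void)
{
    for (int i = 0; i < BATCH; i++)
        list[count++] = next_pfn++;     /* "rmqueue_bulk": add BATCH pages at once */
}

static unsigned long take_page(int cold)
{
    if (count == 0)
        refill();
    if (cold)
        return list[--count];           /* cold: take from the tail */
    /* hot: take from the head and shift the rest down */
    unsigned long pfn = list[0];
    for (int i = 1; i < count; i++)
        list[i - 1] = list[i];
    count--;
    return pfn;
}

int main(void)
{
    printf("hot:  %lu\n", take_page(0));    /* head of the freshly refilled list */
    printf("cold: %lu\n", take_page(1));    /* tail of the list */
    printf("hot:  %lu\n", take_page(0));
    return 0;
}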
rmqueue_bulk() takes pages from the buddy system and uses them to refill the per-CPU cache:
/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, int cold)
{
    int mt = migratetype, i;

    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) { /* pull pages from the buddy system one block at a time */
        struct page *page = __rmqueue(zone, order, migratetype); /* allocate a page of the requested migrate type */
        if (unlikely(page == NULL))
            break;

        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */
        if (likely(cold == 0))
            list_add(&page->lru, list); /* hot page: add to the head of the list */
        else
            list_add_tail(&page->lru, list); /* cold page: add to the tail */
        if (IS_ENABLED(CONFIG_CMA)) { /* only when CONFIG_CMA is enabled */
            mt = get_pageblock_migratetype(page); /* migrate type of the page's pageblock */
            if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) /* neither MIGRATE_CMA nor MIGRATE_ISOLATE */
                mt = migratetype;
        }
        set_freepage_migratetype(page, mt); /* record the page's migrate type */
        list = &page->lru; /* advance so the next page is linked after this one */
        if (is_migrate_cma(mt)) /* MIGRATE_CMA pages have their own counter */
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                          -(1 << order)); /* decrease the CMA free-page count */
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); /* decrease the zone's free-page count */
    spin_unlock(&zone->lock);
    return i; /* number of pages placed on the per-CPU list */
}
Updating the page counters for the given statistics item:
static inline void __mod_zone_page_state(struct zone *zone,
            enum zone_stat_item item, int delta)
{
    zone_page_state_add(delta, zone, item);
}

static inline void zone_page_state_add(long x, struct zone *zone,
                 enum zone_stat_item item)
{
    atomic_long_add(x, &zone->vm_stat[item]);
    atomic_long_add(x, &vm_stat[item]);
}
/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    page = __rmqueue_smallest(zone, order, migratetype); /* normal case: allocate from the requested migrate type */

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { /* nothing found and this is not already an emergency allocation */
        page = __rmqueue_fallback(zone, order, migratetype); /* steal pages from the fallback migrate types */

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */
        if (!page) { /* still no page: switch to MIGRATE_RESERVE, i.e. an emergency allocation */
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve; /* retry */
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area * area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) { /* scan from the requested order upwards */
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype])) /* nothing free for this migrate type at this order */
            continue;

        page = list_entry(area->free_list[migratetype].next, /* take the first block off the list */
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page); /* clear the buddy flag, i.e. set page->_mapcount = -1, and clear private */
        area->nr_free--; /* note: nr_free counts page blocks at this order, not individual pages */
        expand(zone, page, order, current_order, area, migratetype); /* split the block in half repeatedly until the requested order is reached */
        return page;
    }

    return NULL;
}
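The order scan itself is simple enough to model in a few lines of stand-alone C (not kernel code; the free lists are reduced to a per-order block count and the numbers are made up): starting at the requested order, take the first order that still has a free block, and let expand() split it down.

#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
    int nr_free[MAX_ORDER] = { 0, 0, 0, 5, 2, 0, 1, 0, 0, 0, 0 };
    int order = 1;  /* caller wants an order-1 (two-page) block */

    for (int current_order = order; current_order < MAX_ORDER; current_order++) {
        if (nr_free[current_order] == 0)
            continue;   /* nothing free at this order, go one order up */
        printf("order %d requested, first free block found at order %d;"
               " expand() will split it down\n", order, current_order);
        return 0;
    }
    printf("no block found, __rmqueue_fallback() would be tried next\n");
    return 0;
}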
expand() is one of the buddy allocator's key functions. When a block is found at a higher order than requested, say order 8 when we only need order 6, this function splits it: the upper half of the order-8 block is hung on the order-7 free list, then the upper half of what remains is hung on the order-6 list; at that point the remaining piece is exactly the order we asked for and is returned directly. (A simplified stand-alone model of this splitting appears right after the expand() code below.)
The parameters:
struct zone *zone: all operations take place in this zone
struct page *page: the page block obtained at the higher order
int low: the order we actually need
int high: the order at which the block was found
struct free_area *area: the zone's free_area entry for the higher order
int migratetype: the migrate type
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) { /* nothing to do if the block was found at the requested order */
        area--;      /* step down to the next lower order's free_area entry */
        high--;      /* next lower order */
        size >>= 1;  /* half the block size */
        VM_BUG_ON(bad_range(zone, &page[size]));

#ifdef CONFIG_DEBUG_PAGEALLOC
        if (high < debug_guardpage_minorder()) {
            /*
             * Mark as guard pages (or page), that will allow to
             * merge back to allocator when buddy will be freed.
             * Corresponding page table entries will not be touched,
             * pages will stay not present in virtual address space
             */
            INIT_LIST_HEAD(&page[size].lru);
            set_page_guard_flag(&page[size]);
            set_page_private(&page[size], high);
            /* Guard pages are not available for any usage */
            __mod_zone_freepage_state(zone, -(1 << high),
                          migratetype);
            continue;
        }
#endif
        list_add(&page[size].lru, &area->free_list[migratetype]); /* hang the upper half on this order's list for the migrate type */
        area->nr_free++; /* one more free block at this order */
        set_page_order(&page[size], high); /* record the order in page->private and set the buddy flag: this half stays in the buddy system as a free block */
    }
}
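To make the splitting concrete, here is a small stand-alone model of the loop in expand() (not kernel code: the free lists are reduced to per-order counters and the page array to a base pfn). With low = 0 and high = 3 it prints the upper halves being handed to orders 2, 1 and 0 before the order-0 piece at the base is returned.

#include <stdio.h>

#define MAX_ORDER 11

static int nr_free[MAX_ORDER];  /* blocks currently free at each order */

static void expand_model(unsigned long base_pfn, int low, int high)
{
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;     /* the upper half is now an order-'high' block */
        nr_free[high]++;
        printf("put pfn %lu (order %d, %lu pages) on free list %d\n",
               base_pfn + size, high, size, high);
    }
    printf("return pfn %lu as the order-%d allocation\n", base_pfn, low);
}

int main(void)
{
    /* e.g. an order-3 block found while the caller wanted order 0 */
    expand_model(4096, 0, 3);
    return 0;
}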
Reaching this function means the allocation from the requested migrate type's free lists failed, so the fallback migrate-type lists have to be used instead.
/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
#ifdef CONFIG_CMA
    [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#else
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#endif
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
#endif
};
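A stand-alone sketch of how this table is consulted may help (CONFIG_CMA left out, and only the type names reproduced): for a given start_migratetype, __rmqueue_fallback() simply tries the entries of fallbacks[start_migratetype][] in order until it reaches MIGRATE_RESERVE, which is handled later by __rmqueue() itself.

#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *name[] = { "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE" };

static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },
};

int main(void)
{
    int start = MIGRATE_UNMOVABLE;

    printf("allocation for %s falls back to:", name[start]);
    for (int i = 0; fallbacks[start][i] != MIGRATE_RESERVE; i++)
        printf(" %s", name[fallbacks[start][i]]);
    printf("\n");
    return 0;
}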
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area * area;
    int current_order;
    struct page *page;
    int migratetype, i;

    /* Find the largest possible block of pages in the other list */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) { /* unlike __rmqueue_smallest, scan from the largest order downwards to limit fragmentation */
        for (i = 0;; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE) /* the reserve is the last resort, not yet */
                break;

            area = &(zone->free_area[current_order]); /* free_area entry for this (large) order */
            if (list_empty(&area->free_list[migratetype])) /* nothing free here, try the next fallback migrate type */
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru); /* found a free block: take it and decrement this order's block count */
            area->nr_free--;

            /*
             * If breaking a large block of pages, move all free
             * pages to the preferred allocation list. If falling
             * back for a reclaimable kernel allocation, be more
             * aggressive about taking ownership of free pages
             *
             * On the other hand, never change migration
             * type of MIGRATE_CMA pageblocks nor move CMA
             * pages on different free lists. We don't
             * want unmovable pages to be allocated from
             * MIGRATE_CMA areas.
             */
            /* The code below decides what to do with the rest of the stolen block;
             * the kernel comment above explains the policy.  Two related globals:
             * pageblock_order is the order the kernel regards as a large allocation
             * (configuration dependent, often MAX_ORDER - 1), and pageblock_nr_pages
             * is the number of pages in a block of that order. */
            if (!is_migrate_cma(migratetype) && /* never convert CMA pageblocks */
                (unlikely(current_order >= pageblock_order / 2) || /* large block: move the whole thing to start_migratetype */
                 start_migratetype == MIGRATE_RECLAIMABLE || /* reclaimable allocations convert more aggressively */
                 page_group_by_mobility_disabled)) {
                int pages;
                pages = move_freepages_block(zone, page,
                            start_migratetype); /* move the free pages of this pageblock over to start_migratetype */

                /* Claim the whole block if over half of it is free */
                if (pages >= (1 << (pageblock_order-1)) ||
                        page_group_by_mobility_disabled)
                    set_pageblock_migratetype(page,
                                start_migratetype); /* re-tag the whole pageblock; move_freepages_block() above re-tagged the individual pages */

                migratetype = start_migratetype;
            }

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page); /* clear the buddy flag: this page is leaving the buddy system */

            /* Take ownership for orders >= pageblock_order */
            if (current_order >= pageblock_order &&
                !is_migrate_cma(migratetype))
                change_pageblock_range(page, current_order,
                            start_migratetype); /* re-tag every pageblock covered by this large block as start_migratetype */

            expand(zone, page, order, current_order, area,
                   is_migrate_cma(migratetype)
                 ? migratetype : start_migratetype); /* split the large buddy block down into smaller blocks */

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype);

            return page;
        }
    }

    return NULL;
}
int move_freepages_block(struct zone *zone, struct page *page,
                int migratetype)
{
    unsigned long start_pfn, end_pfn;
    struct page *start_page, *end_page;

    start_pfn = page_to_pfn(page); /* page frame number of the page */
    start_pfn = start_pfn & ~(pageblock_nr_pages-1); /* round down to a pageblock boundary; pageblock_nr_pages is the number of pages in one pageblock */
    start_page = pfn_to_page(start_pfn);
    end_page = start_page + pageblock_nr_pages - 1; /* migrate a whole pageblock at a time: converting pageblock_nr_pages contiguous pages at once keeps fragmentation down */
    end_pfn = start_pfn + pageblock_nr_pages - 1;

    /* Do not cross zone boundaries */
    if (!zone_spans_pfn(zone, start_pfn))
        start_page = page;
    if (!zone_spans_pfn(zone, end_pfn)) /* the range to migrate must lie within a single zone */
        return 0;

    return move_freepages(zone, start_page, end_page, migratetype); /* hand the aligned range to move_freepages() for the actual conversion */
}
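A quick worked example of the alignment arithmetic, assuming pageblock_order = 10 and therefore pageblock_nr_pages = 1024 (the real value depends on the kernel configuration; the pfn is made up):

#include <stdio.h>

int main(void)
{
    unsigned long pageblock_nr_pages = 1024;
    unsigned long pfn = 70000;      /* pfn of the stolen free page */

    unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
    unsigned long end_pfn   = start_pfn + pageblock_nr_pages - 1;

    /* prints 69632..70655: the whole pageblock containing pfn 70000 */
    printf("move_freepages() is asked to convert pfns %lu..%lu\n",
           start_pfn, end_pfn);
    return 0;
}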
/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
/* The kernel comment describes move_freepages() itself: it accepts arbitrary,
 * possibly unaligned ranges, and pageblock alignment is the caller's job,
 * which is exactly what move_freepages_block() above provides. */
int move_freepages(struct zone *zone,
              struct page *start_page, struct page *end_page,
              int migratetype)
{
    struct page *page;
    unsigned long order;
    int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */
    BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

    for (page = start_page; page <= end_page;) {
        /* Make sure we are not inadvertently changing nodes */
        VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

        if (!pfn_valid_within(page_to_pfn(page))) {
            page++;
            continue;
        }

        if (!PageBuddy(page)) { /* only pages that are still free blocks in the buddy system are moved */
            page++;
            continue;
        }

        order = page_order(page); /* order of this free block */
        list_move(&page->lru,
              &zone->free_area[order].free_list[migratetype]); /* move the block onto the free list of the target migrate type */
        set_freepage_migratetype(page, migratetype); /* record the new migrate type: page->index = migratetype */
        page += 1 << order; /* skip the whole block of 2^order pages at once */
        pages_moved += 1 << order;
    }

    return pages_moved; /* number of pages actually moved within the range */
}
static void change_pageblock_range(struct page *pageblock_page,
                    int start_order, int migratetype)
{
    int nr_pageblocks = 1 << (start_order - pageblock_order); /* number of pageblocks covered by an order start_order block */

    while (nr_pageblocks--) { /* tag each pageblock in turn */
        set_pageblock_migratetype(pageblock_page, migratetype); /* set this pageblock's migrate type */
        pageblock_page += pageblock_nr_pages; /* advance to the start of the next pageblock */
    }
}
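As a worked example (the orders are assumed, not taken from any particular configuration): with pageblock_order = 9, i.e. 512-page pageblocks, stealing a block of current_order = 10 gives nr_pageblocks = 1 << (10 - 9) = 2, so the loop calls set_pageblock_migratetype() twice and re-tags both 512-page pageblocks covered by the 1024-page block as start_migratetype.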