歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
您现在的位置: Linux教程網 >> UnixLinux >> Linux基礎 >> 關於Linux

linux內存管理之伙伴系統(內存分配)

一、Linux伙伴系統分配器
伙伴系統分配器大體上分為兩類。__get_free_pages()類函數返回分配的第一個頁面的線性地址;alloc_pages()類函數返回頁面描述符地址。不管以哪種函數進行分配,最終會調用alloc_pages()進行分配頁面。
 
為清楚了解其分配制度,先給個伙伴系統數據的存儲框圖
 \
 
 
也就是每個order對應一個free_area結構,free_area以不同的類型以鏈表的方式存儲這些內存塊。
 
二、主分配函數
 
下面我們來看這個函數(在UMA模式下)
 
www.2cto.com
/* Allocate 2^order contiguous pages on the current NUMA node and return the
 * struct page of the first page (NULL on failure). Thin wrapper around
 * alloc_pages_node(). */
#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
  
www.2cto.com
/*
 * Allocate 2^order pages from node @nid (a negative nid means "current
 * node"). Selects the node's zonelist and delegates to __alloc_pages().
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    /* Unknown node is current node */
    if (nid < 0)
        nid = numa_node_id();

    return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
www.2cto.com
/*
 * Convenience wrapper: allocate from @zonelist with no nodemask
 * restriction. All allocation paths funnel into __alloc_pages_nodemask().
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist)
{
    return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
上層分配函數__alloc_pages_nodemask()
 
www.2cto.com
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
/* Top-level allocation entry: try the fast path (per-CPU cache / buddy
 * lists at the low watermark) first, then fall back to the slow path,
 * which is allowed to sleep and reclaim memory. */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zone *preferred_zone;
    struct page *page;

    /* Convert GFP flags to their corresponding migrate type */
    int migratetype = allocflags_to_migratetype(gfp_mask);

    gfp_mask &= gfp_allowed_mask;
    /* lockdep debugging hook */
    lockdep_trace_alloc(gfp_mask);
    /* If __GFP_WAIT is set, the caller may sleep and be rescheduled here */
    might_sleep_if(gfp_mask & __GFP_WAIT);
    /* fault-injection hook; compiles to nothing unless configured */
    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
    if (!preferred_zone)
        return NULL;

    /* First allocation attempt */
    /* Fast path: allocate normally from the per-CPU cache or the buddy
     * system, respecting the low watermark and cpuset constraints. */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
            preferred_zone, migratetype);
    if (unlikely(!page))/* Fast path failed: take the slow path, which may
                           wait and perform memory reclaim. */
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    /* tracing hook */
    trace_mm_page_alloc(page, order, gfp_mask, migratetype);
    return page;
}
三、從pcp和伙伴系統中正常的分配內存空間
 
函數get_page_from_freelist()
 
www.2cto.com
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
/* Walk every eligible zone in @zonelist, checking watermarks and cpuset
 * permissions, and allocate from the first zone that can satisfy the
 * request. Returns the allocated page or NULL. */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;    /* zonelist_cache approximation */
    int zlc_active = 0;     /* set if using zonelist_cache */
    int did_zlc_setup = 0;      /* just call zlc_setup() one time */

    /* index of the preferred zone, used for watermark checks below */
    classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        /* zonelist-cache shortcut; NUMA_BUILD is 0 on UMA, so this
         * whole branch is compiled out there */
        if (NUMA_BUILD && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;
        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))
                goto try_next_zone;

        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        /* unless the caller may ignore watermarks, check them */
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            unsigned long mark;
            int ret;
            /* pick the watermark (min/low/high) selected by alloc_flags */
            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
            /* watermark OK: allocate from this zone */
            if (zone_watermark_ok(zone, order, mark,
                    classzone_idx, alloc_flags))
                goto try_this_zone;

            /* watermark too low and zone reclaim disabled: give up
             * on this zone */
            if (zone_reclaim_mode == 0)
                goto this_zone_full;
            /* on UMA this returns 0 immediately */
            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                goto try_next_zone;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                goto this_zone_full;
            default:
                /* did we reclaim enough */
                if (!zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto this_zone_full;
            }
        }

try_this_zone:
        /* Try the per-CPU cache first, then the buddy system proper */
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (NUMA_BUILD)     /* no-op on UMA */
            zlc_mark_zone_full(zonelist, z);
try_next_zone:
        if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
            /*
             * we do zlc_setup after the first zone is tried but only
             * if there are multiple nodes make it worthwhile
             */
            allowednodes = zlc_setup(zonelist, alloc_flags);
            zlc_active = 1;
            did_zlc_setup = 1;
        }
    }

    if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }
    return page;    /* the allocated page, or NULL */
}
主分配函數
 
www.2cto.com
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
/* Allocate 2^order pages from @zone: single pages (order == 0) come from
 * the per-CPU page cache (pcp); larger requests go straight to the buddy
 * system under the zone lock. */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);  /* caller wants a cache-cold page */
    int cpu;

again:
    cpu  = get_cpu();
    if (likely(order == 0)) {   /* single page: use the per-CPU cache */
        struct per_cpu_pages *pcp;
        struct list_head *list;

        /* this zone's pcp for the current CPU */
        pcp = &zone_pcp(zone, cpu)->pcp;
        list = &pcp->lists[migratetype];    /* pcp list for this migrate type */

        /* Interrupts must be off: reclaim can IPI other cores to drain
         * their per-CPU caches, and interrupt handlers themselves
         * allocate single pages. */
        local_irq_save(flags);
        if (list_empty(list)) { /* pcp empty: refill it */
            /* pull pcp->batch pages at a time from the buddy system */
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            /* still empty: the refill failed, bail out */
            if (unlikely(list_empty(list)))
                goto failed;
        }
        /* Cold request: take the list tail (least recently freed,
         * hardware-cache cold). */
        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else    /* Hot request: take the head — most recently freed to
                 * the per-CPU cache, so hardware-cache hot. */
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);   /* detach from the pcp list */
        pcp->count--;           /* one fewer cached page */
    }
    else {  /* order > 0: bypass the pcp, allocate from the buddy system */
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        /* disable interrupts and take the zone lock */
        spin_lock_irqsave(&zone->lock, flags);
        /* allocate from the matching migrate-type free list */
        page = __rmqueue(zone, order, migratetype);
        /* 1 << order pages gone: update the zone free-page counter */
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
        /* drop only the spinlock; interrupts stay off until the
         * statistics below are updated */
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
    }
    /* event counters, for debugging/statistics */
    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone);
    local_irq_restore(flags);   /* re-enable interrupts */
    put_cpu();

    VM_BUG_ON(bad_range(zone, page));

    /* Sanity-check and prepare the page. If its flags are corrupt the
     * page is rejected and we retry with a different one. */
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    put_cpu();
    return NULL;
}
3.1 pcp緩存補充
 
從伙伴系統中獲得batch個頁面,batch為一次分配的頁面數;見rmqueue_bulk()函數。
 
www.2cto.com
/* 
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */ 
 /*該函數返回的是1<<order個頁面,但是在pcp
 處理中調用,其他地方沒看到,order為0
  也就是說返回的是頁面數,加入的鏈表為
  對應調用pcp的鏈表*/ 
static int rmqueue_bulk(struct zone *zone, unsigned int order,  
            unsigned long count, struct list_head *list, 
            int migratetype, int cold) 

    int i; 
     
    spin_lock(&zone->lock);/* 上層函數已經關了中斷,這裡需要操作管理區,獲取管理區的自旋鎖*/ 
    for (i = 0; i < count; ++i) {/* 重復指定的次數,從伙伴系統中分配頁面*/ 
        /* 從伙伴系統中取出頁面*/ 
        struct page *page = __rmqueue(zone, order, migratetype); 
        if (unlikely(page == NULL))/*分配失敗*/ 
            break; 
 
        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */ 
        if (likely(cold == 0))/*根據調用者的要求,將頁面放到每CPU緩存鏈表的頭部或者尾部*/ 
            list_add(&page->lru, list); 
        else 
            list_add_tail(&page->lru, list); 
        set_page_private(page, migratetype);/*設置private屬性為頁面的遷移類型*/ 
        list = &page->lru; 
    } 
    /*遞減管理區的空閒頁面計數*/ 
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 
    spin_unlock(&zone->lock);/*釋放管理區的子璇鎖*/ 
    return i; 

3.2 從伙伴系統中取出頁面
 
__rmqueue()函數
 
www.2cto.com
/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */ 
 /*采用兩種范式試著分配order個page*/ 
static struct page *__rmqueue(struct zone *zone, unsigned int order, 
                        int migratetype) 

    struct page *page; 
 
retry_reserve: 
    /*從指定order開始從小到達遍歷,優先從指定的遷移類型鏈表中分配頁面*/ 
    page = __rmqueue_smallest(zone, order, migratetype); 
     
        /*
         * 如果滿足以下兩個條件,就從備用鏈表中分配頁面:
         *        快速流程沒有分配到頁面,需要從備用遷移鏈表中分配.
         *        當前不是從保留的鏈表中分配.因為保留的鏈表是最後可用的鏈表,
             *  不能從該鏈表分配的話,說明本管理區真的沒有可用內存了.
         */  
    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 
        /*order從大到小遍歷,從備用鏈表中分配頁面*/ 
        page = __rmqueue_fallback(zone, order, migratetype); 
 
        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */ 
        if (!page) {/* 備用鏈表中沒有分配到頁面,從保留鏈表中分配頁面了*/ 
            migratetype = MIGRATE_RESERVE; 
            goto retry_reserve;/* 跳轉到retry_reserve,從保留的鏈表中分配頁面*/  
        } 
    } 
    /*調試代碼*/ 
    trace_mm_page_alloc_zone_locked(page, order, migratetype); 
    return page; 

3.2.1 從指定的遷移類型鏈表中分配頁面
 
從指定order開始從小到大遍歷,優先從指定的遷移類型鏈表中分配頁面__rmqueue_smallest(zone, order, migratetype);
 
www.2cto.com
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */ 
 /*從給定的order開始,從小到大遍歷;
  找到後返回頁面基址,合並分割後的空間*/ 
static inline 
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 
                        int migratetype) 

    unsigned int current_order; 
    struct free_area * area; 
    struct page *page; 
 
    /* Find a page of the appropriate size in the preferred list */ 
    for (current_order = order; current_order < MAX_ORDER; ++current_order) { 
        area = &(zone->free_area[current_order]);/*得到指定order的area*/ 
        /*如果area指定類型的伙伴系統鏈表為空*/ 
        if (list_empty(&area->free_list[migratetype])) 
            continue;/*查找下一個order*/ 
        /*對應的鏈表不空,得到鏈表中數據*/ 
        page = list_entry(area->free_list[migratetype].next, 
                            struct page, lru); 
        list_del(&page->lru);/*從伙伴系統中刪除;*/ 
        rmv_page_order(page);/*移除page中order的變量*/ 
        area->nr_free--;/*空閒塊數減一*/ 
        /*拆分、合並*/ 
        expand(zone, page, order, current_order, area, migratetype); 
        return page; 
    } 
 
    return NULL; 

伙伴系統內存塊拆分和合並
 
看一個輔助函數,用於伙伴系統中內存塊的拆分、合並
 
www.2cto.com
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */ 
 /*此函數主要用於下面這種情況:
  分配函數從high中分割出去了low大小的內存;
  然後要將high留下的內存塊合並放到伙伴系統中;*/ 
static inline void expand(struct zone *zone, struct page *page, 
    int low, int high, struct free_area *area, 
    int migratetype) 

    unsigned long size = 1 << high; 
 
    while (high > low) {/*因為去掉了low的大小,所以最後肯定剩下的
     是low的大小(2的指數運算)*/ 
        area--;/*減一到order減一的area*/ 
        high--;/*order減一*/ 
        size >>= 1;/*大小除以2*/ 
        VM_BUG_ON(bad_range(zone, &page[size])); 
        /*加到指定的伙伴系統中*/ 
        list_add(&page[size].lru, &area->free_list[migratetype]); 
        area->nr_free++;/*空閒塊加一*/ 
        set_page_order(&page[size], high);/*設置相關order*/ 
    } 

3.2.2 從備用鏈表中分配頁面
 
www.2cto.com
/* Remove an element from the buddy allocator from the fallback list */ 
static inline struct page * 
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 

    struct free_area * area; 
    int current_order; 
    struct page *page; 
    int migratetype, i; 
 
    /* Find the largest possible block of pages in the other list */ 
     
    /* 從最高階搜索,這樣可以盡量的將其他遷移列表中的大塊分割,避免形成過多的碎片*/ 
    for (current_order = MAX_ORDER-1; current_order >= order; 
                        --current_order) { 
        for (i = 0; i < MIGRATE_TYPES - 1; i++) { 
            /*回調到下一個migratetype*/ 
            migratetype = fallbacks[start_migratetype][i]; 
 
            /* MIGRATE_RESERVE handled later if necessary */ 
             
              /* 本函數不處理MIGRATE_RESERVE類型的遷移鏈表,如果本函數返回NULL,
            則上層函數直接從MIGRATE_RESERVE中分配*/ 
            if (migratetype == MIGRATE_RESERVE) 
                continue;/*訪問下一個類型*/ 
 
            area = &(zone->free_area[current_order]); 
            /*如果指定order和類型的鏈表為空*/ 
            if (list_empty(&area->free_list[migratetype])) 
                continue;/*訪問下一個類型*/ 
            /*得到指定類型和order的頁面基址*/ 
            page = list_entry(area->free_list[migratetype].next, 
                    struct page, lru); 
            area->nr_free--;/*空閒塊數減一*/ 
 
            /*
             * If breaking a large block of pages, move all free
             * pages to the preferred allocation list. If falling
             * back for a reclaimable kernel allocation, be more
             * agressive about taking ownership of free pages
             */ 
            if (unlikely(current_order >= (pageblock_order >> 1)) ||/* 要分割的頁面是一個大頁面,則將整個頁面全部遷移到當前遷移類型的鏈表中,
                這樣可以避免過多的碎片*/              
                    start_migratetype == MIGRATE_RECLAIMABLE ||/* 目前分配的是可回收頁面,這類頁面有突發的特點,將頁面全部遷移到可回收鏈表中,
                可以避免將其他遷移鏈表分割成太多的碎片*/       
                page_group_by_mobility_disabled) {/* 指定了遷移策略,總是將被分割的頁面遷移*/ 
                 
                unsigned long pages; 
                /*移動到先前類型的伙伴系統中*/ 
                pages = move_freepages_block(zone, page, 
                                start_migratetype); 
 
                /* Claim the whole block if over half of it is free */ 
                 
                 /* pages是移動的頁面數,如果可移動的頁面數量較多,
                則將整個大內存塊的遷移類型修改*/         
                if (pages >= (1 << (pageblock_order-1)) || 
                        page_group_by_mobility_disabled) 
                    /*設置頁面標示*/ 
                    set_pageblock_migratetype(page, 
                                start_migratetype); 
 
                migratetype = start_migratetype; 
            } 
 
            /* Remove the page from the freelists */ 
            list_del(&page->lru); 
            rmv_page_order(page); 
 
            /* Take ownership for orders >= pageblock_order */ 
            if (current_order >= pageblock_order)//大於pageblock_order的部分設置相應標示 
            /*這個不太可能,因為pageblock_order為10*/ 
                change_pageblock_range(page, current_order, 
                            start_migratetype); 
            /*拆分和合並*/ 
            expand(zone, page, order, current_order, area, migratetype); 
 
            trace_mm_page_alloc_extfrag(page, order, current_order, 
                start_migratetype, migratetype); 
 
            return page; 
        } 
    } 
 
    return NULL; 

備用鏈表
 
www.2cto.com
/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
/* When the free lists of a given migrate type are empty, this table
 * gives the other migrate types to try, in order of preference. */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
};
移動到指定類型的伙伴系統中
 
www.2cto.com
/*將指定區域段的頁面移動到指定類型的
  伙伴系統中,其實就是將頁面的類型做了
  更改,但是是采用移動的方式
 
 功能和上面函數類似,但是要求以
 頁面塊方式對其*/ 
static int move_freepages_block(struct zone *zone, struct page *page, 
                int migratetype) 

    unsigned long start_pfn, end_pfn; 
    struct page *start_page, *end_page; 
 
/*如下是對齊操作,其中變量pageblock_nr_pages為MAX_ORDER-1*/ 
    start_pfn = page_to_pfn(page); 
    start_pfn = start_pfn & ~(pageblock_nr_pages-1); 
    start_page = pfn_to_page(start_pfn); 
    end_page = start_page + pageblock_nr_pages - 1; 
    end_pfn = start_pfn + pageblock_nr_pages - 1; 
 
    /* Do not cross zone boundaries */ 
    if (start_pfn < zone->zone_start_pfn) 
        start_page = page; 
    /*結束邊界檢查*/ 
    if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 
        return 0; 
/*調用上面函數*/ 
    return move_freepages(zone, start_page, end_page, migratetype); 

www.2cto.com
/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */ 
 /*將指定區域段的頁面移動到指定類型的
  伙伴系統中,其實就是將頁面的類型做了 更改,但是是采用移動的方式*/ 
static int move_freepages(struct zone *zone, 
              struct page *start_page, struct page *end_page, 
              int migratetype) 

    struct page *page; 
    unsigned long order; 
    int pages_moved = 0; 
 
#ifndef CONFIG_HOLES_IN_ZONE 
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */ 
    BUG_ON(page_zone(start_page) != page_zone(end_page)); 
#endif 
 
    for (page = start_page; page <= end_page;) { 
        /* Make sure we are not inadvertently changing nodes */ 
        VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 
 
        if (!pfn_valid_within(page_to_pfn(page))) { 
            page++; 
            continue; 
        } 
 
        if (!PageBuddy(page)) { 
            page++; 
            continue; 
        } 
 
        order = page_order(page); 
        list_del(&page->lru);/*將頁面塊從原來的伙伴系統鏈表*/ 
        /*中刪除,注意,這裡不是一個頁面
        *而是以該頁面的伙伴塊*/ 
        list_add(&page->lru,/*添加到指定order和類型下的伙伴系統鏈表*/ 
            &zone->free_area[order].free_list[migratetype]); 
        page += 1 << order;/*移動頁面數往上定位*/ 
        pages_moved += 1 << order;/*移動的頁面數*/ 
    } 
 
    return pages_moved; 

四、慢速分配,允許等待和回收
 
www.2cto.com
/**
 * Slow path of the page allocator, entered when the fast path fails and
 * the caller may be allowed to wait. Memory reclaim can happen here:
 * kswapd is woken, watermarks are lowered, direct reclaim is attempted,
 * and as a last resort the OOM killer may run.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
    struct zonelist *zonelist, enum zone_type high_zoneidx,
    nodemask_t *nodemask, struct zone *preferred_zone,
    int migratetype)
{
    const gfp_t wait = gfp_mask & __GFP_WAIT;
    struct page *page = NULL;
    int alloc_flags;
    unsigned long pages_reclaimed = 0;
    unsigned long did_some_progress;
    struct task_struct *p = current;

    /*
     * In the slowpath, we sanity check order to avoid ever trying to
     * reclaim >= MAX_ORDER areas which will never succeed. Callers may
     * be using allocators in order of preference for an area that is
     * too large.
     */
    if (order >= MAX_ORDER) {
        WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
        return NULL;
    }

    /*
     * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
     * __GFP_NOWARN set) should not cause reclaim since the subsystem
     * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
     * using a larger set of nodes after it has established that the
     * allowed per node queues are empty and that nodes are
     * over allocated.
     */
    /* GFP_THISNODE forbids reclaim; the caller is expected to retry
     * with different flags after a GFP_THISNODE failure. */
    if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
        goto nopage;

restart:
    /* wake kswapd so reclaim proceeds in the background */
    wake_all_kswapd(order, zonelist, high_zoneidx);

    /*
     * OK, we're below the kswapd watermark and have kicked background
     * reclaim. Now things get more complex, so set up alloc_flags according
     * to how we want to proceed.
     */
    /* derive internal allocation flags (mainly the watermark level)
     * from the GFP mask */
    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    /* Unlike the fast path, this retry uses a lower watermark: one more
     * attempt before resorting to reclaim. ALLOC_NO_WATERMARKS is
     * always masked out here regardless of the caller's privileges. */
    /* This is the last chance, in general, before the goto nopage. */
    page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
            high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
            preferred_zone, migratetype);
    if (page)   /* success */
        goto got_pg;

rebalance:
    /* Allocate without watermarks if the context allows */
    /* Some contexts (reclaim threads, tasks being killed) may dip
     * below all watermarks. */
    if (alloc_flags & ALLOC_NO_WATERMARKS) {
        page = __alloc_pages_high_priority(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
        if (page)   /* got memory while ignoring watermarks */
            goto got_pg;
    }

    /* Atomic allocations - we can't balance anything */
    /* atomic caller: cannot wait for reclaim, fail with NULL */
    if (!wait)
        goto nopage;

    /* Avoid recursion of direct reclaim */
    /* the caller IS the reclaim path; recursing into reclaim would
     * deadlock */
    if (p->flags & PF_MEMALLOC)
        goto nopage;

    /* Avoid allocations with no watermarks from looping endlessly */
    /*
     * The current thread is being killed and may breach all watermarks.
     * Return NULL to avoid a livelock — unless __GFP_NOFAIL forces us
     * to keep looping until some other thread frees memory.
     */
    if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
        goto nopage;

    /* Try direct reclaim and then allocating */
    /* reclaim memory synchronously, in this allocation context */
    page = __alloc_pages_direct_reclaim(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask,
                    alloc_flags, preferred_zone,
                    migratetype, &did_some_progress);
    if (page)   /* reclaim freed enough to satisfy the request */
        goto got_pg;

    /*
     * If we failed to make any progress reclaiming, then we are
     * running out of options and have to consider going OOM
     */
    if (!did_some_progress) {
        /* Only go OOM when the caller allows filesystem activity and
         * retries: the OOM path (killing tasks, possibly panicking)
         * needs __GFP_FS. */
        if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
            if (oom_killer_disabled)    /* OOM killer disabled: give up */
                goto nopage;
            /* kill another task, then retry the allocation */
            page = __alloc_pages_may_oom(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask, preferred_zone,
                    migratetype);
            if (page)
                goto got_pg;

            /*
             * The OOM killer does not trigger for high-order
             * ~__GFP_NOFAIL allocations so if no progress is being
             * made, there are no other options and retrying is
             * unlikely to help.
             */
            if (order > PAGE_ALLOC_COSTLY_ORDER &&
                        !(gfp_mask & __GFP_NOFAIL))
                goto nopage;

            goto restart;
        }
    }

    /* Check if we should retry the allocation */
    /* reclaim made some progress: decide whether another pass is
     * worthwhile */
    pages_reclaimed += did_some_progress;
    if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
        /* Wait for some write requests to complete then retry */
        congestion_wait(BLK_RW_ASYNC, HZ/50);
        goto rebalance;
    }

nopage:
    /* allocation failed: emit a rate-limited warning */
    if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
        printk(KERN_WARNING "%s: page allocation failure."
            " order:%d, mode:0x%x\n",
            p->comm, order, gfp_mask);
        dump_stack();
        show_mem();
    }
    return page;
got_pg:
    /* success: run kmemcheck instrumentation if enabled */
    if (kmemcheck_enabled)
        kmemcheck_pagealloc_alloc(page, order, gfp_mask);
    return page;
}
總結:Linux伙伴系統主要分配流程為
 
正常分配(或叫快速分配)流程:
 
1,如果分配的是單個頁面,考慮從per CPU緩存中分配空間,如果緩存中沒有頁面,從伙伴系統中提取頁面做補充。
 
2,分配多個頁面時,從指定類型中分配,如果指定類型中沒有足夠的頁面,從備用類型鏈表中分配。最後會試探保留類型鏈表。
 
慢速(允許等待和頁面回收)分配:
 
3,當上面兩種分配方案都不能滿足要求時,考慮頁面回收、殺死進程等操作後再試
Copyright © Linux教程網 All Rights Reserved