歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
您现在的位置: Linux教程網 >> UnixLinux >  >> Linux基礎 >> 關於Linux

linux物理內存管理區初始化

Linux物理內存管理區在start_kernel函數中進行初始化,此時啟動分配器已經建立,所以可以從bootmem中分配需要的內存。
 
一、全局變量初始化
 
max_pfn:最大物理頁面幀號
 
start_kernel()->setup_arch()->e820_end_of_ram_pfn()找出最大可用內存頁面幀號。
 
 
/*
 * Excerpt of setup_arch(): only the statement that establishes max_pfn is
 * shown here; everything else in the function is elided.
 */
void __init setup_arch(char **cmdline_p)
{
    /* ... earlier part of setup_arch() elided ... */

    /*
     * partially used pages are not usable - thus
     * we are rounding upwards:
     */
    /*
     * e820_end_of_ram_pfn() walks e820.map and returns the highest RAM
     * page frame number, capped at MAX_ARCH_PFN.
     */
    max_pfn = e820_end_of_ram_pfn();

    /* ... remainder of setup_arch() elided ... */
}
 
/*
 * Return the highest page frame number of any E820_RAM entry, using
 * MAX_ARCH_PFN as the search limit.
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
    return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
 
/*
 * Find the highest page frame number we have available
 *
 * @limit_pfn: entries starting at or above this PFN are skipped; an entry
 *             that crosses it ends the scan with the result set to limit_pfn
 * @type:      e820 entry type to look at; entries of other types are ignored
 *
 * Returns the largest end PFN among the matching entries, clamped to
 * MAX_ARCH_PFN, or 0 when no entry matches.
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
    int i;
    unsigned long last_pfn = 0;
    unsigned long max_arch_pfn = MAX_ARCH_PFN;

    /* Walk every memory range recorded in the e820 map. */
    for (i = 0; i < e820.nr_map; i++) {
        struct e820entry *ei = &e820.map[i];
        unsigned long start_pfn;
        unsigned long end_pfn;

        /* Not the type we are looking for. */
        if (ei->type != type)
            continue;

        /* PFN of the first byte of the range. */
        start_pfn = ei->addr >> PAGE_SHIFT;
        /* PFN just past the end of the range. */
        end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;

        if (start_pfn >= limit_pfn)
            continue;
        if (end_pfn > limit_pfn) {
            /* The range crosses the limit: the limit is the answer. */
            last_pfn = limit_pfn;
            break;
        }
        /* Remember the highest end PFN seen so far. */
        if (end_pfn > last_pfn)
            last_pfn = end_pfn;
    }

    /* Never report more than the architecture can address. */
    if (last_pfn > max_arch_pfn)
        last_pfn = max_arch_pfn;

    printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
             last_pfn, max_arch_pfn);

    return last_pfn;
}
max_low_pfn:低端內存最大頁面數
 
start_kernel()->setup_arch()->find_low_pfn_range()
 
 
/*
 * Determine low and high memory ranges:
 *
 * Picks the lowmem-only or the highmem initialisation path depending on
 * whether max_pfn fits below MAXMEM_PFN.
 */
void __init find_low_pfn_range(void)
{
    /* it could update max_pfn */

    if (max_pfn <= MAXMEM_PFN) {
        /* All of RAM fits below MAXMEM_PFN: no highmem is needed. */
        lowmem_pfn_init();
    } else {
        /* More RAM than fits into lowmem: set up the highmem range. */
        highmem_pfn_init();
    }
}
我們直接看具有高端地址空間的部分。
 
 
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account:
 *
 * Sets max_low_pfn, and may adjust highmem_pages and max_pfn so that the
 * three values are consistent with each other and with the kernel
 * configuration.
 */
void __init highmem_pfn_init(void)
{
    /*
     * Lowmem ends at MAXMEM_PFN.
     * NOTE(review): the article states that MAXMEM excludes roughly
     * 4M+4M+8K+128M of reserved address space, so lowmem is slightly
     * smaller than the commonly quoted 896M - verify against the MAXMEM
     * definition for the kernel version in use.
     */
    max_low_pfn = MAXMEM_PFN;

    /* -1 is the "not configured" sentinel: use everything above lowmem. */
    if (highmem_pages == -1)
        highmem_pages = max_pfn - MAXMEM_PFN;

    /*
     * highmem_pages may have been set explicitly and disagree with
     * max_pfn. If it asks for less than is available, shrink max_pfn.
     */
    if (highmem_pages + MAXMEM_PFN < max_pfn)
        max_pfn = MAXMEM_PFN + highmem_pages;

    /* If it asks for more than is available, warn and drop highmem. */
    if (highmem_pages + MAXMEM_PFN > max_pfn) {
        printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
            pages_to_mb(max_pfn - MAXMEM_PFN),
            pages_to_mb(highmem_pages));
        highmem_pages = 0;
    }
#ifndef CONFIG_HIGHMEM
    /* Maximum memory usable is what is directly addressable */
    printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
    if (max_pfn > MAX_NONPAE_PFN)
        printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
    else
        printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
    max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
    /* Without HIGHMEM64G, memory above MAX_NONPAE_PFN is trimmed. */
    if (max_pfn > MAX_NONPAE_PFN) {
        max_pfn = MAX_NONPAE_PFN;
        printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
    }
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
二、管理區初始化
 
start_kernel()->setup_arch()->paging_init()->zone_sizes_init()
 
 
/*
 * Fill in the highest PFN of each zone and hand the table to
 * free_area_init_nodes() for the actual zone initialisation.
 */
static void __init zone_sizes_init(void)
{
    unsigned long max_zone_pfns[MAX_NR_ZONES];

    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

    /* ZONE_DMA ends at the physical address behind MAX_DMA_ADDRESS. */
    max_zone_pfns[ZONE_DMA] =
        virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
    /* ZONE_NORMAL ends where lowmem ends. */
    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
    max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

    free_area_init_nodes(max_zone_pfns);
}
其中,在x86-32非PAE配置下,MAX_DMA_ADDRESS為PAGE_OFFSET(3G)加上16M的虛擬地址,經virt_to_phys()轉換後即對應物理內存的16M處。
 
 
/*
 * The maximum address that we can perform a DMA transfer to on this platform.
 * Defined as PAGE_OFFSET plus 0x1000000 (16 MiB); zone_sizes_init() runs it
 * through virt_to_phys() to obtain the upper PFN of ZONE_DMA.
 */
#define MAX_DMA_ADDRESS      (PAGE_OFFSET + 0x1000000) 
 
/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
    unsigned long nid;
    int i;

    /* Sort early_node_map as initialisation assumes it is sorted */
    sort_node_map();

    /* Record where the zone boundaries are */
    memset(arch_zone_lowest_possible_pfn, 0,
                sizeof(arch_zone_lowest_possible_pfn));
    memset(arch_zone_highest_possible_pfn, 0,
                sizeof(arch_zone_highest_possible_pfn));

    /* Zone 0 starts at the lowest PFN of any active region. */
    arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
    arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
    for (i = 1; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        /* Zones are contiguous: each starts where the previous one ends. */
        arch_zone_lowest_possible_pfn[i] =
            arch_zone_highest_possible_pfn[i-1];
        /* An empty zone collapses to lowest == highest. */
        arch_zone_highest_possible_pfn[i] =
            max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
    }
    /* ZONE_MOVABLE gets no architectural range here; both bounds are 0. */
    arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
    arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

    /* Find the PFNs that ZONE_MOVABLE begins at in each node */
    memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    find_zone_movable_pfns_for_nodes(zone_movable_pfn);

    /* Print out the zone ranges */
    printk("Zone PFN ranges:\n");
    for (i = 0; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        printk("  %-8s %0#10lx -> %0#10lx\n",
                zone_names[i],
                arch_zone_lowest_possible_pfn[i],
                arch_zone_highest_possible_pfn[i]);
    }

    /* Print out the PFNs ZONE_MOVABLE begins at in each node */
    printk("Movable zone start PFN for each node\n");
    for (i = 0; i < MAX_NUMNODES; i++) {
        if (zone_movable_pfn[i])
            printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
    }

    /* Print out the early_node_map[] */
    printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
    for (i = 0; i < nr_nodemap_entries; i++)
        printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
                        early_node_map[i].start_pfn,
                        early_node_map[i].end_pfn);

    /* Initialise every node */
    /* NOTE(review): the article describes this call as debug-only. */
    mminit_verify_pageflags_layout();
    setup_nr_node_ids();

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        /*
         * Initialise the node's pg_data_t and zones, starting at the
         * node's lowest PFN. NOTE(review): per the article, the zones
         * hold no free pages yet at this point; pages are released to
         * them later from mem_init() - confirm against the caller chain.
         */
        free_area_init_node(nid, NULL,
                find_min_pfn_for_node(nid), NULL);

        /* Any memory on that node */
        if (pgdat->node_present_pages)
            node_set_state(nid, N_HIGH_MEMORY);
        check_for_regular_memory(pgdat);
    }
}
 
/*
 * Initialise one node: record its id and first PFN, compute its page
 * totals, set up its memory map, then initialise each of its zones.
 *
 * @nid:            node id
 * @zones_size:     per-zone sizes, or NULL (as passed by
 *                  free_area_init_nodes())
 * @node_start_pfn: first PFN of the node
 * @zholes_size:    per-zone hole sizes, or NULL
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    pgdat->node_id = nid;
    pgdat->node_start_pfn = node_start_pfn;

    /* Work out the node's page totals and store them in pgdat. */
    calculate_node_totalpages(pgdat, zones_size, zholes_size);

    /*
     * Set up the node's mem_map. NOTE(review): per the article, on a
     * single-node system this map is also published through a global
     * variable - confirm in alloc_node_mem_map().
     */
    alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
        nid, (unsigned long)pgdat,
        (unsigned long)pgdat->node_mem_map);
#endif

    /* Initialise the per-zone structures of this node. */
    free_area_init_core(pgdat, zones_size, zholes_size);
}
具體的區域的初始化在下面函數進行
 
 
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * @pgdat:       node whose zones are initialised
 * @zones_size:  per-zone sizes, forwarded to zone_spanned_pages_in_node()
 * @zholes_size: per-zone hole sizes, forwarded to
 *               zone_absent_pages_in_node()
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        unsigned long *zones_size, unsigned long *zholes_size)
{
    enum zone_type j;
    int nid = pgdat->node_id;
    unsigned long zone_start_pfn = pgdat->node_start_pfn;
    int ret;

    pgdat_resize_init(pgdat);
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, realsize, memmap_pages;
        enum lru_list l;

        /* size = pages spanned by the zone; realsize = size minus holes. */
        size = zone_spanned_pages_in_node(nid, j, zones_size);
        realsize = size - zone_absent_pages_in_node(nid, j,
                                zholes_size);

        /*
         * Adjust realsize so that it accounts for how much memory
         * is used by this zone for memmap. This affects the watermark
         * and per-cpu initialisations
         */
        /* Pages needed to hold the struct page array for this zone. */
        memmap_pages =
            PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
        if (realsize >= memmap_pages) {
            realsize -= memmap_pages;
            if (memmap_pages)
                printk(KERN_DEBUG
                       "  %s zone: %lu pages used for memmap\n",
                       zone_names[j], memmap_pages);
        } else
            printk(KERN_WARNING
                "  %s zone: %lu pages exceeds realsize %lu\n",
                zone_names[j], memmap_pages, realsize);

        /* Account for reserved pages */
        if (j == 0 && realsize > dma_reserve) {
            /* Subtract the pages reserved in zone 0. */
            realsize -= dma_reserve;
            printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                    zone_names[0], dma_reserve);
        }

        /* Only non-highmem zones count towards nr_kernel_pages. */
        if (!is_highmem_idx(j))
            nr_kernel_pages += realsize;
        nr_all_pages += realsize;

        /* Initialise the fields of struct zone. */
        zone->spanned_pages = size;
        zone->present_pages = realsize;
#ifdef CONFIG_NUMA
        zone->node = nid;
        zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                        / 100;
        zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
        zone->name = zone_names[j];
        spin_lock_init(&zone->lock);
        spin_lock_init(&zone->lru_lock);
        zone_seqlock_init(zone);
        zone->zone_pgdat = pgdat;

        zone->prev_priority = DEF_PRIORITY;

        zone_pcp_init(zone);
        /* Initialise every LRU list head and its saved-scan counter. */
        for_each_lru(l) {
            INIT_LIST_HEAD(&zone->lru[l].list);
            zone->reclaim_stat.nr_saved_scan[l] = 0;
        }
        zone->reclaim_stat.recent_rotated[0] = 0;
        zone->reclaim_stat.recent_rotated[1] = 0;
        zone->reclaim_stat.recent_scanned[0] = 0;
        zone->reclaim_stat.recent_scanned[1] = 0;
        /* NOTE(review): per the article this zeroes zone->vm_stat. */
        zap_zone_vm_stats(zone);
        zone->flags = 0;

        /* Nothing further to set up for a zone that spans no pages. */
        if (!size)
            continue;

        /*
         * NOTE(review): the article says this only has an effect when a
         * config macro is defined, which was not the case in the kernel
         * version it examined.
         */
        set_pageblock_order(pageblock_default_order());
        /*
         * NOTE(review): per the article, this allocates the zone's
         * pageblock_flags from the boot-time allocator.
         */
        setup_usemap(pgdat, zone, size);
        /*
         * NOTE(review): per the article, this sets up the zone's wait
         * queues and the MAX_ORDER free lists of the buddy allocator.
         */
        ret = init_currently_empty_zone(zone, zone_start_pfn,
                        size, MEMMAP_EARLY);
        BUG_ON(ret);
        /* Initialise the struct page entries belonging to this zone. */
        memmap_init(size, nid, j, zone_start_pfn);
        /* The next zone starts right after this one. */
        zone_start_pfn += size;
    }
}
三、分配內存的備用區域初始化(非CONFIG_NUMA)
 
數據結構表示
 
 
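/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
    struct zone *zone;  /* Pointer to actual zone */
    int zone_idx;       /* zone_idx(zoneref->zone) */
};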
 
/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()  - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()  - Return the index of the zone for an entry
 * zonelist_node_idx()  - Return the index of the node for an entry
 *
 * In short: this structure describes the zone allocation (fallback) scheme.
 */
struct zonelist { 
    struct zonelist_cache *zlcache_ptr;          // NULL or &zlcache 
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; 
#ifdef CONFIG_NUMA 
    struct zonelist_cache zlcache;               // optional ... 
#endif 
}; 
代碼中的英文注釋很詳細了
 
初始化
 
start_kernel()->build_all_zonelists()
 
 
void build_all_zonelists(void) 

    /*設置全局變量current_zonelist_order*/ 
    set_zonelist_order(); 
 
    if (system_state == SYSTEM_BOOTING) { 
    /*對所有節點創建zonelists*/ 
        __build_all_zonelists(NULL); 
    /*調試用*/ 
        mminit_verify_zonelist(); 
        cpuset_init_current_mems_allowed(); 
    } else { 
        /* we have to stop all cpus to guarantee there is no user
           of zonelist */ 
        stop_machine(__build_all_zonelists, NULL, NULL); 
        /* cpuset refresh routine should be here */ 
    } 
    /*計算所有zone中可分配的頁面數之和*/ 
    vm_total_pages = nr_free_pagecache_pages(); 
    /*
     * Disable grouping by mobility if the number of pages in the
     * system is too low to allow the mechanism to work. It would be
     * more accurate, but expensive to check per-zone. This check is
     * made on memory-hotadd so a system can start with mobility
     * disabled and enable it later
     */ 
    if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 
        page_group_by_mobility_disabled = 1; 
    else 
        page_group_by_mobility_disabled = 0; 
 
    printk("Built %i zonelists in %s order, mobility grouping %s.  " 
        "Total pages: %ld\n", 
            nr_online_nodes, 
            zonelist_order_name[current_zonelist_order], 
            page_group_by_mobility_disabled ? "off" : "on", 
            vm_total_pages); 
#ifdef CONFIG_NUMA 
    printk("Policy zone: %s\n", zone_names[policy_zone]); 
#endif 

 
/* return values int ....just for stop_machine() */ 
static int __build_all_zonelists(void *dummy) 

    int nid; 
 
#ifdef CONFIG_NUMA 
    memset(node_load, 0, sizeof(node_load)); 
#endif 
    for_each_online_node(nid) { 
        pg_data_t *pgdat = NODE_DATA(nid); 
    /*創建zonelists,這個數組用來在分配內存中坐回繞,循環訪問*/ 
        build_zonelists(pgdat); 
    /*在UMA中,這個僅僅是把相關的變量設置成了NULL*/ 
        build_zonelist_cache(pgdat); 
    } 
    return 0; 

 
/*
 * Build node_zonelists[0] of @pgdat: first the node's own zones, then the
 * zones of every other online node, starting with the nodes numbered above
 * the local one and wrapping around to those numbered below it. The list
 * is terminated by an entry whose zone pointer is NULL.
 */
static void build_zonelists(pg_data_t *pgdat)
{
    int node, local_node;
    enum zone_type j;
    struct zonelist *zonelist;

    local_node = pgdat->node_id;

    zonelist = &pgdat->node_zonelists[0];
    /* Local zones go first; j becomes the next free slot in _zonerefs. */
    j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

    /*
     * Now we build the zonelist so that it contains the zones
     * of all the other nodes.
     * We don't want to pressure a particular node, so when
     * building the zones for node N, we make sure that the
     * zones coming right after the local ones are those from
     * node N+1 (modulo N)
     */
    for (node = local_node + 1; node < MAX_NUMNODES; node++) {
        if (!node_online(node))
            continue;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                            MAX_NR_ZONES - 1);
    }
    for (node = 0; node < local_node; node++) {
        if (!node_online(node))
            continue;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                            MAX_NR_ZONES - 1);
    }

    /* Terminate the list. */
    zonelist->_zonerefs[j].zone = NULL;
    zonelist->_zonerefs[j].zone_idx = 0;
}
 
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 *
 * @pgdat:     node whose zones are added
 * @zonelist:  list being filled
 * @nr_zones:  index of the first free slot in zonelist->_zonerefs
 * @zone_type: highest zone type to consider; zones are visited from this
 *             type down to type 0
 *
 * Returns the index of the next free slot after the added entries.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                int nr_zones, enum zone_type zone_type)
{
    struct zone *zone;

    BUG_ON(zone_type >= MAX_NR_ZONES);
    /* Pre-increment so the loop's leading decrement starts at zone_type. */
    zone_type++;

    do {
        zone_type--;
        zone = pgdat->node_zones + zone_type;
        /* Only zones that pass populated_zone() are added. */
        if (populated_zone(zone)) {
            zoneref_set_zone(zone,
                &zonelist->_zonerefs[nr_zones++]);
            check_highest_zone(zone_type);
        }

    } while (zone_type);
    return nr_zones;
}
內存管理區初始化主要是借助於啟動分配器和已初始化的e820全局變量。內存管理區初始化後,相應的伙伴系統、slab機制等等就可以在此基礎上建立了,在後面會一點一點總結。

bullbat的專欄
Copyright © Linux教程網 All Rights Reserved