Linux物理內存管理區在start_kernel函數中進行初始化,此時啟動分配器已經建立,所以可以從bootmem中分配需要的內存。
一、全局變量初始化
max_pfn:最大物理頁面幀號
start_kernel()->setup_arch()->e820_end_of_ram_pfn()找出最大可用內存頁面幀號。
/* Excerpt of setup_arch(); the "……" lines mark code elided from this listing. */
void __init setup_arch(char **cmdline_p)
{
……
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
/* Store in max_pfn the highest RAM page frame number reported by
e820_end_of_ram_pfn(), which walks e820.map and caps the result at MAX_ARCH_PFN.
NOTE(review): the original note says this stays below 4G - depends on MAX_ARCH_PFN, confirm. */
max_pfn = e820_end_of_ram_pfn();
……
}
/*
 * Return the highest page frame number covered by E820_RAM regions,
 * using MAX_ARCH_PFN as the search limit.
 *
 * NOTE(review): the accompanying article states MAX_ARCH_PFN corresponds
 * to a 4G address space; its value is defined elsewhere - confirm.
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
	const unsigned long ram_end_pfn = e820_end_pfn(MAX_ARCH_PFN, E820_RAM);

	return ram_end_pfn;
}
/*
 * Find the highest page frame number we have available
 *
 * Walks every e820.map entry whose type equals @type and returns the
 * largest end PFN seen.  Entries starting at or above @limit_pfn are
 * ignored; an entry that crosses @limit_pfn ends the search with
 * @limit_pfn as the result.  The final value is additionally clamped to
 * MAX_ARCH_PFN and logged via printk().
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	const unsigned long arch_cap_pfn = MAX_ARCH_PFN;
	unsigned long highest_pfn = 0;
	int idx = 0;

	while (idx < e820.nr_map) {
		struct e820entry *entry = &e820.map[idx++];
		unsigned long first_pfn, past_pfn;

		/* Only regions of the requested type are of interest. */
		if (entry->type != type)
			continue;

		/* PFNs of the region's start address and end address. */
		first_pfn = entry->addr >> PAGE_SHIFT;
		past_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;

		/* Region begins at or beyond the limit: skip it. */
		if (first_pfn >= limit_pfn)
			continue;

		/* Region crosses the limit: the limit itself is the answer. */
		if (past_pfn > limit_pfn) {
			highest_pfn = limit_pfn;
			break;
		}

		/* Remember the largest end PFN seen so far. */
		if (highest_pfn < past_pfn)
			highest_pfn = past_pfn;
	}

	/* Never report more than the architecture can address. */
	if (highest_pfn > arch_cap_pfn)
		highest_pfn = arch_cap_pfn;

	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
	       highest_pfn, arch_cap_pfn);

	return highest_pfn;
}
max_low_pfn:低端內存最大頁面數
start_kernel()->setup_arch()->find_low_pfn_range()
/*
 * Determine low and high memory ranges:
 *
 * Calls highmem_pfn_init() when max_pfn exceeds MAXMEM_PFN, and
 * lowmem_pfn_init() when all of RAM fits below that bound.
 */
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */
	if (max_pfn > MAXMEM_PFN) {
		/* More RAM than MAXMEM_PFN pages: take the highmem path. */
		highmem_pfn_init();
		return;
	}

	/* Everything fits within MAXMEM_PFN pages: lowmem only. */
	lowmem_pfn_init();
}
我們直接看具有高端地址空間的部分。
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account:
 *
 * Sets max_low_pfn to MAXMEM_PFN, derives highmem_pages when it still
 * holds -1, reconciles highmem_pages with max_pfn, and finally trims
 * max_pfn according to the CONFIG_HIGHMEM / CONFIG_HIGHMEM64G options.
 */
void __init highmem_pfn_init(void)
{
	/*
	 * NOTE(review): the accompanying article describes MAXMEM_PFN as
	 * slightly below the commonly quoted 896M of lowmem; its definition
	 * is not visible here - confirm.
	 */
	max_low_pfn = MAXMEM_PFN;

	/* -1 means no explicit value: everything above lowmem is highmem. */
	if (highmem_pages == -1)
		highmem_pages = max_pfn - MAXMEM_PFN;

	/* A smaller explicit highmem size shrinks the usable max_pfn ... */
	if (max_pfn > highmem_pages + MAXMEM_PFN)
		max_pfn = MAXMEM_PFN + highmem_pages;

	/* ... while one larger than what is present is rejected. */
	if (max_pfn < highmem_pages + MAXMEM_PFN) {
		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
		       pages_to_mb(max_pfn - MAXMEM_PFN),
		       pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}

#ifdef CONFIG_HIGHMEM
#ifndef CONFIG_HIGHMEM64G
	/* Without HIGHMEM64G, max_pfn may not exceed MAX_NONPAE_PFN. */
	if (max_pfn > MAX_NONPAE_PFN) {
		max_pfn = MAX_NONPAE_PFN;
		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
	}
#endif /* !CONFIG_HIGHMEM64G */
#else /* !CONFIG_HIGHMEM */
	/* Maximum memory usable is what is directly addressable */
	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
	if (max_pfn > MAX_NONPAE_PFN)
		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
	else
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
	max_pfn = MAXMEM_PFN;
#endif /* CONFIG_HIGHMEM */
}
二、管理區初始化
start_kernel()->setup_arch()->paging_init()->zone_sizes_init()
/*
 * Build the table of per-zone upper PFN bounds and pass it to
 * free_area_init_nodes().
 *
 * ZONE_DMA ends at the physical PFN of MAX_DMA_ADDRESS, ZONE_NORMAL at
 * max_low_pfn and, with CONFIG_HIGHMEM, ZONE_HIGHMEM at highend_pfn.
 * Every other entry stays 0.
 */
static void __init zone_sizes_init(void)
{
	unsigned long zone_limit_pfns[MAX_NR_ZONES];
	unsigned long dma_limit_pfn;

	memset(zone_limit_pfns, 0, sizeof(zone_limit_pfns));

	/* MAX_DMA_ADDRESS is a virtual address; convert it to a PFN. */
	dma_limit_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;

	zone_limit_pfns[ZONE_DMA] = dma_limit_pfn;
	zone_limit_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	zone_limit_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

	/* Initialise all nodes and their zones from these bounds. */
	free_area_init_nodes(zone_limit_pfns);
}
其中在x86-32下,MAX_DMA_ADDRESS為虛擬地址PAGE_OFFSET+16M(PAGE_OFFSET取默認值3G時即3G+16M),經virt_to_phys()轉換後對應物理地址16M,所以ZONE_DMA的上限為16M對應的頁面幀號。
/*
 * The maximum address that we can perform a DMA transfer to on this platform:
 * 0x1000000 (16 MiB) above PAGE_OFFSET.
 */
#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by add_active_range(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long nid;
int i;
/* Sort early_node_map as initialisation assumes it is sorted */
/* The active regions are sorted before any boundary is computed */
sort_node_map();
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
/* Zone 0 starts at the lowest PFN found among the active regions */
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];/* zones are assumed contiguous: each one starts where the previous one ended */
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
/* ZONE_MOVABLE gets no arch-defined range: both bounds are set to 0 */
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes(zone_movable_pfn);/* fills zone_movable_pfn[], indexed by node; entries left at 0 are not printed below */
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
printk(" %-8s %0#10lx -> %0#10lx\n",
zone_names[i],
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
printk("Movable zone start PFN for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
}
/* Print out the early_node_map[] */
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
for (i = 0; i < nr_nodemap_entries; i++)
printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
early_node_map[i].start_pfn,
early_node_map[i].end_pfn);
/* Initialise every node */
/* NOTE(review): presumably a debug-time layout verification, judging by its name - confirm */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
/* Initialise this node's pg_data_t and zones; the node's start PFN
comes from find_min_pfn_for_node(). NOTE(review): per the original note the
zones hold no free pages yet and are populated later in mem_init() - confirm. */
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_HIGH_MEMORY);
/* NOTE(review): memory-related check per the original note; its semantics are defined elsewhere */
check_for_regular_memory(pgdat);
}
}
/*
 * Initialise the pg_data_t of node @nid together with all of its zones.
 *
 * @nid:            node to initialise
 * @zones_size:     forwarded to calculate_node_totalpages() and
 *                  free_area_init_core()
 * @node_start_pfn: first PFN of the node, stored in the pg_data_t
 * @zholes_size:    forwarded alongside @zones_size
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *node = NODE_DATA(nid);

	/* Record the node's identity and where it starts. */
	node->node_start_pfn = node_start_pfn;
	node->node_id = nid;

	/* Work out the page totals of this node. */
	calculate_node_totalpages(node, zones_size, zholes_size);

	/*
	 * Set up the node's mem_map.
	 * NOTE(review): per the accompanying article, on a single-node system
	 * this map is also published through a global variable - confirm.
	 */
	alloc_node_mem_map(node);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
	       nid, (unsigned long)node,
	       (unsigned long)node->node_mem_map);
#endif

	/* Initialise the per-zone data of this node. */
	free_area_init_core(node, zones_size, zholes_size);
}
具體的區域的初始化在下面函數進行
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * @pgdat:       node descriptor whose zones are initialised
 * @zones_size:  forwarded to zone_spanned_pages_in_node()
 * @zholes_size: forwarded to zone_absent_pages_in_node()
 *
 * For every zone index the spanned and present page counts are computed,
 * the zone's locks, LRU lists and statistics are reset, and - for zones
 * that span at least one page - the usemap, the (still empty) zone and
 * the memmap are initialised.
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type j;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;

	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;
	pgdat_page_cgroup_init(pgdat);

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize, memmap_pages;
		enum lru_list l;

		/* Spanned size of the zone, and that size minus its holes. */
		size = zone_spanned_pages_in_node(nid, j, zones_size);
		realsize = size - zone_absent_pages_in_node(nid, j,
							    zholes_size);

		/*
		 * Adjust realsize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		/* Pages needed to hold one struct page per spanned page. */
		memmap_pages =
			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
		if (realsize >= memmap_pages) {
			realsize -= memmap_pages;
			if (memmap_pages)
				printk(KERN_DEBUG
				       " %s zone: %lu pages used for memmap\n",
				       zone_names[j], memmap_pages);
		} else
			printk(KERN_WARNING
			       " %s zone: %lu pages exceeds realsize %lu\n",
			       zone_names[j], memmap_pages, realsize);

		/* Account for reserved pages */
		if (j == 0 && realsize > dma_reserve) {
			/* Subtract the pages reserved in zone 0. */
			realsize -= dma_reserve;
			printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
			       zone_names[0], dma_reserve);
		}

		/* Only non-highmem zones count towards nr_kernel_pages. */
		if (!is_highmem_idx(j))
			nr_kernel_pages += realsize;
		nr_all_pages += realsize;

		/* Fill in the basic fields of the zone descriptor. */
		zone->spanned_pages = size;
		zone->present_pages = realsize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone->prev_priority = DEF_PRIORITY;

		zone_pcp_init(zone);
		/* Start with empty LRU lists. */
		for_each_lru(l) {
			INIT_LIST_HEAD(&zone->lru[l].list);
			zone->reclaim_stat.nr_saved_scan[l] = 0;
		}
		zone->reclaim_stat.recent_rotated[0] = 0;
		zone->reclaim_stat.recent_rotated[1] = 0;
		zone->reclaim_stat.recent_scanned[0] = 0;
		zone->reclaim_stat.recent_scanned[1] = 0;
		/* NOTE(review): presumably zeroes zone->vm_stat - confirm. */
		zap_zone_vm_stats(zone);
		zone->flags = 0;

		/* A zone that spans no pages needs nothing further. */
		if (!size)
			continue;

		/*
		 * NOTE(review): per the accompanying article this only has an
		 * effect when a config macro is defined, which was not the case
		 * in the version discussed - confirm.
		 */
		set_pageblock_order(pageblock_default_order());
		/*
		 * Set up the zone's usemap (pageblock_flags).
		 * NOTE(review): per the accompanying article the memory comes
		 * from the boot-time allocator - confirm.
		 */
		setup_usemap(pgdat, zone, size);
		/*
		 * Initialise the still-empty zone starting at zone_start_pfn.
		 * NOTE(review): per the accompanying article this covers the
		 * zone's wait queues and its MAX_ORDER free lists - confirm.
		 */
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);
		BUG_ON(ret);
		/* Initialise the struct page entries belonging to this zone. */
		memmap_init(size, nid, j, zone_start_pfn);
		zone_start_pfn += size;
	}
}
三、分配內存的備用區域初始化(非CONFIG_NUMA)
數據結構表示
/*
 * One entry of a zonelist: the zone itself together with its cached
 * zone index, so that readers need not dereference the zone to get it.
 */
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
* If zlcache_ptr is not NULL, then it is just the address of zlcache,
* as explained above. If zlcache_ptr is NULL, there is no zlcache.
*
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
*
* zonelist_zone() - Return the struct zone * for an entry in _zonerefs
* zonelist_zone_idx() - Return the index of the zone for an entry
* zonelist_node_idx() - Return the index of the node for an entry
*/// Zone allocation scheme: the ordered list of zones an allocation tries.
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
struct zonelist_cache zlcache; // optional ...
#endif
};
代碼中的英文注釋很詳細了
初始化
start_kernel()->build_all_zonelists()
/*
 * Build the zonelists of every online node and refresh the values
 * derived from them (vm_total_pages, page_group_by_mobility_disabled).
 *
 * While the system is still booting the lists are built directly;
 * otherwise the rebuild runs under stop_machine().
 */
void build_all_zonelists(void)
{
	/* Decide on the zonelist ordering first. */
	set_zonelist_order();

	if (system_state != SYSTEM_BOOTING) {
		/* we have to stop all cpus to guarantee there is no user
		   of zonelist */
		stop_machine(__build_all_zonelists, NULL, NULL);
		/* cpuset refresh routine should be here */
	} else {
		/* Build the zonelists of all nodes. */
		__build_all_zonelists(NULL);
		/* NOTE(review): presumably a debug-time verification - confirm. */
		mminit_verify_zonelist();
		cpuset_init_current_mems_allowed();
	}

	/* Page total as reported by nr_free_pagecache_pages(). */
	vm_total_pages = nr_free_pagecache_pages();

	/*
	 * Disable grouping by mobility if the number of pages in the
	 * system is too low to allow the mechanism to work. It would be
	 * more accurate, but expensive to check per-zone. This check is
	 * made on memory-hotadd so a system can start with mobility
	 * disabled and enable it later
	 */
	page_group_by_mobility_disabled =
		(vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) ? 1 : 0;

	printk("Built %i zonelists in %s order, mobility grouping %s. "
	       "Total pages: %ld\n",
	       nr_online_nodes,
	       zonelist_order_name[current_zonelist_order],
	       page_group_by_mobility_disabled ? "off" : "on",
	       vm_total_pages);
#ifdef CONFIG_NUMA
	printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/*
 * Build the zonelist and the zonelist cache of every online node.
 *
 * @dummy: unused.
 *
 * return values int ....just for stop_machine()
 * Always returns 0.
 */
static int __build_all_zonelists(void *dummy)
{
	int nid;

#ifdef CONFIG_NUMA
	memset(node_load, 0, sizeof(node_load));
#endif

	for_each_online_node(nid) {
		pg_data_t *node = NODE_DATA(nid);

		/* Fallback list used when allocating from this node. */
		build_zonelists(node);
		/*
		 * NOTE(review): per the accompanying article, on UMA this
		 * merely sets the cache-related pointers to NULL - confirm.
		 */
		build_zonelist_cache(node);
	}

	return 0;
}
/*
 * Build node_zonelists[0] of @pgdat: first the zones of the node itself,
 * then those of every other online node, visited in the order
 * local_node + 1 .. MAX_NUMNODES - 1 followed by 0 .. local_node - 1.
 * The list is terminated by an entry whose zone pointer is NULL.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	const int local_node = pgdat->node_id;
	struct zonelist *zonelist = &pgdat->node_zonelists[0];
	struct zoneref *terminator;
	enum zone_type filled;
	int node;

	/* The node's own zones come first. */
	filled = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

	/*
	 * Now we build the zonelist so that it contains the zones
	 * of all the other nodes.
	 * We don't want to pressure a particular node, so when
	 * building the zones for node N, we make sure that the
	 * zones coming right after the local ones are those from
	 * node N+1 (modulo N)
	 */
	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
		if (node_online(node))
			filled = build_zonelists_node(NODE_DATA(node), zonelist,
						      filled, MAX_NR_ZONES - 1);
	}
	for (node = 0; node < local_node; node++) {
		if (node_online(node))
			filled = build_zonelists_node(NODE_DATA(node), zonelist,
						      filled, MAX_NR_ZONES - 1);
	}

	/* Terminate the list. */
	terminator = &zonelist->_zonerefs[filled];
	terminator->zone = NULL;
	terminator->zone_idx = 0;
}
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 *
 * Zones are visited from index @zone_type down to 0; each populated one
 * is stored at position @nr_zones, which is then advanced.  Returns the
 * updated @nr_zones.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
				int nr_zones, enum zone_type zone_type)
{
	enum zone_type idx = zone_type;

	BUG_ON(zone_type >= MAX_NR_ZONES);

	for (;;) {
		struct zone *candidate = &pgdat->node_zones[idx];

		/* Zones for which populated_zone() is false are skipped. */
		if (populated_zone(candidate)) {
			zoneref_set_zone(candidate,
					 &zonelist->_zonerefs[nr_zones]);
			nr_zones++;
			check_highest_zone(idx);
		}

		/* Index 0 is the last zone to look at. */
		if (idx == 0)
			break;
		idx--;
	}

	return nr_zones;
}
內存管理區初始化主要是借助於啟動分配器和已初始化的e820全局變量。內存管理區初始化後,相應的伙伴系統、slab機制等等就可以在此基礎上建立了,在後面會一點一點總結。
bullbat的專欄