linux啟動內存分配器是在伙伴系統、slab機制實現之前,為滿足內核中內存的分配而建立的。本身的機制比較簡單,使用位圖來進行標志分配和釋放。
一、數據結構介紹
1,保留區間
因為在建立啟動內存分配器的時候,會涉及保留內存。也就是說,之前保留給頁表、分配器本身(用於映射的位圖)、I/O等的內存,在分配器建立後用它來分配內存空間時,就不能再被分配了。linux中對保留內存空間的部分用下列數據結構表示:
/*
 * Early reserved memory areas.
 */
#define MAX_EARLY_RES 20	/* number of slots in the early_res[] table */

/*
 * One early reservation: a start/end address pair plus a short label
 * that is printed when the reservation is handed over to bootmem.
 */
struct early_res {
	u64 start;
	u64 end;
	char name[16];
	/* NOTE(review): not read by any code shown here; presumably marks
	 * a reservation that may overlap another one - confirm. */
	char overlap_ok;
};

/*
 * Global table of early reservations.  The used part of the table ends
 * at the first entry whose 'end' is zero.  The first page is reserved
 * up front for the BIOS data page.
 */
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
	{ .start = 0, .end = PAGE_SIZE, .name = "BIOS data page" },
	{}
};
2,bootmem分配器
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
*/
/*
* Per-node state of the bootmem allocator.  In the bitmap a set bit means
* "reserved / allocated" and a clear bit means "free".
*/
typedef struct bootmem_data {
/* First page frame number covered by this node; bitmap index 0 maps to it. */
unsigned long node_min_pfn;
/* End page frame number of the node (exclusive upper bound for marking). */
unsigned long node_low_pfn;
/* Virtual address of the bitmap, one bit per page frame. */
void *node_bootmem_map;
/* Byte offset, relative to the node start, at which the previous allocation ended. */
unsigned long last_end_off;
/* Bitmap index at which the next search starts: first page after the previous allocation, lowered when pages are freed. */
unsigned long hint_idx;
/* Links this node into the global bdata_list. */
struct list_head list;
} bootmem_data_t;
全局鏈表
/* Global list of all registered bootmem_data_t; link_bootmem() keeps it sorted by ascending node_min_pfn. */
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
二、啟動分配器的建立
啟動分配器的建立主要的流程為初始化映射位圖、活動內存區的映射位置0(表示可用)、保留內存區域處理,其中保留區存放在上面介紹的全局數組中,這裡只是將分配器中對應映射位圖值1,表示已經分配。
下面我們看內核中具體的初始化流程。
start_kernel()->setup_arch()->initmem_init()
/*
 * Excerpt of setup_arch(): only the call that starts the bootmem setup is
 * shown, everything else in the function is elided.
 */
void __init setup_arch(char **cmdline_p)
{
	/* ... */

	/*
	 * initmem_init() does the preparatory work and then calls the
	 * function that sets up the bootmem allocator.
	 */
	initmem_init(0, max_pfn);

	/* ... */
}
<span style="font-family: Arial, Verdana, sans-serif; "><span style="white-space: normal; "></span></span>
<span style="font-family: Arial, Verdana, sans-serif; "><span style="white-space: normal; "></span></span><pre name="code" class="cpp">void __init initmem_init(unsigned long start_pfn,
unsigned long end_pfn)
{
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
/*將活動內存放到early_node_map中,前面已經分析過了*/
e820_register_active_regions(0, 0, highend_pfn);
/*設置上面變量中的內存為當前,在這裡沒有
設置相關的宏*/
sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
/*高端內存開始地址物理*/
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
e820_register_active_regions(0, 0, max_low_pfn);
sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
__vmalloc_start_set = true;
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
/*安裝bootmem分配器,此分配器在伙伴系統起來之前
用來進行承擔內存的分配等管理*/
setup_bootmem_allocator();
}
/*
 * setup_bootmem_allocator - find room for the bootmem bitmap, reserve it
 * and set up the allocator on every online node (low memory only).
 */
void __init setup_bootmem_allocator(void)
{
	unsigned long map_bytes;
	unsigned long map_addr;
	int nid;

	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	/* Size of the bitmap for all low-memory pages, in whole pages, as bytes. */
	map_bytes = bootmem_bootmap_pages(max_low_pfn) << PAGE_SHIFT;

	/* Ask the e820 map for a page-aligned area inside the mapped range. */
	map_addr = find_e820_area(0, max_pfn_mapped << PAGE_SHIFT, map_bytes,
				  PAGE_SIZE);
	if (map_addr == -1L)
		panic("Cannot find bootmem map of size %ld\n", map_bytes);

	/* Keep the pages that hold the bitmap out of later allocations. */
	reserve_early(map_addr, map_addr + map_bytes, "BOOTMAP");

	printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
	       max_pfn_mapped << PAGE_SHIFT);
	printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn << PAGE_SHIFT);

	for_each_online_node(nid) {
		unsigned long first_pfn;
		unsigned long limit_pfn;

#ifdef CONFIG_NEED_MULTIPLE_NODES	/* not set in the configuration the article discusses */
		first_pfn = node_start_pfn[nid];
		limit_pfn = node_end_pfn[nid];
		/* Nodes that start above low memory are skipped entirely. */
		if (first_pfn > max_low_pfn)
			continue;
		/* Clamp the node to low memory. */
		if (limit_pfn > max_low_pfn)
			limit_pfn = max_low_pfn;
#else
		first_pfn = 0;
		limit_pfn = max_low_pfn;
#endif
		/*
		 * Set up this node; the return value is where the next
		 * node's bitmap may start.
		 */
		map_addr = setup_node_bootmem(nid, first_pfn, limit_pfn,
					      map_addr);
	}

	/* The bootmem allocator is now fully set up; flag it. */
	after_bootmem = 1;
}
/*
 * setup_node_bootmem - set up the bootmem allocator of one node.
 * @nodeid:    node to set up
 * @start_pfn: first page frame of the node's low memory
 * @end_pfn:   end page frame of the node's low memory
 * @bootmap:   physical address at which this node's bitmap is placed
 *
 * Returns the address just past this node's bitmap, i.e. where the next
 * node's bitmap can be placed.
 */
static unsigned long __init setup_node_bootmem(int nodeid,
					       unsigned long start_pfn,
					       unsigned long end_pfn,
					       unsigned long bootmap)
{
	unsigned long map_bytes;
	unsigned long next_map;

	/* don't touch min_low_pfn */
	/* Initialise the bitmap; every page starts out as reserved. */
	map_bytes = init_bootmem_node(NODE_DATA(nodeid),
				      bootmap >> PAGE_SHIFT,
				      start_pfn, end_pfn);
	next_map = bootmap + map_bytes;

	printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
	       nodeid, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
	printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
	       nodeid, bootmap, next_map);

	/* Clear the bits of the active memory regions: they become allocatable. */
	free_bootmem_with_active_regions(nodeid, end_pfn);

	/* Set the bits of the early reservations again: they must not be handed out. */
	early_res_to_bootmem(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);

	return next_map;
}
對於初始化映射位圖,最終調用init_bootmem_core()
/*
 * Called once to set up the allocator itself.
 *
 * @bdata:    per-node bootmem descriptor to initialise
 * @mapstart: page frame number at which the bitmap is stored
 * @start:    first page frame covered by the node
 * @end:      end page frame covered by the node
 *
 * Returns the size of the bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long map_bytes;

	mminit_validate_memmodel_limits(&start, &end);

	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));

	/* Insert the descriptor into the global list (needs node_min_pfn set). */
	link_bootmem(bdata);

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	map_bytes = bootmap_bytes(end - start);
	memset(bdata->node_bootmem_map, 0xff, map_bytes);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, map_bytes);

	return map_bytes;
}
/*
 * link bdata in order
 *
 * Inserts @bdata into the global bdata_list in front of the first entry
 * with a larger node_min_pfn, so the list stays sorted by ascending
 * start page frame.
 */
static void __init link_bootmem(bootmem_data_t *bdata)
{
	struct list_head *pos;

	list_for_each(pos, &bdata_list) {
		bootmem_data_t *other = list_entry(pos, bootmem_data_t, list);

		if (other->node_min_pfn > bdata->node_min_pfn)
			break;
	}

	/*
	 * pos is the first larger entry, or the list head when there is
	 * none; adding "at the tail" of pos inserts right before it.
	 */
	list_add_tail(&bdata->list, pos);
}
/**
 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed, this
 * function may be used instead of calling free_bootmem() manually.
 *
 * Ranges starting at or above @max_low_pfn are skipped; ranges crossing
 * it are clamped to it.
 */
void __init free_bootmem_with_active_regions(int nid,
						unsigned long max_low_pfn)
{
	int i;

	for_each_active_range_index_in_nid(i, nid) {
		unsigned long stop_pfn;
		unsigned long nr_pages;

		/* Entirely above the limit: nothing to free. */
		if (early_node_map[i].start_pfn >= max_low_pfn)
			continue;

		stop_pfn = early_node_map[i].end_pfn;
		if (stop_pfn > max_low_pfn)
			stop_pfn = max_low_pfn;

		/* Number of pages of this range that lie below the limit. */
		nr_pages = stop_pfn - early_node_map[i].start_pfn;

		/* Freeing means clearing the matching bits in the bitmap. */
		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
				  PFN_PHYS(early_node_map[i].start_pfn),
				  nr_pages << PAGE_SHIFT);
	}
}
/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	/*
	 * Round the start up and the end down so that only pages lying
	 * completely inside the range are freed.
	 */
	unsigned long first_pfn = PFN_UP(physaddr);
	unsigned long stop_pfn = PFN_DOWN(physaddr + size);

	/* kmemleak bookkeeping (debugging aid). */
	kmemleak_free_part(__va(physaddr), size);

	/* reserve == 0: clear the bits, i.e. mark the pages as free. */
	mark_bootmem_node(pgdat->bdata, first_pfn, stop_pfn, 0, 0);
}
/*
 * mark_bootmem_node - reserve or free a page frame range on one node.
 * @bdata:   bootmem descriptor of the node
 * @start:   first page frame of the range
 * @end:     end page frame of the range (not included)
 * @reserve: non-zero to reserve the range, zero to free it
 * @flags:   reservation flags, only used when reserving
 *
 * The range must lie inside [node_min_pfn, node_low_pfn], otherwise the
 * BUG_ON checks fire.  Returns the result of __reserve() when reserving
 * and 0 when freeing.
 */
static int __init mark_bootmem_node(bootmem_data_t *bdata,
				unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long first_idx;
	unsigned long stop_idx;

	bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
		bdata - bootmem_node_data, start, end, reserve, flags);

	BUG_ON(start < bdata->node_min_pfn);
	BUG_ON(end > bdata->node_low_pfn);

	/* Turn page frame numbers into bitmap indices relative to the node start. */
	first_idx = start - bdata->node_min_pfn;
	stop_idx = end - bdata->node_min_pfn;

	if (!reserve) {
		__free(bdata, first_idx, stop_idx);
		return 0;
	}

	return __reserve(bdata, first_idx, stop_idx, flags);
}
/*
 * __reserve - set the bitmap bits of the index range [sidx, eidx).
 *
 * With BOOTMEM_EXCLUSIVE in @flags, hitting a page that is already
 * reserved undoes the bits set so far and returns -EBUSY.  Without it
 * the double reservation is only logged.  Returns 0 on success.
 */
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
			unsigned long eidx, int flags)
{
	const int exclusive = flags & BOOTMEM_EXCLUSIVE;
	unsigned long page;

	bdebug("nid=%td start=%lx end=%lx flags=%x\n",
		bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn,
		flags);

	for (page = sidx; page < eidx; page++) {
		/* Bit was clear: the page is now reserved, move on. */
		if (!test_and_set_bit(page, bdata->node_bootmem_map))
			continue;

		if (exclusive) {
			/* Roll back the pages reserved by this call. */
			__free(bdata, sidx, page);
			return -EBUSY;
		}

		bdebug("silent double reserve of PFN %lx\n",
			page + bdata->node_min_pfn);
	}

	return 0;
}
/*
 * __free - clear the bitmap bits of the index range [sidx, eidx).
 *
 * Every page in the range must currently be reserved; freeing a page
 * whose bit is already clear triggers BUG().
 */
static void __init __free(bootmem_data_t *bdata,
			unsigned long sidx, unsigned long eidx)
{
	unsigned long page = sidx;

	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn);

	/* Pull the search hint back so the allocator sees the freed pages. */
	if (sidx < bdata->hint_idx)
		bdata->hint_idx = sidx;

	while (page < eidx) {
		if (!test_and_clear_bit(page, bdata->node_bootmem_map))
			BUG();
		page++;
	}
}
/*
 * early_res_to_bootmem - hand the early reservations over to bootmem.
 * @start: start address of the range being set up
 * @end:   end address of the range being set up
 *
 * Every early reservation is clipped to [@start, @end); whatever
 * remains is reserved in the bootmem allocator.
 */
void __init early_res_to_bootmem(u64 start, u64 end)
{
	int nr = 0;
	int i;

	/* Count the used entries: the table ends at the first zero 'end'. */
	while (nr < MAX_EARLY_RES && early_res[nr].end)
		nr++;

	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
	       nr, start, end);

	for (i = 0; i < nr; i++) {
		struct early_res *r = &early_res[i];
		u64 lo;
		u64 hi;

		printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
		       r->start, r->end, r->name);

		/* Intersection of the reservation with [start, end). */
		lo = max(start, r->start);
		hi = min(end, r->end);

		if (lo < hi) {
			printk(KERN_CONT " ==> [%010llx - %010llx]\n",
			       lo, hi);
			/* Mark the overlapping part as reserved. */
			reserve_bootmem_generic(lo, hi - lo,
						BOOTMEM_DEFAULT);
		} else {
			/* No overlap: just finish the log line. */
			printk(KERN_CONT "\n");
		}
	}
}
上面的保留指定區間reserve_bootmem_generic()函數最終調用如下函數
/**
 * reserve_bootmem - mark a page range as reserved
 * @addr: starting address of the range
 * @size: size of the range in bytes
 * @flags: reservation flags (see linux/bootmem.h)
 *
 * Partial pages will be reserved.
 *
 * The range must be contiguous but may span node boundaries.
 */
int __init reserve_bootmem(unsigned long addr, unsigned long size,
			    int flags)
{
	/*
	 * Round the start down and the end up so that partially covered
	 * pages are reserved as well.
	 */
	return mark_bootmem(PFN_DOWN(addr), PFN_UP(addr + size), 1, flags);
}
/*
 * mark_bootmem - reserve or free a page frame range that may span nodes.
 * @start:   first page frame of the range
 * @end:     end page frame of the range (not included)
 * @reserve: non-zero to reserve, zero to free
 * @flags:   reservation flags
 *
 * Walks the sorted bdata_list and processes the part of the range that
 * falls on each node.  Returns 0 on success; when a reservation fails,
 * the part reserved so far is freed again and the error is returned.
 * A range not fully covered by the registered nodes triggers BUG().
 */
static int __init mark_bootmem(unsigned long start, unsigned long end,
				int reserve, int flags)
{
	bootmem_data_t *bdata;
	unsigned long cur = start;

	list_for_each_entry(bdata, &bdata_list, list) {
		unsigned long stop;
		int err;

		if (!(bdata->node_min_pfn <= cur &&
		      cur < bdata->node_low_pfn)) {
			/* Once started, the range must continue on the very next node. */
			BUG_ON(cur != start);
			continue;
		}

		/* Part of the range that lies on this node. */
		stop = min(bdata->node_low_pfn, end);

		err = mark_bootmem_node(bdata, cur, stop, reserve, flags);
		if (reserve && err) {
			/* Undo the reservations made on earlier nodes. */
			mark_bootmem(start, cur, 0, 0);
			return err;
		}

		if (stop == end)
			return 0;

		/* Continue at the start of the next node. */
		cur = bdata->node_low_pfn;
	}

	BUG();
}
三、內存的分配和釋放
介紹了上面的初始化流程,對於分配和釋放就簡單了,分配就是將分配器映射位圖中對應的位置1,釋放過程相反。
/*
 * alloc_bootmem_core - allocate @size bytes from one node's bootmem bitmap.
 * @bdata: bootmem descriptor of the node to allocate from
 * @size:  number of bytes to allocate, must not be zero
 * @align: required alignment in bytes, must be a power of two
 * @goal:  preferred physical start address, 0 for none
 * @limit: physical address the allocation must stay below, 0 for none
 *
 * Returns the zeroed region's virtual address, or NULL when the node has
 * no bitmap or no suitable run of free pages.
 */
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
					unsigned long size, unsigned long align,
					unsigned long goal, unsigned long limit)
{
	unsigned long fallback = 0;
	unsigned long min, max, start, sidx, midx, step;

	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
		align, goal, limit);

	BUG_ON(!size);
	BUG_ON(align & (align - 1));
	BUG_ON(limit && goal + size > limit);

	/* Without a bitmap nothing can be allocated from this node. */
	if (!bdata->node_bootmem_map)
		return NULL;

	min = bdata->node_min_pfn;
	max = bdata->node_low_pfn;

	/* From here on goal and limit are page frame numbers. */
	goal >>= PAGE_SHIFT;
	limit >>= PAGE_SHIFT;

	if (limit && max > limit)
		max = limit;
	if (max <= min)
		return NULL;

	/* Search stride in pages: the alignment, but at least one page. */
	step = max(align >> PAGE_SHIFT, 1UL);

	/* Start at the goal when it lies inside the node, else at the node start. */
	if (goal && min < goal && goal < max)
		start = ALIGN(goal, step);
	else
		start = ALIGN(min, step);

	/* Search window as bitmap indices relative to the node start. */
	sidx = start - bdata->node_min_pfn;
	midx = max - bdata->node_min_pfn;

	/*
	 * If the previous allocation ended beyond the computed start, try
	 * from there first and remember the computed start, so the search
	 * can be repeated from it if the first pass finds nothing.
	 */
	if (bdata->hint_idx > sidx) {
		/*
		 * Handle the valid case of sidx being zero and still
		 * catch the fallback below.
		 */
		fallback = sidx + 1;
		/* Continue where the previous allocation stopped, aligned to step. */
		sidx = align_idx(bdata, bdata->hint_idx, step);
	}

	while (1) {
		int merge;
		void *region;
		unsigned long eidx, i, start_off, end_off;
find_block:
		/* Next free (clear) bit at or after sidx. */
		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
		sidx = align_idx(bdata, sidx, step);
		eidx = sidx + PFN_UP(size);	/* end index of the candidate run */

		/* Ran out of the search window. */
		if (sidx >= midx || eidx > midx)
			break;

		/* Every page of the candidate run must be free. */
		for (i = sidx; i < eidx; i++)
			if (test_bit(i, bdata->node_bootmem_map)) {
				/* Busy page: restart behind it, keeping the alignment. */
				sidx = align_idx(bdata, i, step);
				if (sidx == i)
					sidx += step;
				goto find_block;
			}

		/*
		 * If the previous allocation ended in the middle of the page
		 * right before sidx, start in the unused tail of that page
		 * instead of at the page boundary.
		 */
		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
			start_off = align_off(bdata, bdata->last_end_off, align);
		else
			start_off = PFN_PHYS(sidx);

		/*
		 * merge == 1: the region starts inside the previous page, which
		 * is already reserved and must be skipped when reserving below.
		 */
		merge = PFN_DOWN(start_off) < sidx;
		end_off = start_off + size;

		/* Remember where this allocation ends for the next call. */
		bdata->last_end_off = end_off;
		bdata->hint_idx = PFN_UP(end_off);

		/*
		 * Reserve the area now:
		 */
		if (__reserve(bdata, PFN_DOWN(start_off) + merge,
				PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
			BUG();

		/* Virtual address of the region; it is handed out zeroed. */
		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
				start_off);
		memset(region, 0, size);
		/*
		 * The min_count is set to 0 so that bootmem allocated blocks
		 * are never reported as leaks.
		 */
		kmemleak_alloc(region, size, 0, 0);
		return region;
	}

	/* First pass started at the hint and failed: retry from the original start. */
	if (fallback) {
		sidx = align_idx(bdata, fallback - 1, step);
		fallback = 0;
		goto find_block;
	}

	return NULL;
}
摘自 bullbat的專欄