前面簡單的分析了內核處理用戶空間缺頁異常的流程,進入到了handle_mm_fault()函數,該函數為觸發缺頁異常的地址address分配各級的頁目錄,也就是說現在已經擁有了一個和address配對的pte了,但是這個pte如何去映射物理頁框,內核又得根據pte的狀態進行分類和判斷,而這個過程又會牽扯出一些其他的概念……這也是初讀linux內核源碼的最大障礙吧,在一些復雜的處理中,一個點往往可以延伸出一個面,容易讓人迷失方向……因此後面打算分幾次將這個函數分析完,自己也沒有完全理解透,所以不到位的地方歡迎大家指出,一起交流~
/*
 * handle_pte_fault - dispatch a fault once the pte for @address is known.
 *
 * Only the "page not present" half of the function is shown in this excerpt;
 * the remainder of the body is elided ("...") below.
 *
 * @mm:      address space the fault occurred in
 * @vma:     virtual memory area covering @address
 * @address: faulting virtual address
 * @pte:     page table entry for @address
 * @pmd:     page middle directory entry that @pte belongs to
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * For a non-present entry, returns whatever the selected handler returns:
 * do_linear_fault(), do_anonymous_page(), do_nonlinear_fault() or
 * do_swap_page().
 */
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
/* Work on a snapshot of the entry; the handlers receive this copy. */
entry = *pte;
if (!pte_present(entry)) {// the page is not in main memory
if (pte_none(entry)) {// the entry is empty: the process has never accessed this page
/* If both vm_ops and vm_ops->fault are set, this is treated as a file-backed mapping */
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
/* Otherwise allocate an anonymous page */
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
/* The entry belongs to a nonlinear file mapping whose page has been evicted */
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
/* Not present, but the entry still carries information: the page was swapped out and must be swapped back in */
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
...
...
}
首先要確定的一點就是pte對應的頁是否駐留在主存中,因為pte有可能之前映射了頁,但是該頁被換出了。上面的代碼給出了pte對應的頁沒有駐留在主存中的情況。如果pte對應的頁沒有駐留在主存中,且沒有映射任何頁,即pte_present()返回0,pte_none()返回非0(頁表項內容為空),則要判斷要分配一個匿名頁還是一個映射頁。在Linux虛擬內存中,如果頁對應的vma映射的是文件,則稱為映射頁,如果不是映射的文件,則稱為匿名頁。兩者最大的區別體現在頁和vma的組織上,因為在頁框回收處理時要通過頁來逆向搜索映射了該頁的vma。對於匿名頁的逆映射,vma都是通過vma結構體中的anon_vma_node(鏈表節點)和anon_vma(鏈表頭)組織起來,再把該鏈表頭的信息保存在頁描述符中;而映射頁和vma的組織是通過vma中的優先樹節點和頁描述符中的mapping->i_mmap優先樹樹根進行組織的,具體可以參看ULK3。
來看基於文件的映射的處理:
/*
 * do_linear_fault - handle a fault on a not-yet-populated pte of a vma
 * that provides a ->fault handler.
 *
 * Translates @address into a page offset (the page's index inside the vma
 * plus vma->vm_pgoff), drops the caller's mapping of @page_table and hands
 * the rest of the work to __do_fault().
 *
 * Returns the value produced by __do_fault().
 */
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
	/* Start of the page that contains the faulting address. */
	unsigned long page_start = address & PAGE_MASK;
	/* Index of that page counted from the beginning of the vma. */
	unsigned long index_in_vma = (page_start - vma->vm_start) >> PAGE_SHIFT;
	pgoff_t pgoff = index_in_vma + vma->vm_pgoff;

	/*
	 * The pte pointer is not used past this point; release its mapping
	 * (presumably a temporary kernel mapping set up by the caller).
	 */
	pte_unmap(page_table);

	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
關鍵函數__do_fault():
/*
 * __do_fault - obtain the page for a fault via vma->vm_ops->fault() and
 * install it in the page table.
 *
 * @mm:       address space the fault occurred in
 * @vma:      virtual memory area covering @address
 * @address:  faulting virtual address
 * @pmd:      page middle directory entry used to look the pte up again
 * @pgoff:    page offset handed to the ->fault handler through struct vm_fault
 * @flags:    FAULT_FLAG_* bits; FAULT_FLAG_WRITE selects the write paths below
 * @orig_pte: pte value observed by the caller, used to detect a racing update
 *
 * Outline:
 *  1. Call ->fault() and make sure the returned page is locked.
 *  2. For a write fault on a private (!VM_SHARED) mapping, allocate a new page
 *     and copy the faulted page into it; for a write fault on a shared
 *     mapping, give ->page_mkwrite() (if provided) a chance to react.
 *  3. Under the pte lock, install the new entry if the pte still equals
 *     @orig_pte; otherwise undo the charge/allocation.
 *  4. Dirty-page accounting, unlocking and reference dropping.
 *
 * Returns the VM_FAULT_* value from ->fault() on the normal path,
 * VM_FAULT_HWPOISON for a poisoned page, VM_FAULT_OOM when anon_vma setup,
 * allocation or memcg charging fails, or the value/0 chosen on the
 * ->page_mkwrite() failure paths.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
pte_t entry;
int anon = 0;
int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
/* Describe the fault for the ->fault handler; it reports the page in vmf.page. */
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);// call the mapping's fault handler, which is expected to bring the needed file data into the page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
/*
 * NOTE(review): this early return unlocks vmf.page but, unlike the
 * unwritable_page path, does not call page_cache_release() on it --
 * confirm whether a page reference is leaked here.
 */
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
return VM_FAULT_HWPOISON;
}
/*
* For consistency in subsequent calls, make the faulted page always
* locked.
*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
VM_BUG_ON(!PageLocked(vmf.page));
/*
* Should we do an early C-O-W break?
*/
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {// write access
if (!(vma->vm_flags & VM_SHARED)) {// private mapping: create a copy of the page (copy-on-write)
anon = 1;// mark the result as an anonymous page
if (unlikely(anon_vma_prepare(vma))) {// set up an anon_vma for the vma
ret = VM_FAULT_OOM;
goto out;
}
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,// allocate a page
vma, address);
if (!page) {
ret = VM_FAULT_OOM;
goto out;
}
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
page_cache_release(page);
goto out;
}
charged = 1;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED)
clear_page_mlock(vmf.page);
/* Make the private copy: copy the faulted page's data into the newly allocated page */
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
/*
* If the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable
*/
if (vma->vm_ops->page_mkwrite) {
int tmp;
/* ->page_mkwrite() is called with the page unlocked. */
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(page);
/* The page lost its mapping while it was unlocked: give up and let the fault be retried. */
if (!page->mapping) {
ret = 0; /* retry the fault */
unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
}
/* Look the pte up again and take its lock before comparing it with orig_pte. */
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if FAULT_FLAG_WRITE is set, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {// no race: the page table entry still holds the value seen earlier
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);// build a pte that points at the page
/* On a write fault, mark the entry dirty and let maybe_mkwrite() apply write permission */
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* If the page is the anonymous copy made above, add it to the reverse mapping */
if (anon) {
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);// reverse-map the new anonymous page to this vma
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);// add the file-backed reverse mapping for the page
if (flags & FAULT_FLAG_WRITE) {
/* Hold an extra reference for the dirty-page handling after the pte lock is dropped. */
dirty_page = page;
get_page(dirty_page);
}
}
set_pte_at(mm, address, page_table, entry);// install entry in the page table slot
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
/* Lost the race: undo the memcg charge and drop the copy, or arrange for the faulted page to be released below. */
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
anon = 1; /* no anon but release faulted_page */
}
pte_unmap_unlock(page_table, ptl);
out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
if (set_page_dirty(dirty_page))
page_mkwrite = 1;
unlock_page(dirty_page);
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
} else {
/* No dirty accounting needed: unlock the faulted page and, when anon is set, drop its reference. */
unlock_page(vmf.page);
if (anon)
page_cache_release(vmf.page);
}
return ret;
unwritable_page:
/* ->page_mkwrite() refused or the page was truncated: drop the page reference and return. */
page_cache_release(page);
return ret;
}