Linux 內核通過定期檢查並且預先將若干頁面換出, 實現減輕系統在缺頁異常時候所產生的負擔。 雖然, 無法避免需要臨時尋找可以換出的頁面, 但是, 可以減少這種事件發生的概率。Linux 內核中設置一個專門用來定期將頁面換出的線程 kswapd。
kswapd 相當於一個進程 有自己的進程控制塊 task_struct 結構, 但是呢, 他沒有自己獨立的地址空間, 我們可以將它理解為是線程。
==================== mm/vmscan.c 1146 1153 ==================== 1146 static int __init kswapd_init(void) 1147 { 1148 printk("Starting kswapd v1.8\n"); 1149 swap_setup(); 1150 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); 1151 kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); 1152 return 0; 1153 } ==================== mm/swap.c 293 305 ==================== [kswapd_init()>swap_setup()] 293 /* 294 * Perform any setup for the swap system 295 */ 296 void __init swap_setup(void) 297 { 298 /* Use a smaller cluster for memory <16MB or <32MB */ 299 if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT)) 300 page_cluster = 2; 301 else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT)) 302 page_cluster = 3; 303 else 304 page_cluster = 4; 305 }kswapd_init 在系統初始化期間受到調用,主要負責 2 個事情:
==================== mm/vmscan.c 947 1046 ==================== 947 /* 948 * The background pageout daemon, started as a kernel thread 949 * from the init process. 950 * 951 * This basically trickles out pages so that we have _some_ 95 952 * free memory available even if there is no other activity 953 * that frees anything up. This is needed for things like routing 954 * etc, where we otherwise might have all activity going on in 955 * asynchronous contexts that cannot page things out. 956 * 957 * If there are applications that are active memory-allocators 958 * (most normal use), this basically shouldn't matter. 959 */ 960 int kswapd(void *unused) 961 { 962 struct task_struct *tsk = current; 963 964 tsk->session = 1; 965 tsk->pgrp = 1; 966 strcpy(tsk->comm, "kswapd"); 967 sigfillset(&tsk->blocked); 968 kswapd_task = tsk; 969 970 /* 971 * Tell the memory management that we're a "memory allocator", 972 * and that if we need more memory we should get access to it 973 * regardless (see "__alloc_pages()"). "kswapd" should 974 * never get caught in the normal page freeing logic. 975 * 976 * (Kswapd normally doesn't need memory anyway, but sometimes 977 * you need a small amount of memory in order to be able to 978 * page out something else, and this flag essentially protects 979 * us from recursively trying to free more memory as we're 980 * trying to free the first piece of memory in the first place). 981 */ 982 tsk->flags |= PF_MEMALLOC; 983 984 /* 985 * Kswapd main loop. 986 */ 987 for (;;) { 988 static int recalc = 0; 989 990 /* If needed, try to free some memory. */ 991 if (inactive_shortage() || free_shortage()) { 992 int wait = 0; 993 /* Do we need to do some synchronous flushing? */ 994 if (waitqueue_active(&kswapd_done)) 995 wait = 1; 996 do_try_to_free_pages(GFP_KSWAPD, wait); 997 } 998 999 /* 1000 * Do some (very minimal) background scanning. This 1001 * will scan all pages on the active list once 1002 * every minute. This clears old referenced bits 1003 * and moves unused pages to the inactive list. 1004 */ 1005 refill_inactive_scan(6, 0); 1006 1007 /* Once a second, recalculate some VM stats. */ 1008 if (time_after(jiffies, recalc + HZ)) { 1009 recalc = jiffies; 1010 recalculate_vm_stats(); 1011 } 1012 1013 /* 1014 * Wake up everybody waiting for free memory 1015 * and unplug the disk queue. 1016 */ 1017 wake_up_all(&kswapd_done); 1018 run_task_queue(&tq_disk); 1019 1020 /* 1021 * We go to sleep if either the free page shortage 1022 * or the inactive page shortage is gone. We do this 1023 * because: 1024 * 1) we need no more free pages or 1025 * 2) the inactive pages need to be flushed to disk, 1026 *it wouldn't help to eat CPU time now ... 1027 * 1028 * We go to sleep for one second, but if it's needed 1029 * we'll be woken up earlier... 1030 */ 1031 if (!free_shortage() || !inactive_shortage()) { 1032 interruptible_sleep_on_timeout(&kswapd_wait, HZ); 1033 /* 1034 * If we couldn't free enough memory, we see if it was 1035 * due to the system just not having enough memory. 1036 * If that is the case, the only solution is to kill 1037 * a process (the alternative is enternal deadlock). 1038 * 1039 * If there still is enough memory around, we just loop 1040 * and try free some more memory... 1041 */ 1042 } else if (out_of_memory()) { 1043 oom_kill(); 1044 } 1045 } 1046 }就如同程序開始的注釋中所描述的, 這個kswapd 是一個頁面換出的守護進程, 他可以在沒有進程主動釋放內存的時候, 抽取出一些空閒頁面, 來為後面的頁面分配做准備。 通過設置PF_MEMALLOC 聲明自己的權限, 表明自己是內存分配者 注意到, 這是一個死循環, 在循環的末尾,我們會調用interruptible_sleep_on_timeout 進入睡眠,允許內核自由的調度其他的進程運行。 而這裡的 HZ 決定了內核中每秒有多少次時鐘的中斷。 分析可知這個循環至少每隔 1s 執行一遍, 這就是kswapd 的例行路線了。 接下來, 我們來看一下 這個例行路線中一般會做些什麼呢?
==================== mm/vmscan.c 805 822 ==================== [kswapd()>inactive_shortage()] 805 /* 806 * How many inactive pages are we short? 807 */ 808 int inactive_shortage(void) 809 { 810 int shortage = 0; 811 812 shortage += freepages.high; 813 shortage += inactive_target; 814 shortage -= nr_free_pages(); 815 shortage -= nr_inactive_clean_pages(); 816 shortage -= nr_inactive_dirty_pages; 817 818 if (shortage > 0) 819 return shortage; 820 821 return 0; 822 }這段代碼主要用來檢查內存中可供分配或者周轉的物理頁面是否短缺 freepages.high 表征空閒頁面的數量, inactive_target 表征不活躍頁面的數量。 而他們兩者的和表征了正常情況下, 系統潛在的頁面的供應量。而這部分供應量又來自於3個部分:
==================== include/linux/wait.h 152 161 ==================== [kswapd()>waitqueue_active()] 152 static inline int waitqueue_active(wait_queue_head_t *q) 153 { 154 #if WAITQUEUE_DEBUG 155 if (!q) 156 WQ_BUG(); 157 CHECK_MAGIC_WQHEAD(q); 158 #endif 159 160 return !list_empty(&q->task_list); 161 }當發現頁面短缺的時候, 我們需要設法釋放和換出若干頁面, 通過do_try_to_free_pages 完成。 通過waitqueue_active, 我們可以查看 kswapd_done 隊列中是否由函數在等待執行。 這個kswapd_done 隊列, 會保證在kswapd每完成一次例行操作的時候, 會執行其中的掛載到隊列中的函數。
==================== mm/vmscan.c 907 941 ==================== [kswapd()>do_try_to_free_pages()] 907 static int do_try_to_free_pages(unsigned int gfp_mask, int user) 908 { 909 int ret = 0; 910 911 /* 912 * If we're low on free pages, move pages from the 913 * inactive_dirty list to the inactive_clean list. 914 * 915 * Usually bdflush will have pre-cleaned the pages 916 * before we get around to moving them to the other 917 * list, so this is a relatively cheap operation. 918 */ 919 if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() + 920 nr_inactive_clean_pages()) 921 ret += page_launder(gfp_mask, user); 922 923 /* 924 * If needed, we move pages from the active list 925 * to the inactive list. We also "eat" pages from 926 * the inode and dentry cache whenever we do this. 927 */ 928 if (free_shortage() || inactive_shortage()) { 929 shrink_dcache_memory(6, gfp_mask); 930 shrink_icache_memory(6, gfp_mask); 931 ret += refill_inactive(gfp_mask, user); 932 } else { 933 /* 934 * Reclaim unused slab cache memory. 935 */ 936 kmem_cache_reap(gfp_mask); 937 ret = 1; 938 } 939 940 return ret; 941 }當我們空閒頁面比較短缺, 或者非活躍髒頁面的數量相對較多的時候, 我們就利用page_launder 獲取一部分干淨的頁面。 如果經過上面1 的操作, 頁面還是短缺的話, 就考慮將活躍頁面轉為非活躍頁面進行處理。
==================== mm/vmscan.c 465 670 ==================== [kswapd()>do_try_to_free_pages()>page_launder()] 465 /** 466 * page_launder - clean dirty inactive pages, move to inactive_clean list 467 * @gfp_mask: what operations we are allowed to do 468 * @sync: should we wait synchronously for the cleaning of pages 469 * 470 * When this function is called, we are most likely low on free + 471 * inactive_clean pages. Since we want to refill those pages as 472 * soon as possible, we'll make two loops over the inactive list, 473 * one to move the already cleaned pages to the inactive_clean lists 474 * and one to (often asynchronously) clean the dirty inactive pages. 475 * 476 * In situations where kswapd cannot keep up, user processes will 477 * end up calling this function. Since the user process needs to 478 * have a page before it can continue with its allocation, we'll 479 * do synchronous page flushing in that case. 480 * 481 * This code is heavily inspired by the FreeBSD source code. Thanks 482 * go out to Matthew Dillon. 483 */ 484 #define MAX_LAUNDER (4 * (1 << page_cluster)) 485 int page_launder(int gfp_mask, int sync) 486 { 487 int launder_loop, maxscan, cleaned_pages, maxlaunder; 488 int can_get_io_locks; 489 struct list_head * page_lru; 490 struct page * page; 491 492 /* 493 * We can only grab the IO locks (eg. for flushing dirty 494 * buffers to disk) if __GFP_IO is set. 495 */ 496 can_get_io_locks = gfp_mask & __GFP_IO; 497 498 launder_loop = 0; 499 maxlaunder = 0; 500 cleaned_pages = 0; 501 502 dirty_page_rescan: 503 spin_lock(&pagemap_lru_lock); 504 maxscan = nr_inactive_dirty_pages; 505 while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && 506 maxscan-- > 0) { 507 page = list_entry(page_lru, struct page, lru); 508 509 /* Wrong page on list?! (list corruption, should not happen) */ 510 if (!PageInactiveDirty(page)) { 511 printk("VM: page_launder, wrong page on list.\n"); 512 list_del(page_lru); 513 nr_inactive_dirty_pages--; 514 page->zone->inactive_dirty_pages--; 515 continue; 516 } 517 518 /* Page is or was in use? Move it to the active list. */ 519 if (PageTestandClearReferenced(page) || page->age > 0 || 520 (!page->buffers && page_count(page) > 1) || 521 page_ramdisk(page)) { 522 del_page_from_inactive_dirty_list(page); 523 add_page_to_active_list(page); 524 continue; 525 } 526 527 /* 528 * The page is locked. IO in progress? 529 * Move it to the back of the list. 530 */ 531 if (TryLockPage(page)) { 532 list_del(page_lru); 533 list_add(page_lru, &inactive_dirty_list); 534 continue; 535 } 536 537 /* 538 * Dirty swap-cache page? Write it out if 539 * last copy.. 540 */ 541 if (PageDirty(page)) { 542 int (*writepage)(struct page *) = page->mapping->a_ops->writepage; 543 int result; 544 545 if (!writepage) 546 goto page_active; 547 548 /* First time through? Move it to the back of the list */ 549 if (!launder_loop) { 550 list_del(page_lru); 551 list_add(page_lru, &inactive_dirty_list); 552 UnlockPage(page); 553 continue; 554 } 555 556 /* OK, do a physical asynchronous write to swap. */ 557 ClearPageDirty(page); 558 page_cache_get(page); 559 spin_unlock(&pagemap_lru_lock); 560 561 result = writepage(page); 562 page_cache_release(page); 563 564 /* And re-start the thing.. */ 565 spin_lock(&pagemap_lru_lock); 566 if (result != 1) 567 continue; 568 /* writepage refused to do anything */ 569 set_page_dirty(page); 570 goto page_active; 571 } 572 573 /* 574 * If the page has buffers, try to free the buffer mappings 575 * associated with this page. If we succeed we either free 576 * the page (in case it was a buffercache only page) or we 577 * move the page to the inactive_clean list. 578 * 579 * On the first round, we should free all previously cleaned 580 * buffer pages 581 */ 582 if (page->buffers) { 583 int wait, clearedbuf; 584 int freed_page = 0; 585 /* 586 * Since we might be doing disk IO, we have to 587 * drop the spinlock and take an extra reference 588 * on the page so it doesn't go away from under us. 589 */ 590 del_page_from_inactive_dirty_list(page); 591 page_cache_get(page); 592 spin_unlock(&pagemap_lru_lock); 593 594 /* Will we do (asynchronous) IO? */ 595 if (launder_loop && maxlaunder == 0 && sync) 596 wait = 2; /* Synchrounous IO */ 597 else if (launder_loop && maxlaunder-- > 0) 598 wait = 1; /* Async IO */ 599 else 600 wait = 0; /* No IO */ 601 602 /* Try to free the page buffers. */ 603 clearedbuf = try_to_free_buffers(page, wait); 604 605 /* 606 * Re-take the spinlock. Note that we cannot 607 * unlock the page yet since we're still 608 * accessing the page_struct here... 609 */ 610 spin_lock(&pagemap_lru_lock); 611 612 /* The buffers were not freed. */ 613 if (!clearedbuf) { 614 add_page_to_inactive_dirty_list(page); 615 616 /* The page was only in the buffer cache. */ 617 } else if (!page->mapping) { 618 atomic_dec(&buffermem_pages); 619 freed_page = 1; 620 cleaned_pages++; 621 622 /* The page has more users besides the cache and us. */ 623 } else if (page_count(page) > 2) { 624 add_page_to_active_list(page); 625 626 /* OK, we "created" a freeable page. */ 627 } else /* page->mapping && page_count(page) == 2 */ { 628 add_page_to_inactive_clean_list(page); 629 cleaned_pages++; 630 } 631 632 /* 633 * Unlock the page and drop the extra reference. 634 * We can only do it here because we ar accessing 635 * the page struct above. 636 */ 637 UnlockPage(page); 638 page_cache_release(page); 639 640 /* 641 * If we're freeing buffer cache pages, stop when 642 * we've got enough free memory. 643 */ 644 if (freed_page && !free_shortage()) 645 break; 646 continue; 647 } else if (page->mapping && !PageDirty(page)) { 648 /* 649 * If a page had an extra reference in 650 * deactivate_page(), we will find it here. 651 * Now the page is really freeable, so we 652 * move it to the inactive_clean list. 653 */ 654 del_page_from_inactive_dirty_list(page); 655 add_page_to_inactive_clean_list(page); 656 UnlockPage(page); 657 leaned_pages++; 658 } else { 659 page_active: 660 /* 661 * OK, we don't know what to do with the page. 662 * It's no use keeping it here, so we move it to 663 * the active list. 664 */ 665 del_page_from_inactive_dirty_list(page); 666 add_page_to_active_list(page); 667 UnlockPage(page); 668 } 669 } 670 spin_unlock(&pagemap_lru_lock); ==================== mm/vmscan.c 671 697 ==================== [kswapd()>do_try_to_free_pages()>page_launder()] 671 672 /* 673 * If we don't have enough free pages, we loop back once 674 * to queue the dirty pages for writeout. When we were called 675 * by a user process (that /needs/ a free page) and we didn't 676 * free anything yet, we wait synchronously on the writeout of 677 * MAX_SYNC_LAUNDER pages. 678 * 679 * We also wake up bdflush, since bdflush should, under most 680 * loads, flush out the dirty pages before we have to wait on 681 * IO. 682 */ 683 if (can_get_io_locks && !launder_loop && free_shortage()) { 684 launder_loop = 1; 685 /* If we cleaned pages, never do synchronous IO. */ 686 if (cleaned_pages) 687 sync = 0; 688 * We only do a few "out of order" flushes. */ 689 maxlaunder = MAX_LAUNDER; 690 /* Kflushd takes care of the rest. */ 691 wakeup_bdflush(0); 692 goto dirty_page_rescan; 693 } 694 695 /* Return the number of pages moved to the inactive_clean list. */ 696 return cleaned_pages; 697 }這個函數很長, 主要用來清理非活躍髒頁面, 並將他們移動到非活躍干淨鏈表中。就如注釋中描述的, 它使用了兩個循環來掃描非活躍鏈表, 一個循環用來將已經cleaned 的頁面移動到 inactive_clean_list 中去, 另一個循環用來異步的清理非活躍的髒頁面, 將他們變成干淨的頁面。 使用 cleaned_pages 表示累計被 洗干淨的頁面的數量, 用 launder_loop 來控制掃描不活躍髒頁面隊列的次數。我們可以看到 代碼 683 行處 使用了 !launder_loop 作為判斷條件, ie, 最多只會對非活躍頁面進行 兩次掃描。 通過maxscan 設置最大的掃描數量,為什麼需要這麼做呢?這是因為, 我們在循環掃描一個不活躍髒頁面隊列的時候, 可能會把有些頁面從當前位置移動到隊列的尾部, 設置最大的掃描數量可以避免對一個頁面的重復操作。 510 ~ 516 檢查當前頁面是不是非活躍髒頁面, 如果不是, 就應該把它移除出這個隊列。 518 ~ 524 檢查這個頁面是否是正在使用的, 如果是的話, 就把他移入活躍隊列。 當頁面被移入非活躍髒頁面時候, 然後又受到了訪問, 就有可能發生這樣的情形。這裡面有 4 個判斷條件, 頁面沒有老化, 或者 頁面沒有使用緩沖區 但是 頁面的引用計數 大於 1, 或者
感覺這個page_launder 主要用來將不活躍髒隊列中的頁面, 轉化成不活躍干淨的頁面。 他一般進行兩次掃描, 第一次掃描時候, 釋放之前已經清理過的所有 用作buffer 的page, 將一些需要處理的頁面設置為 PG_dirty, 並將他們放到 inactive_dirty_list 的末尾。第二次掃描的時候, 才將他們換出, 並移入到相應的隊列中去。
回到do_try_to_free_pages的代碼中, 在經過page_launder 處理過後, 如果可以分配的物理頁面的數量還是不足, 就需要進一步設法回收頁面了。主要有4個途徑:
1. shrink_dcache_memory, shrink_icache_memory, 由於打開文件的過程中需要分配和使用代表著目錄項的dentry 數據結構, 以及代表著文件索引節點的 inode 數據結構。 這些結構在文件關閉之後不會立即被釋放, 而是放在 LRU 隊列中作為後備, 以防止在不久的將來的文件操作中還會被用到。這兩個函數呢, 就是對這部分空間的回收。
2. kmem_cache_reap, 由於在內核中運行需要動態分配很多的數據結構, 因此在內核中采用了一種 稱為是 slab 的管理機制。這個機制將物理頁面分成小塊進行分配。 但是他不會主動退還這部分空間, 就需要kmem_cache_reap 進行回收。
3. refill_inactive 操作。
==================== mm/vmscan.c 824 905 ==================== [kswapd()>do_try_to_free_pages()>refill_inactive()] 824 /* 825 * We need to make the locks finer granularity, but right 826 * now we need this so that we can do page allocations 827 * without holding the kernel lock etc. 828 * 829 * We want to try to free "count" pages, and we want to 830 * cluster them so that we get good swap-out behaviour. 831 * 832 * OTOH, if we're a user process (and not kswapd), we 833 * really care about latency. In that case we don't try 834 * to free too many pages. 835 */ 836 static int refill_inactive(unsigned int gfp_mask, int user) 837 { 838 int priority, count, start_count, made_progress; 839 840 count = inactive_shortage() + free_shortage(); 841 if (user) 842 count = (1 << page_cluster); 843 start_count = count; 844 845 /* Always trim SLAB caches when memory gets low. */ 846 kmem_cache_reap(gfp_mask); 847 848 priority = 6; 849 do { 850 made_progress = 0; 851 852 if (current->need_resched) { 853 __set_current_state(TASK_RUNNING); 854 schedule(); 855 } 856 857 while (refill_inactive_scan(priority, 1)) { 858 made_progress = 1; 859 if (--count <= 0) 860 goto done; 861 } 862 863 /* 864 * don't be too light against the d/i cache since 865 * refill_inactive() almost never fail when there's 866 * really plenty of memory free. 867 */ 868 shrink_dcache_memory(priority, gfp_mask); 869 shrink_icache_memory(priority, gfp_mask); 870 871 /* 872 * Then, try to page stuff out.. 873 */ 874 while (swap_out(priority, gfp_mask)) { 875 made_progress = 1; 876 if (--count <= 0) 877 goto done; 878 } 879 880 /* 881 * If we either have enough free memory, or if 882 * page_launder() will be able to make enough 883 * free memory, then stop. 884 */ 885 if (!inactive_shortage() || !free_shortage()) 886 goto done; 887 888 /* 889 * Only switch to a lower "priority" if we 890 * didn't make any useful progress in the 891 * last loop. 892 */ 893 if (!made_progress) 894 priority--; 895 } while (priority >= 0); 896 897 /* Always end on a refill_inactive.., may sleep... */ 898 while (refill_inactive_scan(0, 1)) { 899 if (--count <= 0) 900 goto done; 901 } 902 903 done: 904 return (count < start_count); 905 }代碼開始處的注釋說了, 我們試圖釋放 count 個 頁面, 為了保證有更好的swap_out 的性能, 我們將他們以簇為單位進行數據處理。 這裡的user 表示是否有函數在 kswapd_done 隊列中等待。他決定了這次回收頁面的緊要程度, 對我們回收的頁面的數量有一定影響。(841 ~ 842)。 846 通過kmem_cache_reap 回收一部分的 slab 空間。 848 ~ 895 通過一個大的do .. while 循環, 逐步加大力度, 進行頁面的回收。 852 ~ 855 先檢查task_struct 結構中的need_resched 是否 為 1.如果是的話, 就使用 schedule 使內核可以執行一次調度。為什麼需要這樣操作呢? 這是因為, kswapd 是個內核線程, 他永遠不會返回自己的用戶空間, (因為他沒有), 這時候, 為了避免kswapd 占用cpu 長期不釋放, 就需要通過這個操作實現, ‘自律’。 這個函數主要進行兩個操作:
==================== mm/vmscan.c 699 769 ==================== 699 /** 700 * refill_inactive_scan - scan the active list and find pages to deactivate 701 * @priority: the priority at which to scan 702 * @oneshot: exit after deactivating one page 703 * 704 * This function will scan a portion of the active list to find 705 * unused pages, those pages will then be moved to the inactive list. 706 */ 707 int refill_inactive_scan(unsigned int priority, int oneshot) 708 { 709 struct list_head * page_lru; 710 struct page * page; 711 int maxscan, page_active = 0; 712 int ret = 0; 713 714 /* Take the lock while messing with the list... */ 715 spin_lock(&pagemap_lru_lock); 716 maxscan = nr_active_pages >> priority; 717 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { 718 page = list_entry(page_lru, struct page, lru); 719 720 /* Wrong page on list?! (list corruption, should not happen) */ 721 if (!PageActive(page)) { 722 printk("VM: refill_inactive, wrong page on list.\n"); 723 list_del(page_lru); 724 nr_active_pages--; 725 continue; 726 } 727 728 /* Do aging on the pages. */ 729 if (PageTestandClearReferenced(page)) { 730 age_page_up_nolock(page); 731 page_active = 1; 732 } else { 733 age_page_down_ageonly(page); 734 /* 735 * Since we don't hold a reference on the page 736 * ourselves, we have to do our test a bit more 737 * strict then deactivate_page(). This is needed 738 * since otherwise the system could hang shuffling 739 * unfreeable pages from the active list to the 740 * inactive_dirty list and back again... 741 * 742 * SUBTLE: we can have buffer pages with count 1. 743 */ 744 if (page->age == 0 && page_count(page) <= 745 (page->buffers ? 2 : 1)) { 746 deactivate_page_nolock(page); 747 page_active = 0; 748 } else { 749 page_active = 1; 750 } 751 } 752 /* 753 * If the page is still on the active list, move it 754 * to the other end of the list. Otherwise it was 755 * deactivated by age_page_down and we exit successfully. 756 */ 757 if (page_active || PageActive(page)) { 758 list_del(page_lru); 759 list_add(page_lru, &active_list); 760 } else { 761 ret = 1; 762 if (oneshot) 763 break; 764 } 765 } 766 spin_unlock(&pagemap_lru_lock); 767 768 return ret; 769 }
1. 像注釋裡面所說的, 這個函數用來掃描 active_list, 找出可以轉化為非活躍狀態的頁面。
2. 類似於 page_launder, 這個函數也維護了一個 maxscan。但是呢, 這個maxscan 是和我們的 priority 相關的。ie, 只有在priority 為 0 的時候, 才會掃描整個隊列。
3. 720 ~ 726 判斷這個頁面是不是活躍的, 如果不是刪除他。
4. 729 中的 PageTestandClearReferenced 用來檢測這個頁面是否還在使用(受到訪問), 如果還在使用, 還有引用就不去清理他。並將它的age 遞增。
5. 732 ~ 751 如果頁面不再受到訪問, 就遞減他的age 壽命。 當這個頁面的壽命達到 0 的時候,就表明這個頁面很久沒有受到訪問了。
然後, 再看他的空間映射, 如果頁面並不用作文件系統的讀寫緩沖, 只要頁面計數大於 1 就表明還有用戶空間映射。 如果頁面有文件系統的讀寫緩沖, 此時頁面計數應該與 2 做比較。通過將 page_active 記為 0, 表明頁面應該從活躍狀態轉為 非活躍狀態。
6. 752 ~ 763 如果頁面被標記成了page_active 就把他移動到隊列尾部, 否則, 根據oneshot 判斷是否退出。
==================== mm/vmscan.c 297 378 ==================== [kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()] 297 /* 298 * Select the task with maximal swap_cnt and try to swap out a page. 299 * N.B. This function returns only 0 or 1. Return values != 1 from 300 * the lower level routines result in continued processing. 301 */ 302 #define SWAP_SHIFT 5 303 #define SWAP_MIN 8 304 305 static int swap_out(unsigned int priority, int gfp_mask) 306 { 307 int counter; 308 int __ret = 0; 309 310 /* 311 * We make one or two passes through the task list, indexed by 312 * assign = {0, 1}: 313 * Pass 1: select the swappable task with maximal RSS that has 314 * not yet been swapped out. 315 * Pass 2: re-assign rss swap_cnt values, then select as above. 316 * 317 * With this approach, there's no need to remember the last task 318 * swapped out. If the swap-out fails, we clear swap_cnt so the 319 * task won't be selected again until all others have been tried. 320 * 321 * Think of swap_cnt as a "shadow rss" - it tells us which process 322 * we want to page out (always try largest first). 323 */ 324 counter = (nr_threads << SWAP_SHIFT) >> priority; 325 if (counter < 1) 326 counter = 1; 327 328 for (; counter >= 0; counter--) { 329 struct list_head *p; 330 unsigned long max_cnt = 0; 331 struct mm_struct *best = NULL; 332 int assign = 0; 333 int found_task = 0; 334 select: 335 spin_lock(&mmlist_lock); 336 p = init_mm.mmlist.next; 337 for (; p != &init_mm.mmlist; p = p->next) { 338 struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist); 339 if (mm->rss <= 0) 340 continue; 341 found_task++; 342 /* Refresh swap_cnt? */ 343 if (assign == 1) { 344 mm->swap_cnt = (mm->rss >> SWAP_SHIFT); 345 if (mm->swap_cnt < SWAP_MIN) 346 mm->swap_cnt = SWAP_MIN; 347 } 348 if (mm->swap_cnt > max_cnt) { 349 max_cnt = mm->swap_cnt; 350 best = mm; 351 } 352 } 353 354 /* Make sure it doesn't disappear */ 355 if (best) 356 atomic_inc(&best->mm_users); 357 spin_unlock(&mmlist_lock); 358 359 /* 360 * We have dropped the tasklist_lock, but we 361 * know that "mm" still exists: we are running 362 * with the big kernel lock, and exit_mm() 363 * cannot race with us. 364 */ 112 365 if (!best) { 366 if (!assign && found_task > 0) { 367 assign = 1; 368 goto select; 369 } 370 break; 371 } else { 372 __ret = swap_out_mm(best, gfp_mask); 373 mmput(best); 374 break; 375 } 376 } 377 return __ret; 378 }
1. 類似於refill_inactive_scan 循環的時候依賴於maxscan, 這個函數在循環的時候依賴於 counter, 而這個counter 與系統當前的線程的數量, 以及調用時候的優先級 priority 相關。
2. 這個函數呢, 並不會做物理意義上的換出操作, 只是為把一些頁面交換到交換設備上做准備而已。
3. 每個進程都有自身的虛存空間, 在這個空間中已經分配並且建立了映射的頁面構成了一個集合。而這個集合中對應的物理頁面在內存中的頁面的集合, 一般是這個集合的一個子集。我們稱為是“駐內存頁集合”, 其大小為rss。
4. rss表示一個進程占用的內存頁面的數量, *而swap_cnt 表示一個進程在一輪換出內存頁面的努力中尚未考察的數量。***ie, rss > swap_cnt
5. 335 ~ 352 遍歷除init 進程以外的所有進程, 來尋找一塊swap_cnt 數量最大的進程。如果第一輪掃描沒有找到這樣的進程, 就把mm->rss 拷貝到mm->swap_cnt 再查找一次, 最大的swap_cnt。
6. 354 ~ 357 為進程塊添加一個用戶, 避免進程塊消失
7. 從進程的角度而言, 對進程頁面的占用主要有兩個方面:
1. 由於頁面異常而產生頁面建立或者頁面映射恢復
2. 調用swap_out 切斷若干頁面的映射
8. 找到一個最大的進程塊之後, 就使用 swap_out_mm 完成頁面的具體的換出工作。然後用mmput 遞減他的用戶計數。
本質上也是一個兩層掃描, 找到最大 swap_cnt 的進程塊, 並將它換出。
==================== mm/vmscan.c 257 295 ==================== [kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()] 257 static int swap_out_mm(struct mm_struct * mm, int gfp_mask) 258 { 259 int result = 0; 260 unsigned long address; 261 struct vm_area_struct* vma; 262 263 /* 264 * Go through process' page directory. 265 */ 266 267 /* 268 * Find the proper vm-area after freezing the vma chain 269 * and ptes. 270 */ 271 spin_lock(&mm->page_table_lock); 272 address = mm->swap_address; 273 vma = find_vma(mm, address); 274 if (vma) { 275 if (address < vma->vm_start) 276 address = vma->vm_start; 277 278 for (;;) { 279 result = swap_out_vma(mm, vma, address, gfp_mask); 280 if (result) 281 goto out_unlock; 282 vma = vma->vm_next; 283 if (!vma) 284 break; 285 address = vma->vm_start; 286 } 287 } 288 /* Reset to 0 when we reach the end of address space */ 289 mm->swap_address = 0; 290 mm->swap_cnt = 0; 291 292 out_unlock: 293 spin_unlock(&mm->page_table_lock); 294 return result; 295 }
1. mm->swap_address 表征了在執行過程中需要接著考察的頁面的地址。
2. 先找到相應需要考察的虛存區域 vma。然後通過 swap_out_vma 試圖換出一個頁面, 如果不行, 繼續向後嘗試。
3. 這個過程中有個非常重要的調用: try_to_swap_out
==================== mm/vmscan.c 27 56 ==================== [kswapd()>do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()] 27 /* 28 * The swap-out functions return 1 if they successfully 29 * threw something out, and we got a free page. It returns 30 * zero if it couldn't do anything, and any other value 31 * indicates it decreased rss, but the page was shared. 32 * 33 * NOTE! If it sleeps, it *must* return 1 to make sure we 34 * don't continue with the swap-out. Otherwise we may be 35 * using a process that no longer actually exists (it might 36 * have died while we slept). 37 */ 38 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) 39 { 40 pte_t pte; 41 swp_entry_t entry; 42 struct page * page; 43 int onlist; 44 45 pte = *page_table; 46 if (!pte_present(pte)) 47 goto out_failed; 48 page = pte_page(pte); 49 if ((!VALID_PAGE(page)) || PageReserved(page)) 50 goto out_failed; 51 52 if (!mm->swap_cnt) 53 return 1; 54 55 mm->swap_cnt--; 56
1. 這裡的page_table 實際上指向一個頁面表項, 而不是一個頁面表。
2. 上述代碼主要是對page 做有效性驗證。
附加代碼:
==================== mm/vmscan.c 106 107 ==================== 106 out_failed: 107 return 0; ==================== include/asm-i386/page.h 118 118 ==================== 118 #define VALID_PAGE(page) ((page - mem_map) < max_mapnr)valid_page 主要根據頁面在數組中的下標不能超過 最大的物理內存頁面序號 max_mapnr 來判斷。
==================== mm/vmscan.c 57 74 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()] 57 onlist = PageActive(page); 58 /* Don't look at this pte if it's been accessed recently. */ 59 if (ptep_test_and_clear_young(page_table)) { 60 age_page_up(page); 61 goto out_failed; 62 } 63 if (!onlist) 64 /* The page is still mapped, so it can't be freeable... */ 65 age_page_down_ageonly(page); 66 67 /* 68 * If the page is in active use by us, or if the page 69 * is in active use by others, don't unmap it or 70 * (worse) start unneeded IO. 71 */ 72 if (page->age > 0) 73 goto out_failed; 74 ==================== include/linux/mm.h 230 230 ==================== 230 #define PageActive(page) test_bit(PG_active, &(page)->flags) ==================== include/asm-i386/pgtable.h 285 285 ==================== 285 static inline int ptep_test_and_clear_young(pte_t *ptep) { return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep); } ==================== mm/swap.c 125 138 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()>age_page_up()] 125 void age_page_up(struct page * page) 126 { 127 /* 128 * We're dealing with an inactive page, move the page 129 * to the active list. 130 */ 131 if (!page->age) 132 activate_page(page); 133 134 /* The actual page aging bit */ 135 page->age += PAGE_AGE_ADV; 136 if (page->age > PAGE_AGE_MAX) 137 page->age = PAGE_AGE_MAX; 138 } ==================== mm/swap.c 103 110 ==================== 103 /* 104 * We use this (minimal) function in the case where we 105 * know we can't deactivate the page (yet). 106 */ 107 void age_page_down_ageonly(struct page * page) 108 { 109 page->age /= 2; 110 }通過 ptep_test_and_clear_young 來測試一個頁面是否年輕。ie, 從上一次try_to_swap_out 掃描後, 如果又被訪問過了, 說明還是很年輕的。 另外, 這個操作之後 PAGE_ACCESSED 標志位會被置為 0. 當然對年輕的頁面一定是不會進行換出的。但是我們會增加他的壽命 age。 那如果頁面不在 活躍隊列中的話, 就將他的壽命 age 減半, 衰老。age 沒有降到 0 的話, 就不可以把他換出來。
==================== mm/vmscan.c 75 108 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd()>swap_o ut_pmd()>try_to_swap_out()] 75 if (TryLockPage(page)) 76 goto out_failed; 77 78 /* From this point on, the odds are that we're going to 79 * nuke this pte, so read and clear the pte. This hook 80 * is needed on CPUs which update the accessed and dirty 81 * bits in hardware. 82 */ 83 pte = ptep_get_and_clear(page_table); 84 flush_tlb_page(vma, address); 85 86 /* 87 * Is the page already in the swap cache? If so, then 88 * we can just drop our reference to it without doing 89 * any IO - it's already up-to-date on disk. 90 * 91 * Return 0, as we didn't actually free any real 92 * memory, and we should just continue our scan. 93 */ 94 if (PageSwapCache(page)) { 95 entry.val = page->index; 96 if (pte_dirty(pte)) 97 set_page_dirty(page); 98 set_swap_pte: 99 swap_duplicate(entry); 100 set_pte(page_table, swp_entry_to_pte(entry)); 101 drop_pte: 102 UnlockPage(page); 103 mm->rss--; 104 deactivate_page(page); 105 page_cache_release(page); 106 out_failed: 107 return 0; 108 } ==================== include/linux/mm.h 183 183 ==================== 183 #define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags) ==================== include/linux/mm.h 217 217 ==================== 217 #define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags)
1. 如果一個頁面的PG_locked 標志位 已經為 1 了, 表明他已經被別的進程鎖定了, 不能才對他處理了。
2. 如果頁面已經被緩存過了,ie, 頁面內容已經在 交換設備上了, 只要將映射斷開即可。PG_swap_cache 表示的就是page 結構在 swapper_space 隊列中。而這時候 的 index 字段就是一個32 bit 的索引項 swp_entry_t 指向頁面在交換設備上映像的指針。
3. 使用swap_duplicate 檢測索引項的內容, 並遞增相應盤上頁面的共享計數。
4. 使用 set_pte 將指向盤上頁面的索引項置入相應的頁面表項, 這樣原先對內存頁面的映射, 現在轉變成了對盤上頁面的映射。從而完成斷開物理頁面映射的操作。
==================== mm/swapfile.c 820 871 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()>swap_duplicate()] 119 820 /* 821 * Verify that a swap entry is valid and increment its swap map count. 822 * Kernel_lock is held, which guarantees existance of swap device. 823 * 824 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 825 * "permanent", but will be reclaimed by the next swapoff. 826 */ 827 int swap_duplicate(swp_entry_t entry) 828 { 829 struct swap_info_struct * p; 830 unsigned long offset, type; 831 int result = 0; 832 833 /* Swap entry 0 is illegal */ 834 if (!entry.val) 835 goto out; 836 type = SWP_TYPE(entry); 837 if (type >= nr_swapfiles) 838 goto bad_file; 839 p = type + swap_info; 840 offset = SWP_OFFSET(entry); 841 if (offset >= p->max) 842 goto bad_offset; 843 if (!p->swap_map[offset]) 844 goto bad_unused; 845 /* 846 * Entry is valid, so increment the map count. 847 */ 848 swap_device_lock(p); 849 if (p->swap_map[offset] < SWAP_MAP_MAX) 850 p->swap_map[offset]++; 851 else { 852 static int overflow = 0; 853 if (overflow++ < 5) 854 printk("VM: swap entry overflow\n"); 855 p->swap_map[offset] = SWAP_MAP_MAX; 856 } 857 swap_device_unlock(p); 858 result = 1; 859 out: 860 return result; 861 862 bad_file: 863 printk("Bad swap file entry %08lx\n", entry.val); 864 goto out; 865 bad_offset: 866 printk("Bad swap offset entry %08lx\n", entry.val); 867 goto out; 868 bad_unused: 120 869 printk("Unused swap offset entry in swap_dup %08lx\n", entry.val); 870 goto out; 871 }
1. 這裡的type 表征交換設備的序號
2. 833 ~ 844 檢驗entry 的合法性
3. 845 ~ 856 遞增設備文件上的共享計數
==================== mm/swap.c 189 194 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()>deactivate_page()] 189 void deactivate_page(struct page * page) 190 { 191 spin_lock(&pagemap_lru_lock); 192 deactivate_page_nolock(page); 193 spin_unlock(&pagemap_lru_lock); 194 } [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()>deactivate_page()>deactivate_page_nolock()] 154 /** 155 * (de)activate_page - move pages from/to active and inactive lists 156 * @page: the page we want to move 157 * @nolock - are we already holding the pagemap_lru_lock? 158 * 159 * Deactivate_page will move an active page to the right 160 * inactive list, while activate_page will move a page back 161 * from one of the inactive lists to the active list. If 162 * called on a page which is not on any of the lists, the 163 * page is left alone. 164 */ 165 void deactivate_page_nolock(struct page * page) 166 { 167 /* 168 * One for the cache, one for the extra reference the 169 * caller has and (maybe) one for the buffers. 170 * 171 * This isn't perfect, but works for just about everything. 172 * Besides, as long as we don't move unfreeable pages to the 173 * inactive_clean list it doesn't need to be perfect... 174 */ 175 int maxcount = (page->buffers ? 3 : 2); 176 page->age = 0; 177 ClearPageReferenced(page); 178 179 /* 180 * Don't touch it if it's not on the active list. 181 * (some pages aren't on any list at all) 182 */ 183 if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) { 184 del_page_from_active_list(page); 185 add_page_to_inactive_dirty_list(page); 186 } 187 } ==================== include/linux/swap.h 234 240 ==================== 234 #define del_page_from_active_list(page) { \ 122 235 list_del(&(page)->lru); \ 236 ClearPageActive(page); \ 237 nr_active_pages--; \ 238 DEBUG_ADD_PAGE \ 239 ZERO_PAGE_BUG \ 240 } ==================== include/linux/swap.h 217 224 ==================== 217 #define add_page_to_inactive_dirty_list(page) { \ 218 DEBUG_ADD_PAGE \ 219 ZERO_PAGE_BUG \ 220 SetPageInactiveDirty(page); \ 221 list_add(&(page)->lru, &inactive_dirty_list); \ 222 nr_inactive_dirty_pages++; \ 223 page->zone->inactive_dirty_pages++; \ 224 } ==================== include/linux/pagemap.h 34 34 ==================== 34 #define page_cache_release(x) __free_page(x) ==================== include/linux/mm.h 379 379 ==================== 379 #define __free_page(page) __free_pages((page), 0) ==================== mm/page_alloc.c 549 553 ==================== 549 void __free_pages(struct page *page, unsigned long order) 550 { 551 if (!PageReserved(page) && put_page_testzero(page)) 552 __free_pages_ok(page, order); 553 } ==================== include/linux/mm.h 152 152 ==================== 152 #define put_page_testzero(p) atomic_dec_and_test(&(p)->count)page 中有個值 count。
==================== mm/vmscan.c 110 157 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()] 110 /* 111 * Is it a clean page? Then it must be recoverable 112 * by just paging it in again, and we can just drop 113 * it.. 114 * 115 * However, this won't actually free any real 116 * memory, as the page will just be in the page cache 117 * somewhere, and as such we should just continue 118 * our scan. 119 * 120 * Basically, this just makes it possible for us to do 121 * some real work in the future in "refill_inactive()". 122 */ 123 flush_cache_page(vma, address); 124 if (!pte_dirty(pte)) 125 goto drop_pte; 126 127 /* 128 * Ok, it's really dirty. That means that 129 * we should either create a new swap cache 130 * entry for it, or we should write it back 131 * to its own backing store. 132 */ 133 if (page->mapping) { 134 set_page_dirty(page); 135 goto drop_pte; 136 } 137 138 /* 139 * This is a dirty, swappable page. First of all, 140 * get a suitable swap entry for it, and make sure 141 * we have the swap cache set up to associate the 142 * page with that swap entry. 143 */ 144 entry = get_swap_page(); 145 if (!entry.val) 146 goto out_unlock_restore; /* No swap space left */ 147 148 /* Add it to the swap cache and mark it dirty */ 149 add_to_swap_cache(page, entry); 150 set_page_dirty(page); 151 goto set_swap_pte; 152 153 out_unlock_restore: 154 set_pte(page_table, pte); 155 UnlockPage(page); 156 return 0; 157 } ==================== include/asm-i386/pgtable.h 269 269 ==================== 269 static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } ==================== include/linux/mm.h 187 191 ==================== [kswapd()>_do_try_to_free_pages()>refill_inactive()>swap_out()>swap_out_mm()>swap_out_vma()>swap_out_pgd() >swap_out_pmd()>try_to_swap_out()>set_page_dirty()] 187 static inline void set_page_dirty(struct page * page) 188 { 189 if (!test_and_set_bit(PG_dirty, &page->flags)) 190 __set_page_dirty(page); 191 } ==================== mm/filemap.c 134 147 ==================== 134 /* 135 * Add a page to the dirty page list. 136 */ 137 void __set_page_dirty(struct page *page) 138 { 139 struct address_space *mapping = page->mapping; 140 141 spin_lock(&pagecache_lock); 142 list_del(&page->list); 143 list_add(&page->list, &mapping->dirty_pages); 144 spin_unlock(&pagecache_lock); 145 146 mark_inode_dirty_pages(mapping->host); 147 } ==================== include/linux/swap.h 150 150 ==================== 150 #define get_swap_page() __get_swap_page(1)頁表項中有個標志位 _PAGE_DIRTY, 如果這個標記為 0, 表示這個頁面沒有被寫過, 對這樣的頁面, 如果長期沒有寫入, 我們可以解除他的映射。 如果是通過mmap 建立起來的文件映射, 則其page 結構中的指針 mapping 指向相應的address_space 數據結構。對這樣的頁面, 需要特殊處理, set_page_dirty 將頁面轉移到文件映射的 髒 頁面隊列中。 對於一個髒的, 不在swapper_space 隊列中的頁面,我們可以通過 get_swap_page 從交換設備上 分配一個頁面, 然後利用 add_to_swap_cache 將頁面鏈入到 swapper_space 隊列中, 以及活躍隊列中。 頁面老化的速度和 掃描的次數有關。