Buddy 内存管理机制(下)
作者簡介:偉林,中年碼農,從事過電信、手機、安全、芯片等行業,目前依舊從事Linux方向開發工作,個人愛好Linux相關知識分享。
內存釋放
內存分配
gfp_mask
node 候選策略
zone 候選策略
zone fallback 策略
lowmem reserve 機制
order fallback 策略
migrate type 候選策略
migrate fallback 策略
reclaim watermark
reclaim 方式
alloc_pages()
內存釋放
Buddy 系統中,相比較內存的分配,內存的釋放過程更簡單,我們先來解析這部分。
這裡體現了 Buddy 的核心思想:在內存釋放時,判斷其 buddy 兄弟 page 是不是 order 大小相等的 free page,如果是則合併成更高一階的 order。這樣做的目的是最大可能地減少內存碎片化。
內存釋放最後都會落到 __free_pages() 函數:
void?__free_pages(struct?page?*page,?unsigned?int?order) {/*?(1)?對page->_refcount減1后并判斷是否為0如果引用計數為0了,說明可以釋放page了*/if?(put_page_testzero(page))free_the_page(page,?order); }↓static?inline?void?free_the_page(struct?page?*page,?unsigned?int?order) {/*?(1)?單個?page?首先嘗試釋放到?pcp?*/if?(order?==?0)??/*?Via?pcp??*/free_unref_page(page);/*?(2)?大于?1?的?2^order?個?page,釋放到?order?free_area_?當中?*/else__free_pages_ok(page,?order); }↓static?void?__free_pages_ok(struct?page?*page,?unsigned?int?order) {unsigned?long?flags;int?migratetype;unsigned?long?pfn?=?page_to_pfn(page);/*?(2.1)?page釋放前的一些動作:清理一些成員做一些檢查執行一些回調函數*/if?(!free_pages_prepare(page,?order,?true))return;/*?(2.2)?獲取到page所在pageblock的migrate?type當前page會被釋放到對應order?free_area的對應?migrate?freelist鏈表當中*/migratetype?=?get_pfnblock_migratetype(page,?pfn);local_irq_save(flags);__count_vm_events(PGFREE,?1?<<?order);/*?(2.3)?向zone中釋放page?*/free_one_page(page_zone(page),?page,?pfn,?order,?migratetype);local_irq_restore(flags); }↓ free_one_page() ↓static?inline?void?__free_one_page(struct?page?*page,unsigned?long?pfn,struct?zone?*zone,?unsigned?int?order,int?migratetype) 
{unsigned?long?combined_pfn;unsigned?long?uninitialized_var(buddy_pfn);struct?page?*buddy;unsigned?int?max_order;max_order?=?min_t(unsigned?int,?MAX_ORDER,?pageblock_order?+?1);VM_BUG_ON(!zone_is_initialized(zone));VM_BUG_ON_PAGE(page->flags?&?PAGE_FLAGS_CHECK_AT_PREP,?page);VM_BUG_ON(migratetype?==?-1);if?(likely(!is_migrate_isolate(migratetype)))__mod_zone_freepage_state(zone,?1?<<?order,?migratetype);VM_BUG_ON_PAGE(pfn?&?((1?<<?order)?-?1),?page);VM_BUG_ON_PAGE(bad_range(zone,?page),?page);continue_merging:/*?(2.3.1)?嘗試對釋放的(2^order)長度的page進行逐級向上合并?*/while?(order?<?max_order?-?1)?{/*?(2.3.1.1)?得到當前釋放的(2^order)長度page對應的buddy伙伴page指針計算伙伴buddy使用和(1<<order)進行異或:(0<<order)pfn對應的伙伴page為(1<<order)pfn,(1<<order)pfn對應的伙伴page為(0<<order)pfn*/buddy_pfn?=?__find_buddy_pfn(pfn,?order);buddy?=?page?+?(buddy_pfn?-?pfn);if?(!pfn_valid_within(buddy_pfn))goto?done_merging;/*?(2.3.1.2)?判斷伙伴page的是否是buddy狀態:是否是free狀態在buddy系統中(page->_mapcount?==?PAGE_BUDDY_MAPCOUNT_VALUE)當前的free?order和要釋放的order相等(page->private?==?order)*/if?(!page_is_buddy(page,?buddy,?order))goto?done_merging;/**?Our?buddy?is?free?or?it?is?CONFIG_DEBUG_PAGEALLOC?guard?page,*?merge?with?it?and?move?up?one?order.*/if?(page_is_guard(buddy))?{clear_page_guard(zone,?buddy,?order,?migratetype);}?else?{/*?(2.3.1.3)?如果滿足合并的條件,則準備開始合并把伙伴page從原freelist中刪除*/list_del(&buddy->lru);zone->free_area[order].nr_free--;/*?清理page中保存的order信息:page->_mapcount?=?-1page->private?=?0*/rmv_page_order(buddy);}/*?(2.3.1.4)?組成了更高一級order的空閑內存?*/combined_pfn?=?buddy_pfn?&?pfn;page?=?page?+?(combined_pfn?-?pfn);pfn?=?combined_pfn;order++;}if?(max_order?<?MAX_ORDER)?{/*?If?we?are?here,?it?means?order?is?>=?pageblock_order.*?如果在這里,意味著order??>=?pageblock_order。*?We?want?to?prevent?merge?between?freepages?on?isolate*?pageblock?and?normal?pageblock.?Without?this,?pageblock*?isolation?could?cause?incorrect?freepage?or?CMA?accounting.*?我們要防止隔離頁面塊和正常頁面塊上的空閑頁面合并。?否則,頁面塊隔離可能導致不正確的空閑頁面或CMA計數。**?We?don't?want?to?hit?this?code?for?the?more?frequent*?low-order?merg
ing.*?我們不想命中此代碼進行頻繁的低階合并。*/if?(unlikely(has_isolate_pageblock(zone)))?{int?buddy_mt;buddy_pfn?=?__find_buddy_pfn(pfn,?order);buddy?=?page?+?(buddy_pfn?-?pfn);buddy_mt?=?get_pageblock_migratetype(buddy);if?(migratetype?!=?buddy_mt&&?(is_migrate_isolate(migratetype)?||is_migrate_isolate(buddy_mt)))goto?done_merging;}max_order++;goto?continue_merging;}/*?(2.3.2)?開始掛載合并成order的空閑內存?*/ done_merging:/*?(2.3.2.1)?page中保存order大小:page->_mapcount?=?PAGE_BUDDY_MAPCOUNT_VALUE(-128)page->private?=?order*/set_page_order(page,?order);/**?If?this?is?not?the?largest?possible?page,?check?if?the?buddy*?of?the?next-highest?order?is?free.?If?it?is,?it's?possible*?that?pages?are?being?freed?that?will?coalesce?soon.?In?case,*?that?is?happening,?add?the?free?page?to?the?tail?of?the?list*?so?it's?less?likely?to?be?used?soon?and?more?likely?to?be?merged*?as?a?higher?order?page*?如果這不是最大的頁面,請檢查倒數第二個order的伙伴是否空閑。?如果是這樣,則可能是頁面即將被釋放,即將合并。?萬一發生這種情況,請將空閑頁面添加到列表的末尾,這樣它就不太可能很快被使用,而更有可能被合并為高階頁面*//*?(2.3.2.2)?將空閑page加到對應order鏈表的尾部?*/if?((order?<?MAX_ORDER-2)?&&?pfn_valid_within(buddy_pfn))?{struct?page?*higher_page,?*higher_buddy;combined_pfn?=?buddy_pfn?&?pfn;higher_page?=?page?+?(combined_pfn?-?pfn);buddy_pfn?=?__find_buddy_pfn(combined_pfn,?order?+?1);higher_buddy?=?higher_page?+?(buddy_pfn?-?combined_pfn);if?(pfn_valid_within(buddy_pfn)?&&page_is_buddy(higher_page,?higher_buddy,?order?+?1))?{list_add_tail(&page->lru,&zone->free_area[order].free_list[migratetype]);goto?out;}}/*?(2.3.2.3)?將空閑page加到對應order鏈表的開始?*/list_add(&page->lru,?&zone->free_area[order].free_list[migratetype]); out:zone->free_area[order].nr_free++; }PageBuddy()用來判斷page是否在buddy系統中,還有很多類似的page操作函數都定義在page-flags.h當中:
linux-source-4.15.0\include\linux\page-flags.h:#define?PAGE_MAPCOUNT_OPS(uname,?lname)?????\ static?__always_inline?int?Page##uname(struct?page?*page)??\ {?????????\return?atomic_read(&page->_mapcount)?==????\PAGE_##lname##_MAPCOUNT_VALUE;??\ }?????????\ static?__always_inline?void?__SetPage##uname(struct?page?*page)??\ {?????????\VM_BUG_ON_PAGE(atomic_read(&page->_mapcount)?!=?-1,?page);?\atomic_set(&page->_mapcount,?PAGE_##lname##_MAPCOUNT_VALUE);?\ }?????????\ static?__always_inline?void?__ClearPage##uname(struct?page?*page)?\ {?????????\VM_BUG_ON_PAGE(!Page##uname(page),?page);???\atomic_set(&page->_mapcount,?-1);????\ }/**?PageBuddy()?indicate?that?the?page?is?free?and?in?the?buddy?system*?(see?mm/page_alloc.c).*/ #define?PAGE_BUDDY_MAPCOUNT_VALUE??(-128) PAGE_MAPCOUNT_OPS(Buddy,?BUDDY)對于單個page,會首先釋放到percpu緩存中:
start_kernel()?→?mm_init()?→?mem_init()?→?free_all_bootmem()?free_low_memory_core_early()?→?__free_memory_core()?→?__free_pages_memory()?→?__free_pages_bootmem()?→?__free_pages()?→?free_the_page()?→?free_unref_page():↓void?free_unref_page(struct?page?*page) {unsigned?long?flags;unsigned?long?pfn?=?page_to_pfn(page);/*?(1)?一些初始化準備工作page->index?=?migratetype;*/if?(!free_unref_page_prepare(page,?pfn))return;local_irq_save(flags);/*?(2)?釋放page到pcp中?*/free_unref_page_commit(page,?pfn);local_irq_restore(flags); }↓static?void?free_unref_page_commit(struct?page?*page,?unsigned?long?pfn) {struct?zone?*zone?=?page_zone(page);struct?per_cpu_pages?*pcp;int?migratetype;/*?(2.1)?migratetype?=?page->index?*/migratetype?=?get_pcppage_migratetype(page);__count_vm_event(PGFREE);/*?(2.2)?對于某些migratetype的特殊處理?*/if?(migratetype?>=?MIGRATE_PCPTYPES)?{/*?(2.2.1)?對于isolate類型,free到全局的freelist中?*/if?(unlikely(is_migrate_isolate(migratetype)))?{free_one_page(zone,?page,?pfn,?0,?migratetype);return;}migratetype?=?MIGRATE_MOVABLE;}/*?(2.3)?獲取到zone當前cpu?pcp的鏈表頭?*/pcp?=?&this_cpu_ptr(zone->pageset)->pcp;/*?(2.4)?將空閑的單page加入到pcp對應鏈表中?*/list_add(&page->lru,?&pcp->lists[migratetype]);pcp->count++;/*?(2.5)?如果pcp中的page數量過多(大于pcp->high),釋放pcp->batch個page到全局free?list當中去?*/if?(pcp->count?>=?pcp->high)?{unsigned?long?batch?=?READ_ONCE(pcp->batch);free_pcppages_bulk(zone,?batch,?pcp);pcp->count?-=?batch;} }pcp->high 和 pcp->batch 的賦值過程:
start_kernel()?→?setup_per_cpu_pageset()?→?setup_zone_pageset()?→?zone_pageset_init()?→?pageset_set_high_and_batch():|→static?int?zone_batchsize(struct?zone?*zone) {/*?batch?的大小?=?(zone_size?/?(1024*4))?*?(3/2)?*/batch?=?zone->managed_pages?/?1024;if?(batch?*?PAGE_SIZE?>?512?*?1024)batch?=?(512?*?1024)?/?PAGE_SIZE;batch?/=?4;??/*?We?effectively?*=?4?below?*/if?(batch?<?1)batch?=?1;batch?=?rounddown_pow_of_two(batch?+?batch/2)?-?1;return?batch; }|→static?void?pageset_set_batch(struct?per_cpu_pageset?*p,?unsigned?long?batch) {/*?high?=?6?*?batch?*/pageset_update(&p->pcp,?6?*?batch,?max(1UL,?1?*?batch)); }內存分配
相比較釋放,內存分配的策略要複雜得多,要考慮的因素也多很多,讓我們一一來解析。
gfp_mask
gfp_mask是GFP(Get Free Page)相關的一系列標志,控制了分配page的一系列行為。
node 候選策略
在 NUMA 的情況下,會有多個 memory node 可供選擇,系統會根據 policy 選擇當前分配的 node。
alloc_pages()?→?alloc_pages_current():struct?page?*alloc_pages_current(gfp_t?gfp,?unsigned?order) {/*?(1.1)?使用默認NUMA策略?*/struct?mempolicy?*pol?=?&default_policy;struct?page?*page;/*?(1.2)?獲取當前進程的NUMA策略?*/if?(!in_interrupt()?&&?!(gfp?&?__GFP_THISNODE))pol?=?get_task_policy(current);/**?No?reference?counting?needed?for?current->mempolicy*?nor?system?default_policy*/if?(pol->mode?==?MPOL_INTERLEAVE)page?=?alloc_page_interleave(gfp,?order,?interleave_nodes(pol));else/*?(2)?從NUMA策略指定的首選node和備選node組上,進行內存頁面的分配?*/page?=?__alloc_pages_nodemask(gfp,?order,policy_node(gfp,?pol,?numa_node_id()),policy_nodemask(gfp,?pol));return?page; }zone 候選策略
Buddy 系統中對每一個 node 定義了多個類型的 zone :
enum?zone_type?{ZONE_DMA,ZONE_DMA32,ZONE_NORMAL,ZONE_HIGHMEM,ZONE_MOVABLE,ZONE_DEVICE,__MAX_NR_ZONES };gfp_mask 中也定義了一系列選擇 zone 的flag:
/**?Physical?address?zone?modifiers?(see?linux/mmzone.h?-?low?four?bits)*/ #define?__GFP_DMA?((__force?gfp_t)___GFP_DMA) #define?__GFP_HIGHMEM?((__force?gfp_t)___GFP_HIGHMEM) #define?__GFP_DMA32?((__force?gfp_t)___GFP_DMA32) #define?__GFP_MOVABLE?((__force?gfp_t)___GFP_MOVABLE)??/*?ZONE_MOVABLE?allowed?*/ #define?GFP_ZONEMASK?(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)怎麼樣根據 gfp_mask 中的 zone modifiers 來選擇分配所使用的 zone 呢?系統設計了一套算法來進行轉換:
具體的代碼如下:
alloc_pages()?→?alloc_pages_current()?→?__alloc_pages_nodemask()?→?prepare_alloc_pages()?→?gfp_zone():static?inline?enum?zone_type?gfp_zone(gfp_t?flags) {enum?zone_type?z;/*?(1)?gfp?標志中低4位為?zone?modifiers?*/int?bit?=?(__force?int)?(flags?&?GFP_ZONEMASK);/*?(2)?查表得到最后的候選zone內核規定?___GFP_DMA,___GFP_HIGHMEM?和?___GFP_DMA32?其兩個或全部不能同時存在于?gfp?標志中*/z?=?(GFP_ZONE_TABLE?>>?(bit?*?GFP_ZONES_SHIFT))?&((1?<<?GFP_ZONES_SHIFT)?-?1);VM_BUG_ON((GFP_ZONE_BAD?>>?bit)?&?1);return?z; }#define?GFP_ZONE_TABLE?(?\(ZONE_NORMAL?<<?0?*?GFP_ZONES_SHIFT)???????????\|?(OPT_ZONE_DMA?<<?___GFP_DMA?*?GFP_ZONES_SHIFT)?????????\|?(OPT_ZONE_HIGHMEM?<<?___GFP_HIGHMEM?*?GFP_ZONES_SHIFT)????????\|?(OPT_ZONE_DMA32?<<?___GFP_DMA32?*?GFP_ZONES_SHIFT)?????????\|?(ZONE_NORMAL?<<?___GFP_MOVABLE?*?GFP_ZONES_SHIFT)?????????\|?(OPT_ZONE_DMA?<<?(___GFP_MOVABLE?|?___GFP_DMA)?*?GFP_ZONES_SHIFT)????\|?(ZONE_MOVABLE?<<?(___GFP_MOVABLE?|?___GFP_HIGHMEM)?*?GFP_ZONES_SHIFT)\|?(OPT_ZONE_DMA32?<<?(___GFP_MOVABLE?|?___GFP_DMA32)?*?GFP_ZONES_SHIFT)\ )#define?GFP_ZONE_BAD?(?\1?<<?(___GFP_DMA?|?___GFP_HIGHMEM)??????????\|?1?<<?(___GFP_DMA?|?___GFP_DMA32)??????????\|?1?<<?(___GFP_DMA32?|?___GFP_HIGHMEM)??????????\|?1?<<?(___GFP_DMA?|?___GFP_DMA32?|?___GFP_HIGHMEM)????????\|?1?<<?(___GFP_MOVABLE?|?___GFP_HIGHMEM?|?___GFP_DMA)????????\|?1?<<?(___GFP_MOVABLE?|?___GFP_DMA32?|?___GFP_DMA)????????\|?1?<<?(___GFP_MOVABLE?|?___GFP_DMA32?|?___GFP_HIGHMEM)????????\|?1?<<?(___GFP_MOVABLE?|?___GFP_DMA32?|?___GFP_DMA?|?___GFP_HIGHMEM)??\ )zone fallback 策略
通過上述的候選策略,我們選定了內存分配的 node 和 zone,然後開始分配。如果分配失敗,我們並不會馬上啟動內存回收,而是通過 fallback 機制嘗試從其他低級的 zone 中看看能不能借用一些內存。
fallback 的借用,只能從高級 zone 向低級 zone 借用,而不能從低級向高級借用。比如:原本想分配 Normal zone 的內存,失敗的情況下可以嘗試從 DMA32 zone 中分配內存,因為能用 Normal zone 地址範圍內存的使用者,肯定也可以用 DMA32 zone 地址範圍的內存。但是反過來就不行:原本需要 DMA32 zone 地址範圍的內存,你給它一個 Normal zone 的內存,地址超過了 4G,可能就超過了 DMA 設備的尋址能力。
系統還定義了一個 __GFP_THISNODE 標志,用來限制 fallback 時只能在本 node 上尋找合適的低級 zone。否則會在所有 node 上尋找合適的低級 zone。
該算法的具體實現如下:
1、每個 node 定義了 fallback 時用到的候選 zone 鏈表:
系統啟動時初始化這些鏈表:
start_kernel()?→?build_all_zonelists()?→?__build_all_zonelists()?→?build_zonelists()?→?build_zonelists_in_node_order()/build_thisnode_zonelists()?→?build_zonerefs_node():2、內存分配時確定使用的 fallback 鏈表:
3、從原有zone分配失敗時,嘗試從 fallback zone 中分配內存:
lowmem reserve 機制
承接上述的 fallback 機制,高等級的 zone 可以借用低等級 zone 的內存。但是從理論上說,低等級的內存更加寶貴,因為它的空間更小;如果被高等級的 zone 侵佔完了,那麼用戶需要低等級內存的時候就會分配失敗。
為了解決這個問題,系統給每個 zone 能夠給其他高等級 zone 借用的內存設置了一個預留值,可以借用內存但是本zone保留的內存不能小于這個值。
我們可以通過命令來查看每個 zone 的 lowmem reserve 大小設置,protection 字段描述了本zone給其他zone借用時必須保留的內存:
pwl@ubuntu:~$?cat?/proc/zoneinfo Node?0,?zone??????DMApages?free?????3968min??????67low??????83high?????99spanned??4095present??3997managed??3976//?本?zone?為?DMA?//?給?DMA?zone?借用時必須保留?0?pages//?給?DMA32?zone?借用時必須保留?2934?pages//?給?Normal/Movable/Device?zone?借用時必須保留?3859?pagesprotection:?(0,?2934,?3859,?3859,?3859)?Node?0,?zone????DMA32pages?free?????418978min??????12793low??????15991high?????19189spanned??1044480present??782288managed??759701//?本?zone?為?DMA32?//?給?DMA/DMA32?zone?借用時必須保留?0?pages//?給?Normal/Movable/Device?zone?借用時必須保留?925?pagesprotection:?(0,?0,?925,?925,?925)nr_free_pages?418978Node?0,?zone???Normalpages?free?????4999min??????4034low??????5042high?????6050spanned??262144present??262144managed??236890//?本?zone?為?Normal?//?因為?Movable/Device?zone?大小為0,所以給所有?zone?借用時必須保留?0?pagesprotection:?(0,?0,?0,?0,?0)Node?0,?zone??Movablepages?free?????0min??????0low??????0high?????0spanned??0present??0managed??0protection:?(0,?0,?0,?0,?0) Node?0,?zone???Devicepages?free?????0min??????0low??????0high?????0spanned??0present??0managed??0protection:?(0,?0,?0,?0,?0)可以通過lowmem_reserve_ratio來調節這個值的大小:
pwl@ubuntu:~$?cat?/proc/sys/vm/lowmem_reserve_ratio 256?????256?????32??????0???????0order fallback 策略
Buddy 系統中對每一個 zone 又細分了多個 order 的 free_area:
#ifndef?CONFIG_FORCE_MAX_ZONEORDER #define?MAX_ORDER?11 #else #define?MAX_ORDER?CONFIG_FORCE_MAX_ZONEORDER #endif如果在對應 order 的 free_area 中找不到 free 內存的話,會逐個往高級別 order 的 free_area 中查找,直至最高階(MAX_ORDER - 1)。
在高級別 order 的 freelist 中找到空閑塊後,會將其逐級分割成低級別 order 的塊:一部分用於本次分配,剩餘部分重新掛載到對應低級別 order 的 freelist 中。
migrate type 候選策略
Buddy 系統中對每一個 zone 中的每一個 order free_area 又細分了多個 migrate type :
enum?migratetype?{MIGRATE_UNMOVABLE,MIGRATE_MOVABLE,MIGRATE_RECLAIMABLE,MIGRATE_PCPTYPES,?/*?the?number?of?types?on?the?pcp?lists?*/MIGRATE_HIGHATOMIC?=?MIGRATE_PCPTYPES,MIGRATE_CMA,MIGRATE_ISOLATE,?/*?can't?allocate?from?here?*/MIGRATE_TYPES };gfp_mask 中也定義了一系列選擇 migrate type 的flag:
#define?__GFP_MOVABLE?((__force?gfp_t)___GFP_MOVABLE)??/*?ZONE_MOVABLE?allowed?*/ #define?__GFP_RECLAIMABLE?((__force?gfp_t)___GFP_RECLAIMABLE) #define?GFP_MOVABLE_MASK?(__GFP_RECLAIMABLE|__GFP_MOVABLE)根據 gfp_mask 轉換成 migrate type 的代碼如下:
alloc_pages()?→?alloc_pages_current()?→?__alloc_pages_nodemask()?→?prepare_alloc_pages()?→?gfpflags_to_migratetype():static?inline?int?gfpflags_to_migratetype(const?gfp_t?gfp_flags) {VM_WARN_ON((gfp_flags?&?GFP_MOVABLE_MASK)?==?GFP_MOVABLE_MASK);BUILD_BUG_ON((1UL?<<?GFP_MOVABLE_SHIFT)?!=?___GFP_MOVABLE);BUILD_BUG_ON((___GFP_MOVABLE?>>?GFP_MOVABLE_SHIFT)?!=?MIGRATE_MOVABLE);if?(unlikely(page_group_by_mobility_disabled))return?MIGRATE_UNMOVABLE;/*?Group?based?on?mobility?*//*?(1)?轉換的結果僅為3種類型:MIGRATE_UNMOVABLE/MIGRATE_MOVABLE/MIGRATE_RECLAIMABLE?*/?return?(gfp_flags?&?GFP_MOVABLE_MASK)?>>?GFP_MOVABLE_SHIFT; }migrate fallback 策略
在指定 migrate type 的 order 和大于 order 的 free list 分配失敗時,可以從同一 zone 的其他 migrate type freelist 中偷取內存。
static?int?fallbacks[MIGRATE_TYPES][4]?=?{[MIGRATE_UNMOVABLE]???=?{?MIGRATE_RECLAIMABLE,?MIGRATE_MOVABLE,???MIGRATE_TYPES?},[MIGRATE_RECLAIMABLE]?=?{?MIGRATE_UNMOVABLE,???MIGRATE_MOVABLE,???MIGRATE_TYPES?},[MIGRATE_MOVABLE]?????=?{?MIGRATE_RECLAIMABLE,?MIGRATE_UNMOVABLE,?MIGRATE_TYPES?}, #ifdef?CONFIG_CMA[MIGRATE_CMA]?????????=?{?MIGRATE_TYPES?},?/*?Never?used?*/ #endif #ifdef?CONFIG_MEMORY_ISOLATION[MIGRATE_ISOLATE]?????=?{?MIGRATE_TYPES?},?/*?Never?used?*/ #endif };fallbacks[] 數組定義了當前 migrate type 可以從哪些其他 migrate type 偷取空閑內存,基本就是 MIGRATE_UNMOVABLE、MIGRATE_RECLAIMABLE、MIGRATE_MOVABLE 可以相互偷取。
具體的代碼如下:
alloc_pages()?→?alloc_pages_current()?→?__alloc_pages_nodemask()?→?get_page_from_freelist()?→?rmqueue()?→?__rmqueue()?→?__rmqueue_fallback():reclaim watermark
分配時如果 freelist 中現有的內存不能滿足需求,則會啟動內存回收。系統對每個 zone 定義了三種內存水位 high/low/min,針對不同的水位採取不同的回收策略:
pwl@ubuntu:~$?cat?/proc/zoneinfo Node?0,?zone??????DMApages?free?????3968min??????67low??????83high?????99具體三種水位的回收策略如下:
reclaim 方式
系統設計了幾種回收內存的手段:
alloc_pages()
Buddy 內存分配的核心代碼實現。
alloc_pages()?→?alloc_pages_current()?→?__alloc_pages_nodemask():struct?page?* __alloc_pages_nodemask(gfp_t?gfp_mask,?unsigned?int?order,?int?preferred_nid,nodemask_t?*nodemask) {struct?page?*page;/*?(1.1)?默認的允許水位為low?*/unsigned?int?alloc_flags?=?ALLOC_WMARK_LOW;gfp_t?alloc_mask;?/*?The?gfp_t?that?was?actually?used?for?allocation?*/struct?alloc_context?ac?=?{?};/**?There?are?several?places?where?we?assume?that?the?order?value?is?sane*?so?bail?out?early?if?the?request?is?out?of?bound.*//*?(1.2)?order長度的合法性判斷?*/if?(unlikely(order?>=?MAX_ORDER))?{WARN_ON_ONCE(!(gfp_mask?&?__GFP_NOWARN));return?NULL;}/*?(1.3)?gfp_mask的過濾?*/gfp_mask?&=?gfp_allowed_mask;alloc_mask?=?gfp_mask;/*?(1.4)?根據gfp_mask,決定的high_zoneidx、候選zone?list、migrate?type?*/if?(!prepare_alloc_pages(gfp_mask,?order,?preferred_nid,?nodemask,?&ac,?&alloc_mask,?&alloc_flags))return?NULL;/*?(1.5)?挑選第一個合適的zone?*/finalise_ac(gfp_mask,?order,?&ac);/*?First?allocation?attempt?*//*?(2)?第1次分配:使用low水位嘗試直接從free?list分配page?*/page?=?get_page_from_freelist(alloc_mask,?order,?alloc_flags,?&ac);if?(likely(page))goto?out;/**?Apply?scoped?allocation?constraints.?This?is?mainly?about?GFP_NOFS*?resp.?GFP_NOIO?which?has?to?be?inherited?for?all?allocation?requests*?from?a?particular?context?which?has?been?marked?by*?memalloc_no{fs,io}_{save,restore}.*//*?(3.1)?如果使用?memalloc_no{fs,io}_{save,restore}?設置了?NOFS和NOIO從?current->flags?解析出相應的值,用來清除?gfp_mask?中相應的?__GFP_FS?和?__GFP_IO?標志*/alloc_mask?=?current_gfp_context(gfp_mask);ac.spread_dirty_pages?=?false;/**?Restore?the?original?nodemask?if?it?was?potentially?replaced?with*?&cpuset_current_mems_allowed?to?optimize?the?fast-path?attempt.*//*?(3.2)?恢復原有的nodemask?*/if?(unlikely(ac.nodemask?!=?nodemask))ac.nodemask?=?nodemask;/*?(4)?慢速分配路徑:使用min水位,以及各種手段進行內存回收后,再嘗試分配內存?*/page?=?__alloc_pages_slowpath(alloc_mask,?order,?&ac);out:if?(memcg_kmem_enabled()?&&?(gfp_mask?&?__GFP_ACCOUNT)?&&?page?&&unlikely(memcg_kmem_charge(page,?gfp_mask,?order)?!=?0))?{__free_pages(page,?order);page?=?NULL;}trac
e_mm_page_alloc(page,?order,?alloc_mask,?ac.migratetype);return?page; }|→static?inline?bool?prepare_alloc_pages(gfp_t?gfp_mask,?unsigned?int?order,int?preferred_nid,?nodemask_t?*nodemask,struct?alloc_context?*ac,?gfp_t?*alloc_mask,unsigned?int?*alloc_flags) {/*?(1.4.1)?根據gfp_mask,獲取到可能的最高優先級的zone?*/ac->high_zoneidx?=?gfp_zone(gfp_mask);/*?(1.4.2)?根據gfp_mask,獲取到可能候選node的所有zone鏈表?*/ac->zonelist?=?node_zonelist(preferred_nid,?gfp_mask);ac->nodemask?=?nodemask;/*?(1.4.3)?根據gfp_mask,獲取到migrate?typeMIGRATE_UNMOVABLE/MIGRATE_MOVABLE/MIGRATE_RECLAIMABLE*/ac->migratetype?=?gfpflags_to_migratetype(gfp_mask);/*?(1.4.4)?如果cpuset?cgroup使能,設置相應標志位?*/if?(cpusets_enabled())?{*alloc_mask?|=?__GFP_HARDWALL;if?(!ac->nodemask)ac->nodemask?=?&cpuset_current_mems_allowed;else*alloc_flags?|=?ALLOC_CPUSET;}/*?(1.4.5)?如果指定了__GFP_FS,則嘗試獲取fs鎖?*/fs_reclaim_acquire(gfp_mask);fs_reclaim_release(gfp_mask);/*?(1.4.6)?如果指定了__GFP_DIRECT_RECLAIM,判斷當前是否是非原子上下文可以睡眠?*/might_sleep_if(gfp_mask?&?__GFP_DIRECT_RECLAIM);if?(should_fail_alloc_page(gfp_mask,?order))return?false;/*?(1.4.7)?讓MIGRATE_MOVABLE可以使用MIGRATE_CMA區域?*/if?(IS_ENABLED(CONFIG_CMA)?&&?ac->migratetype?==?MIGRATE_MOVABLE)*alloc_flags?|=?ALLOC_CMA;return?true; }get_page_from_freelist()
第一次的快速內存分配,和後續的慢速內存分配,最後都是調用 get_page_from_freelist() 從 freelist 中獲取內存。
static?struct?page?* get_page_from_freelist(gfp_t?gfp_mask,?unsigned?int?order,?int?alloc_flags,const?struct?alloc_context?*ac) {struct?zoneref?*z?=?ac->preferred_zoneref;struct?zone?*zone;struct?pglist_data?*last_pgdat_dirty_limit?=?NULL;/*?(2.5.1)?輪詢?fallback?zonelist鏈表,在符合條件(idx<=high_zoneidx)的zone中嘗試分配內存?*/for_next_zone_zonelist_nodemask(zone,?z,?ac->zonelist,?ac->high_zoneidx,ac->nodemask)?{struct?page?*page;unsigned?long?mark;if?(cpusets_enabled()?&&(alloc_flags?&?ALLOC_CPUSET)?&&!__cpuset_zone_allowed(zone,?gfp_mask))continue;/*?(2.5.2)?如果__GFP_WRITE指示了分配頁的用途是dirty,平均分布臟頁查詢node上分配的臟頁是否超過限制,超過則換node*/if?(ac->spread_dirty_pages)?{if?(last_pgdat_dirty_limit?==?zone->zone_pgdat)continue;if?(!node_dirty_ok(zone->zone_pgdat))?{last_pgdat_dirty_limit?=?zone->zone_pgdat;continue;}}/*?(2.5.3)?獲取當前分配能超越的水位線?*/mark?=?zone->watermark[alloc_flags?&?ALLOC_WMARK_MASK];/*?(2.5.4)?判斷當前zone中的free?page是否滿足條件:1、total?free?page?>=?(2^order)?+?watermark?+?lowmem_reserve2、是否有符合要求的長度為(2^order)的連續內存*/if?(!zone_watermark_fast(zone,?order,?mark,ac_classzone_idx(ac),?alloc_flags))?{int?ret;/*?(2.5.5)?如果沒有足夠的free內存,則進行下列的判斷?*//*?Checked?here?to?keep?the?fast?path?fast?*/BUILD_BUG_ON(ALLOC_NO_WATERMARKS?<?NR_WMARK);/*?(2.5.6)?如果可以忽略水位線,則直接進行分配嘗試?*/if?(alloc_flags?&?ALLOC_NO_WATERMARKS)goto?try_this_zone;if?(node_reclaim_mode?==?0?||!zone_allows_reclaim(ac->preferred_zoneref->zone,?zone))continue;/*?(2.5.7)?快速內存回收嘗試回收(2^order)個page快速回收不能進行unmap,writeback操作,回收priority為4,即最多嘗試調用shrink_node進行回收的次數為priority值在__node_reclaim()中使用以下?scan_control?參數來調用shrink_node(),struct?scan_control?sc?=?{.nr_to_reclaim?=?max(nr_pages,?SWAP_CLUSTER_MAX),.gfp_mask?=?current_gfp_context(gfp_mask),.order?=?order,.priority?=?NODE_RECLAIM_PRIORITY,.may_writepage?=?!!(node_reclaim_mode?&?RECLAIM_WRITE),?//?默認為0.may_unmap?=?!!(node_reclaim_mode?&?RECLAIM_UNMAP),??//?默認為0.may_swap?=?1,.reclaim_idx?=?gfp_zone(gfp_mask),};*/ret?=?node_reclaim(zone->zone_pgdat,?gfp_mask,?order);switch?(ret)?{case?NODE_RECLAIM_NOSCAN:/*?did
?not?scan?*/continue;case?NODE_RECLAIM_FULL:/*?scanned?but?unreclaimable?*/continue;default:/*?did?we?reclaim?enough?*//*?(2.5.8)?如果回收成功,重新判斷空閑內存是否已經足夠?*/if?(zone_watermark_ok(zone,?order,?mark,ac_classzone_idx(ac),?alloc_flags))goto?try_this_zone;continue;}}try_this_zone:/*?(2.5.9)?滿足條件,嘗試實際的從free?list中摘取(2^order)個page?*/page?=?rmqueue(ac->preferred_zoneref->zone,?zone,?order,gfp_mask,?alloc_flags,?ac->migratetype);if?(page)?{/*?(2.5.10)?分配到內存后,對?struct?page?的一些處理?*/prep_new_page(page,?order,?gfp_mask,?alloc_flags);if?(unlikely(order?&&?(alloc_flags?&?ALLOC_HARDER)))reserve_highatomic_pageblock(page,?zone,?order);return?page;}}return?NULL; }||→static?inline?bool?zone_watermark_fast(struct?zone?*z,?unsigned?int?order,unsigned?long?mark,?int?classzone_idx,?unsigned?int?alloc_flags) {/*?(2.5.4.1)?獲取當前zone中free?page的數量??*/long?free_pages?=?zone_page_state(z,?NR_FREE_PAGES);long?cma_pages?=?0;#ifdef?CONFIG_CMA/*?If?allocation?can't?use?CMA?areas?don't?use?free?CMA?pages?*/if?(!(alloc_flags?&?ALLOC_CMA))cma_pages?=?zone_page_state(z,?NR_FREE_CMA_PAGES); #endif/*?(2.5.4.2)?對order=0的長度,進行快速檢測free內存是否夠用?*/if?(!order?&&?(free_pages?-?cma_pages)?>?mark?+?z->lowmem_reserve[classzone_idx])return?true;/*?(2.5.4.3)?慢速檢測free內存是否夠用?*/return?__zone_watermark_ok(z,?order,?mark,?classzone_idx,?alloc_flags,free_pages); }|||→bool?__zone_watermark_ok(struct?zone?*z,?unsigned?int?order,?unsigned?long?mark,int?classzone_idx,?unsigned?int?alloc_flags,long?free_pages) 
{long?min?=?mark;int?o;const?bool?alloc_harder?=?(alloc_flags?&?(ALLOC_HARDER|ALLOC_OOM));/*?free_pages?may?go?negative?-?that's?OK?*//*?(2.5.4.3.1)?首先用free?page總數減去需要的order長度,判斷剩下的長度是不是還超過水位線?*/free_pages?-=?(1?<<?order)?-?1;/*?(2.5.4.3.2)?如果是優先級高,水位線可以減半??*/if?(alloc_flags?&?ALLOC_HIGH)min?-=?min?/?2;/**?If?the?caller?does?not?have?rights?to?ALLOC_HARDER?then?subtract*?the?high-atomic?reserves.?This?will?over-estimate?the?size?of?the*?atomic?reserve?but?it?avoids?a?search.*//*?(2.5.4.3.3)?非harder類的分配,free內存還需預留nr_reserved_highatomic的內存?*/if?(likely(!alloc_harder))?{free_pages?-=?z->nr_reserved_highatomic;/*?(2.5.4.3.4)?harder類的分配,非常緊急了,水位線還可以繼續減半縮小?*/}?else?{/**?OOM?victims?can?try?even?harder?than?normal?ALLOC_HARDER*?users?on?the?grounds?that?it's?definitely?going?to?be?in*?the?exit?path?shortly?and?free?memory.?Any?allocation?it*?makes?during?the?free?path?will?be?small?and?short-lived.*/if?(alloc_flags?&?ALLOC_OOM)min?-=?min?/?2;elsemin?-=?min?/?4;}#ifdef?CONFIG_CMA/*?If?allocation?can't?use?CMA?areas?don't?use?free?CMA?pages?*//*?(2.5.4.3.5)?非CMA的分配,free內存還需預留CMA內存?*/if?(!(alloc_flags?&?ALLOC_CMA))free_pages?-=?zone_page_state(z,?NR_FREE_CMA_PAGES); 
#endif/**?Check?watermarks?for?an?order-0?allocation?request.?If?these*?are?not?met,?then?a?high-order?request?also?cannot?go?ahead*?even?if?a?suitable?page?happened?to?be?free.*//*?(2.5.4.3.6)?free內存還要預留(水位內存+lowmem_reserve[classzone_idx])如果減去上述所有的預留內存內存后,還大于請求的order長度,說明當前zone中的free內存總長度滿足請求分配的order但是有沒有符合要求的長度為(2^order)的連續內存,還要進一步查找判斷*/if?(free_pages?<=?min?+?z->lowmem_reserve[classzone_idx])return?false;/*?If?this?is?an?order-0?request?then?the?watermark?is?fine?*//*?(2.5.4.3.7)?如果order為0,不用進一步判斷了,總長度滿足,肯定能找到合適長度的page?*/if?(!order)return?true;/*?For?a?high-order?request,?check?at?least?one?suitable?page?is?free?*//*?(2.5.4.3.8)?逐個查詢當前zone中大于請求order的鏈表?*/for?(o?=?order;?o?<?MAX_ORDER;?o++)?{struct?free_area?*area?=?&z->free_area[o];int?mt;if?(!area->nr_free)continue;/*?(2.5.4.3.9)?逐個查詢當前order中的每個migrate?type鏈表,如果不為空則返回成功?*/for?(mt?=?0;?mt?<?MIGRATE_PCPTYPES;?mt++)?{if?(!list_empty(&area->free_list[mt]))return?true;}#ifdef?CONFIG_CMAif?((alloc_flags?&?ALLOC_CMA)?&&!list_empty(&area->free_list[MIGRATE_CMA]))?{return?true;} #endifif?(alloc_harder?&&!list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))return?true;}return?false; }rmqueue()
找到有足夠 free 內存的合適 zone 以後,rmqueue() 負責從 freelist 中摘取 page。
rmqueue()?→?__rmqueue():static?__always_inline?struct?page?* __rmqueue(struct?zone?*zone,?unsigned?int?order,?int?migratetype) {struct?page?*page;retry:/*?(1)?從原始指定的?migrate?freeist?中分配內存?*/page?=?__rmqueue_smallest(zone,?order,?migratetype);if?(unlikely(!page))?{if?(migratetype?==?MIGRATE_MOVABLE)page?=?__rmqueue_cma_fallback(zone,?order);/*?(2)?如果上一步分配失敗,嘗試從其他?migrate?list?中偷取內存來分配?*/if?(!page?&&?__rmqueue_fallback(zone,?order,?migratetype))goto?retry;}trace_mm_page_alloc_zone_locked(page,?order,?migratetype);return?page; }↓static?__always_inline struct?page?*__rmqueue_smallest(struct?zone?*zone,?unsigned?int?order,int?migratetype) {unsigned?int?current_order;struct?free_area?*area;struct?page?*page;/*?Find?a?page?of?the?appropriate?size?in?the?preferred?list?*//*?(1.1)?逐個查詢?>=?order?的?freaa_area?中?migratetype?的freelist,看看是否有free內存?*/for?(current_order?=?order;?current_order?<?MAX_ORDER;?++current_order)?{area?=?&(zone->free_area[current_order]);page?=?list_first_entry_or_null(&area->free_list[migratetype],struct?page,?lru);if?(!page)continue;/*?(1.1.1)?從?freelist?中摘取內存?*/list_del(&page->lru);/*?清理page中保存的order信息:page->_mapcount?=?-1page->private?=?0*/rmv_page_order(page);area->nr_free--;/*?(1.1.2)?把剩余內存重新掛載到低階?order?的freelist中?*/expand(zone,?page,?order,?current_order,?area,?migratetype);set_pcppage_migratetype(page,?migratetype);return?page;}return?NULL; }__alloc_pages_slowpath()
static?inline?struct?page?* __alloc_pages_slowpath(gfp_t?gfp_mask,?unsigned?int?order,struct?alloc_context?*ac) {bool?can_direct_reclaim?=?gfp_mask?&?__GFP_DIRECT_RECLAIM;const?bool?costly_order?=?order?>?PAGE_ALLOC_COSTLY_ORDER;struct?page?*page?=?NULL;unsigned?int?alloc_flags;unsigned?long?did_some_progress;enum?compact_priority?compact_priority;enum?compact_result?compact_result;int?compaction_retries;int?no_progress_loops;unsigned?int?cpuset_mems_cookie;int?reserve_flags;/**?We?also?sanity?check?to?catch?abuse?of?atomic?reserves?being?used?by*?callers?that?are?not?in?atomic?context.*/if?(WARN_ON_ONCE((gfp_mask?&?(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))?==(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))gfp_mask?&=?~__GFP_ATOMIC;retry_cpuset:compaction_retries?=?0;no_progress_loops?=?0;compact_priority?=?DEF_COMPACT_PRIORITY;cpuset_mems_cookie?=?read_mems_allowed_begin();/**?The?fast?path?uses?conservative?alloc_flags?to?succeed?only?until*?kswapd?needs?to?be?woken?up,?and?to?avoid?the?cost?of?setting?up*?alloc_flags?precisely.?So?we?do?that?now.*//*?(1)?設置各種標志:ALLOC_WMARK_MIN,水位降低到?minALLOC_HARDER,如果是?atomic?或者?rt_task,進一步降低水位*/alloc_flags?=?gfp_to_alloc_flags(gfp_mask);/**?We?need?to?recalculate?the?starting?point?for?the?zonelist?iterator*?because?we?might?have?used?different?nodemask?in?the?fast?path,?or*?there?was?a?cpuset?modification?and?we?are?retrying?-?otherwise?we*?could?end?up?iterating?over?non-eligible?zones?endlessly.*//*?(2)?重新安排?fallback?zone?list?*/ac->preferred_zoneref?=?first_zones_zonelist(ac->zonelist,ac->high_zoneidx,?ac->nodemask);if?(!ac->preferred_zoneref->zone)goto?nopage;/*?(3)?進入慢速路徑,說明在?low?水位下已經分配失敗了,所以先喚醒?kswapd?異步回收線程*/if?(gfp_mask?&?__GFP_KSWAPD_RECLAIM)wake_all_kswapds(order,?ac);/**?The?adjusted?alloc_flags?might?result?in?immediate?success,?so?try*?that?first*//*?(4)?第2次分配:使用min水位嘗試直接從free?list分配page?*/page?=?get_page_from_freelist(gfp_mask,?order,?alloc_flags,?ac);if?(page)goto?got_pg;/**?For?costly?allocations,?try?direct?compaction?first,?
as?it's?likely*?that?we?have?enough?base?pages?and?don't?need?to?reclaim.?For?non-*?movable?high-order?allocations,?do?that?as?well,?as?compaction?will*?try?prevent?permanent?fragmentation?by?migrating?from?blocks?of?the*?same?migratetype.*?對于昂貴的分配,首先嘗試直接壓縮,因為我們可能有足夠的基本頁,不需要回收。對于不可移動的高階分配,也要這樣做,因為壓縮將嘗試通過從相同migratetype的塊遷移來防止永久的碎片化。*?Don't?try?this?for?allocations?that?are?allowed?to?ignore*?watermarks,?as?the?ALLOC_NO_WATERMARKS?attempt?didn't?yet?happen.*?不要嘗試這個分配而允許忽略水位,因為alloc_no_watermark嘗試還沒有發生。*/if?(can_direct_reclaim?&&(costly_order?||(order?>?0?&&?ac->migratetype?!=?MIGRATE_MOVABLE))&&?!gfp_pfmemalloc_allowed(gfp_mask))?{/*?(5)?第3次分配:內存壓縮compact后,嘗試分配?get_page_from_freelist()?*/page?=?__alloc_pages_direct_compact(gfp_mask,?order,alloc_flags,?ac,INIT_COMPACT_PRIORITY,&compact_result);if?(page)goto?got_pg;/**?Checks?for?costly?allocations?with?__GFP_NORETRY,?which*?includes?THP?page?fault?allocations*/if?(costly_order?&&?(gfp_mask?&?__GFP_NORETRY))?{/**?If?compaction?is?deferred?for?high-order?allocations,*?it?is?because?sync?compaction?recently?failed.?If*?this?is?the?case?and?the?caller?requested?a?THP*?allocation,?we?do?not?want?to?heavily?disrupt?the*?system,?so?we?fail?the?allocation?instead?of?entering*?direct?reclaim.*/if?(compact_result?==?COMPACT_DEFERRED)goto?nopage;/**?Looks?like?reclaim/compaction?is?worth?trying,?but*?sync?compaction?could?be?very?expensive,?so?keep*?using?async?compaction.*/compact_priority?=?INIT_COMPACT_PRIORITY;}}retry:/*?Ensure?kswapd?doesn't?accidentally?go?to?sleep?as?long?as?we?loop?*//*?(6)?再一次喚醒?kswapd?異步回收線程,可能ac參數變得更嚴苛了?*/if?(gfp_mask?&?__GFP_KSWAPD_RECLAIM)wake_all_kswapds(order,?ac);/*?(7)?設置各種標志:ALLOC_NO_WATERMARKS,進一步降低水位,直接忽略水位*/reserve_flags?=?__gfp_pfmemalloc_flags(gfp_mask);if?(reserve_flags)alloc_flags?=?reserve_flags;/**?Reset?the?zonelist?iterators?if?memory?policies?can?be?ignored.*?These?allocations?are?high?priority?and?system?rather?than?user*?orientated.*/if?(!(alloc_flags?&?ALLOC_CPUSET)?||?reserve_fla
gs)?{ac->preferred_zoneref?=?first_zones_zonelist(ac->zonelist,ac->high_zoneidx,?ac->nodemask);}/*?Attempt?with?potentially?adjusted?zonelist?and?alloc_flags?*//*?(8)?第4次分配:使用no水位嘗試直接從free?list分配page?*/page?=?get_page_from_freelist(gfp_mask,?order,?alloc_flags,?ac);if?(page)goto?got_pg;/*?Caller?is?not?willing?to?reclaim,?we?can't?balance?anything?*//*?(9)?如果當前不支持直接回收,則退出,等待?kswapd?異步線程的回收?*/if?(!can_direct_reclaim)goto?nopage;/*?Avoid?recursion?of?direct?reclaim?*//*?(10)?避免遞歸回收?*/if?(current->flags?&?PF_MEMALLOC)goto?nopage;/*?Try?direct?reclaim?and?then?allocating?*//*?(11)?第5次分配:直接啟動內存回收后,并嘗試page?get_page_from_freelist()?*/page?=?__alloc_pages_direct_reclaim(gfp_mask,?order,?alloc_flags,?ac,&did_some_progress);if?(page)goto?got_pg;/*?Try?direct?compaction?and?then?allocating?*//*?(12)?第6次分配:直接啟動內存壓縮后,并嘗試page?get_page_from_freelist()?*/page?=?__alloc_pages_direct_compact(gfp_mask,?order,?alloc_flags,?ac,compact_priority,?&compact_result);if?(page)goto?got_pg;/*?Do?not?loop?if?specifically?requested?*//*?(13)?如果還是分配失敗,且不支持重試,出錯返回?*/if?(gfp_mask?&?__GFP_NORETRY)goto?nopage;/**?Do?not?retry?costly?high?order?allocations?unless?they?are*?__GFP_RETRY_MAYFAIL*/if?(costly_order?&&?!(gfp_mask?&?__GFP_RETRY_MAYFAIL))goto?nopage;/*?(14)?檢查重試內存回收是否有意義?*/if?(should_reclaim_retry(gfp_mask,?order,?ac,?alloc_flags,did_some_progress?>?0,?&no_progress_loops))goto?retry;/**?It?doesn't?make?any?sense?to?retry?for?the?compaction?if?the?order-0*?reclaim?is?not?able?to?make?any?progress?because?the?current*?implementation?of?the?compaction?depends?on?the?sufficient?amount*?of?free?memory?(see?__compaction_suitable)*//*?(15)?檢查重試內存壓縮是否有意義?*/if?(did_some_progress?>?0?&&should_compact_retry(ac,?order,?alloc_flags,compact_result,?&compact_priority,&compaction_retries))goto?retry;/*?Deal?with?possible?cpuset?update?races?before?we?start?OOM?killing?*//*?(16)?在啟動?OOM?kiling?之前,是否有可能更新?cpuset?來進行重試?*/if?(check_retry_cpuset(cpuset_mems_cookie,?ac))goto?retry_cpuset;/*?Reclaim?has?failed?us,?s
tart?killing?things?*//*?(17)?第7次分配:所有的內存回收嘗試都已經失敗,祭出最后的大招:通過殺進程來釋放內存?*/page?=?__alloc_pages_may_oom(gfp_mask,?order,?ac,?&did_some_progress);if?(page)goto?got_pg;/*?Avoid?allocations?with?no?watermarks?from?looping?endlessly?*//*?(18)?避免無止境循環的無水位分配?*/if?(tsk_is_oom_victim(current)?&&(alloc_flags?==?ALLOC_OOM?||(gfp_mask?&?__GFP_NOMEMALLOC)))goto?nopage;/*?Retry?as?long?as?the?OOM?killer?is?making?progress?*//*?(19)?在OOM?killing取得進展時重試?*/if?(did_some_progress)?{no_progress_loops?=?0;goto?retry;}nopage:/*?Deal?with?possible?cpuset?update?races?before?we?fail?*//*?(20)?在我們失敗之前處理可能的cpuset更新?*/if?(check_retry_cpuset(cpuset_mems_cookie,?ac))goto?retry_cpuset;/**?Make?sure?that?__GFP_NOFAIL?request?doesn't?leak?out?and?make?sure*?we?always?retry*//*?(21)?如果指定了?__GFP_NOFAIL,只能不停的進行重試?*/if?(gfp_mask?&?__GFP_NOFAIL)?{/**?All?existing?users?of?the?__GFP_NOFAIL?are?blockable,?so?warn*?of?any?new?users?that?actually?require?GFP_NOWAIT*/if?(WARN_ON_ONCE(!can_direct_reclaim))goto?fail;WARN_ON_ONCE(current->flags?&?PF_MEMALLOC);WARN_ON_ONCE(order?>?PAGE_ALLOC_COSTLY_ORDER);page?=?__alloc_pages_cpuset_fallback(gfp_mask,?order,?ALLOC_HARDER,?ac);if?(page)goto?got_pg;cond_resched();goto?retry;} fail:/*?(22)?構造分配失敗的告警信息?*/warn_alloc(gfp_mask,?ac->nodemask,"page?allocation?failure:?order:%u",?order); got_pg:return?page; }end
人人極客社區?
關注,回復【peter】海量Linux資料贈送
精彩文章合集
文章推薦
?【專輯】Linux內存管理
?【專輯】Linux進程管理
?【專輯】Linux文件系統
?【專輯】Linux中斷管理
?【專輯】Linux同步管理
?【專輯】Linux電源管理
?【專輯】Linux性能分析
?【專輯】Linux DMA
?【專輯】Linux 驅動
?【專輯】圖形顯示
總結
以上是生活随笔為你收集整理的Buddy 内存管理机制(下)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 如何启用计算机的休眠,win7休眠-wi
- 下一篇: 微软2010(附下载地址),yozo,w