【Linux 内核 内存管理】物理分配页 ⑦ ( __alloc_pages_slowpath 慢速路径调用函数源码分析 | 判断页阶数 | 读取 mems_allowed | 分配标志位转换 )
文章目錄
- 一、__alloc_pages_slowpath 慢速路徑調用函數
- 二、判斷頁階數
- 三、讀取進程 mems_allowed 成員
- 四、分配標志位轉換
- 五、__alloc_pages_slowpath 慢速路徑調用完整函數源碼
在 【Linux 內核 內存管理】物理分配頁 ② ( __alloc_pages_nodemask 函數參數分析 | __alloc_pages_nodemask 函數分配物理頁流程 ) 博客中 , 分析了 __alloc_pages_nodemask 函數分配物理頁流程如下 :
首先 , 根據 gfp_t gfp_mask 分配標志位 參數 , 得到 " 內存節點 " 的 " 首選區域類型 " 和 " 遷移類型 " ;
然后 , 執行 " 快速路徑 " , 第一次分配 嘗試使用 低水線分配 ;
如果上述 " 快速路徑 " 分配失敗 , 則執行 " 慢速路徑 " 分配 ;
上述涉及到了 " 快速路徑 " 和 " 慢速路徑 " 2 種物理頁分配方式 ;
前面幾篇博客 , 分析了 " 快速路徑 " 內存分配核心函數 get_page_from_freelist , 本博客開始分析 " 慢速路徑 " 內存分配 函數 __alloc_pages_slowpath 函數 ;
一、__alloc_pages_slowpath 慢速路徑調用函數
內存區域 內 進行 物理頁分配 時 , 優先嘗試使用 " 快速路徑 " 內存分配 , 執行 get_page_from_freelist 核心函數 ;
假如上述 " 低水線內存分配 " 失敗 , 即 " 快速路徑 " 內存分配失敗 , 則執行 " 慢速路徑 " 內存分配 ;
" 慢速路徑 " 內存分配 的核心函數 是 __alloc_pages_slowpath 函數 , 定義在 Linux 內核源碼的 linux-4.12\mm\page_alloc.c#3676 位置 ;
源碼路徑 : linux-4.12\mm\page_alloc.c#3676
二、判斷頁階數
先判斷 內存分配 的 物理頁的 階數 , 申請 物理頁內存 的 " 階數 " , 必須 小于 頁分配器 支持的 最大分配 階數 ;
階 ( Order ) : 物理頁 的 數量單位 , n 階頁塊 指的是 $2^n$ 個 連續的 " 物理頁 " ; 完整概念參考 【Linux 內核 內存管理】伙伴分配器 ① ( 伙伴分配器引入 | 頁塊、階 | 伙伴 ) ;
/** In the slowpath, we sanity check order to avoid ever trying to* reclaim >= MAX_ORDER areas which will never succeed. Callers may* be using allocators in order of preference for an area that is* too large.*/if (order >= MAX_ORDER) {WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));return NULL;}源碼路徑 : linux-4.12\mm\page_alloc.c#3699
三、讀取進程 mems_allowed 成員
在后面代碼中 , 會 檢查 cpuset , 查看是否允許 當前進程 從 內存節點 申請 物理頁 ,
上述判斷 , 需要讀取 當前進程的 mems_allowed 成員 , 讀取時需要使用 " 順序保護鎖 " ;
cpuset_mems_cookie = read_mems_allowed_begin();源碼路徑 : linux-4.12\mm\page_alloc.c#3716
四、分配標志位轉換
將 " 分配標志位 " 轉為 " 內部分配標志位 " ;
/** The fast path uses conservative alloc_flags to succeed only until* kswapd needs to be woken up, and to avoid the cost of setting up* alloc_flags precisely. So we do that now.*/alloc_flags = gfp_to_alloc_flags(gfp_mask);源碼路徑 : linux-4.12\mm\page_alloc.c#3723
五、__alloc_pages_slowpath 慢速路徑調用完整函數源碼
static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,struct alloc_context *ac) {bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;struct page *page = NULL;unsigned int alloc_flags;unsigned long did_some_progress;enum compact_priority compact_priority;enum compact_result compact_result;int compaction_retries;int no_progress_loops;unsigned long alloc_start = jiffies;unsigned int stall_timeout = 10 * HZ;unsigned int cpuset_mems_cookie;/** In the slowpath, we sanity check order to avoid ever trying to* reclaim >= MAX_ORDER areas which will never succeed. Callers may* be using allocators in order of preference for an area that is* too large.*/if (order >= MAX_ORDER) {WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));return NULL;}/** We also sanity check to catch abuse of atomic reserves being used by* callers that are not in atomic context.*/if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))gfp_mask &= ~__GFP_ATOMIC;retry_cpuset:compaction_retries = 0;no_progress_loops = 0;compact_priority = DEF_COMPACT_PRIORITY;cpuset_mems_cookie = read_mems_allowed_begin();/** The fast path uses conservative alloc_flags to succeed only until* kswapd needs to be woken up, and to avoid the cost of setting up* alloc_flags precisely. 
So we do that now.*/alloc_flags = gfp_to_alloc_flags(gfp_mask);/** We need to recalculate the starting point for the zonelist iterator* because we might have used different nodemask in the fast path, or* there was a cpuset modification and we are retrying - otherwise we* could end up iterating over non-eligible zones endlessly.*/ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,ac->high_zoneidx, ac->nodemask);if (!ac->preferred_zoneref->zone)goto nopage;if (gfp_mask & __GFP_KSWAPD_RECLAIM)wake_all_kswapds(order, ac);/** The adjusted alloc_flags might result in immediate success, so try* that first*/page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);if (page)goto got_pg;/** For costly allocations, try direct compaction first, as it's likely* that we have enough base pages and don't need to reclaim. For non-* movable high-order allocations, do that as well, as compaction will* try prevent permanent fragmentation by migrating from blocks of the* same migratetype.* Don't try this for allocations that are allowed to ignore* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.*/if (can_direct_reclaim &&(costly_order ||(order > 0 && ac->migratetype != MIGRATE_MOVABLE))&& !gfp_pfmemalloc_allowed(gfp_mask)) {page = __alloc_pages_direct_compact(gfp_mask, order,alloc_flags, ac,INIT_COMPACT_PRIORITY,&compact_result);if (page)goto got_pg;/** Checks for costly allocations with __GFP_NORETRY, which* includes THP page fault allocations*/if (costly_order && (gfp_mask & __GFP_NORETRY)) {/** If compaction is deferred for high-order allocations,* it is because sync compaction recently failed. 
If* this is the case and the caller requested a THP* allocation, we do not want to heavily disrupt the* system, so we fail the allocation instead of entering* direct reclaim.*/if (compact_result == COMPACT_DEFERRED)goto nopage;/** Looks like reclaim/compaction is worth trying, but* sync compaction could be very expensive, so keep* using async compaction.*/compact_priority = INIT_COMPACT_PRIORITY;}}retry:/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */if (gfp_mask & __GFP_KSWAPD_RECLAIM)wake_all_kswapds(order, ac);if (gfp_pfmemalloc_allowed(gfp_mask))alloc_flags = ALLOC_NO_WATERMARKS;/** Reset the zonelist iterators if memory policies can be ignored.* These allocations are high priority and system rather than user* orientated.*/if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,ac->high_zoneidx, ac->nodemask);}/* Attempt with potentially adjusted zonelist and alloc_flags */page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);if (page)goto got_pg;/* Caller is not willing to reclaim, we can't balance anything */if (!can_direct_reclaim)goto nopage;/* Make sure we know about allocations which stall for too long */if (time_after(jiffies, alloc_start + stall_timeout)) {warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,"page allocation stalls for %ums, order:%u",jiffies_to_msecs(jiffies-alloc_start), order);stall_timeout += 10 * HZ;}/* Avoid recursion of direct reclaim */if (current->flags & PF_MEMALLOC)goto nopage;/* Try direct reclaim and then allocating */page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,&did_some_progress);if (page)goto got_pg;/* Try direct compaction and then allocating */page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,compact_priority, &compact_result);if (page)goto got_pg;/* Do not loop if specifically requested */if (gfp_mask & 
__GFP_NORETRY)goto nopage;/** Do not retry costly high order allocations unless they are* __GFP_REPEAT*/if (costly_order && !(gfp_mask & __GFP_REPEAT))goto nopage;if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,did_some_progress > 0, &no_progress_loops))goto retry;/** It doesn't make any sense to retry for the compaction if the order-0* reclaim is not able to make any progress because the current* implementation of the compaction depends on the sufficient amount* of free memory (see __compaction_suitable)*/if (did_some_progress > 0 &&should_compact_retry(ac, order, alloc_flags,compact_result, &compact_priority,&compaction_retries))goto retry;/** It's possible we raced with cpuset update so the OOM would be* premature (see below the nopage: label for full explanation).*/if (read_mems_allowed_retry(cpuset_mems_cookie))goto retry_cpuset;/* Reclaim has failed us, start killing things */page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);if (page)goto got_pg;/* Avoid allocations with no watermarks from looping endlessly */if (test_thread_flag(TIF_MEMDIE) &&(alloc_flags == ALLOC_NO_WATERMARKS ||(gfp_mask & __GFP_NOMEMALLOC)))goto nopage;/* Retry as long as the OOM killer is making progress */if (did_some_progress) {no_progress_loops = 0;goto retry;}nopage:/** When updating a task's mems_allowed or mempolicy nodemask, it is* possible to race with parallel threads in such a way that our* allocation can fail while the mask is being updated. 
If we are about* to fail, check if the cpuset changed during allocation and if so,* retry.*/if (read_mems_allowed_retry(cpuset_mems_cookie))goto retry_cpuset;/** Make sure that __GFP_NOFAIL request doesn't leak out and make sure* we always retry*/if (gfp_mask & __GFP_NOFAIL) {/** All existing users of the __GFP_NOFAIL are blockable, so warn* of any new users that actually require GFP_NOWAIT*/if (WARN_ON_ONCE(!can_direct_reclaim))goto fail;/** PF_MEMALLOC request from this context is rather bizarre* because we cannot reclaim anything and only can loop waiting* for somebody to do a work for us*/WARN_ON_ONCE(current->flags & PF_MEMALLOC);/** non failing costly orders are a hard requirement which we* are not prepared for much so let's warn about these users* so that we can identify them and convert them to something* else.*/WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);/** Help non-failing allocations by giving them access to memory* reserves but do not use ALLOC_NO_WATERMARKS because this* could deplete whole memory reserves which would just make* the situation worse*/page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);if (page)goto got_pg;cond_resched();goto retry;} fail:warn_alloc(gfp_mask, ac->nodemask,"page allocation failure: order:%u", order); got_pg:return page; }
源碼路徑 : linux-4.12\mm\page_alloc.c#3676
總結
以上是生活随笔為你收集整理的【Linux 内核 内存管理】物理分配页 ⑦ ( __alloc_pages_slowpath 慢速路径调用函数源码分析 | 判断页阶数 | 读取 mems_allowed | 分配标志位转换 )的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: win7不休眠方式设置
- 下一篇: 研发成长