DPDK初始化hugepages
hugepages初始化的入口函數為hugepage_info_init。hugepages目錄sys_dir_path指向"/sys/kernel/mm/hugepages",目前此目錄下包含兩個子目錄,分別對應1G和2M頁面大小的hugepages:
$ ls /sys/kernel/mm/hugepages -l
total 0
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月 7 15:08 hugepages-2048kB
遍歷sys_dir_path的子目錄,解析頁面大小并保存到全局hugepage_info數組中。
static int hugepage_info_init(void) { const char dirent_start_text[] = "hugepages-";const size_t dirent_start_len = sizeof(dirent_start_text) - 1;unsigned int i, num_sizes = 0;struct internal_config *internal_conf = eal_get_internal_configuration();dir = opendir(sys_dir_path);for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {struct hugepage_info *hpi;if (strncmp(dirent->d_name, dirent_start_text,dirent_start_len) != 0)continue;if (num_sizes >= MAX_HUGEPAGE_SIZES) break;hpi = &internal_conf->hugepage_info[num_sizes];hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);檢查當(dāng)前頁面大小的hugepage是否已經(jīng)進(jìn)行了mount操作,將mount目錄保存到hugedir中。如果還沒有執(zhí)行mount,使用get_num_hugepages獲取可用的頁面數(shù)量。
/* first, check if we have a mountpoint */if (get_hugepage_dir(hpi->hugepage_sz,hpi->hugedir, sizeof(hpi->hugedir)) < 0) {uint32_t num_pages;num_pages = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, 0);if (num_pages > 0)RTE_LOG(NOTICE, EAL,"%" PRIu32 " hugepages of size ""%" PRIu64 " reserved, but no mounted ""hugetlbfs found for that size\n",num_pages, hpi->hugepage_sz);/* if we have kernel support for reserving hugepages* through mmap, and we're in in-memory mode, treat this* page size as valid. we cannot be in legacy mode at* this point because we've checked this earlier in the* init process.*/ #ifdef MAP_HUGE_SHIFTif (internal_conf->in_memory) {RTE_LOG(DEBUG, EAL, "In-memory mode enabled, ""hugepages of size %" PRIu64 " bytes ""will be allocated anonymously\n",hpi->hugepage_sz);calc_num_pages(hpi, dirent, 0);num_sizes++;} #endifcontinue;}否則,如果頁面已經(jīng)進(jìn)行了mount操作,鎖定mount的掛載目錄,由inspect_hugedir檢查目錄中可重用的空間大小,根據(jù)頁面大小,獲得可重用的頁面數(shù)量。
/* try to obtain a writelock */hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);/* if blocking lock failed */if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {RTE_LOG(CRIT, EAL,"Failed to lock hugepage directory!\n");break;}/* Check for existing hugepage files and either remove them* or count how many of them can be reused.*/reusable_pages = 0;if (!internal_conf->hugepage_file.unlink_existing) {reusable_bytes = 0;if (inspect_hugedir(hpi->hugedir, &reusable_bytes) < 0)break;RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);reusable_pages = reusable_bytes / hpi->hugepage_sz;} else if (clear_hugedir(hpi->hugedir) < 0) {break;}calc_num_pages(hpi, dirent, reusable_pages);num_sizes++;}closedir(dir);變量num_hugepage_sizes記錄有多少種可用的頁面大小,qsort按照頁面大小將hugepage_info進(jìn)行排序。最后,檢查是否有可用的頁面(num_pages大于0)。
/* something went wrong, and we broke from the for loop above */if (dirent != NULL)return -1;internal_conf->num_hugepage_sizes = num_sizes;/* sort the page directory entries by size, largest to smallest */qsort(&internal_conf->hugepage_info[0], num_sizes,sizeof(internal_conf->hugepage_info[0]), compare_hpi);/* now we have all info, check we have at least one valid size */for (i = 0; i < num_sizes; i++) {/* pages may no longer all be on socket 0, so check all */unsigned int j, num_pages = 0;struct hugepage_info *hpi = &internal_conf->hugepage_info[i];for (j = 0; j < RTE_MAX_NUMA_NODES; j++)num_pages += hpi->num_pages[j];if (num_pages > 0)return 0;}/* no valid hugepage mounts available, return error */return -1;獲取hugetlbfs掛載點(diǎn)
查看/proc/mounts文件可看到所有已掛載的文件系統,如下。
$ cat /proc/mounts | grep hugetlbfs
hugetlbfs /dev/hugepages hugetlbfs rw,relatime,pagesize=2M 0 0
文件mounts中每行內容的格式如proc_mount_fieldnames中的定義,分隔符為空格。
static int get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) {enum proc_mount_fieldnames {DEVICE = 0,MOUNTPT,FSTYPE,OPTIONS,_FIELDNAME_MAX};static uint64_t default_size = 0;const char proc_mounts[] = "/proc/mounts";const char hugetlbfs_str[] = "hugetlbfs";const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;const char pagesize_opt[] = "pagesize=";const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;const char split_tok = ' ';char *splitstr[_FIELDNAME_MAX];char found[PATH_MAX] = "";const struct internal_config *internal_conf =eal_get_internal_configuration();/* If the specified dir doesn't exist, we can't match it.*/if (internal_conf->hugepage_dir != NULL &&stat(internal_conf->hugepage_dir, &st) != 0) {return -1;}默認(rèn)的hugepage頁面大小可在/proc/meminfo中查看(get_default_hp_size)。打開/proc/mounts文件,逐行遍歷,尋找文件系統(tǒng)類型(FSTYPE)字段等于hugetlbfs的行。在OPTIONS字段中查找"pagesize="字符串。
FILE *fd = fopen(proc_mounts, "r");if (fd == NULL)rte_panic("Cannot open %s\n", proc_mounts);if (default_size == 0)default_size = get_default_hp_size();while (fgets(buf, sizeof(buf), fd)){const char *pagesz_str;if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,split_tok) != _FIELDNAME_MAX) {RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);break; /* return NULL */}if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) != 0)continue;pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);如果沒有找到"pagesize=",使用默認(rèn)的頁面大小,檢查其是否與參數(shù)hugepage_sz相等,否則,檢查pagesize自定的頁面大小是否與參數(shù)hugepage_sz相等,相等的話,表明找到了。
/* if no explicit page size, the default page size is compared */if (pagesz_str == NULL) {if (hugepage_sz != default_size)continue;}/* there is an explicit page size, so check it */else {uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);if (pagesz != hugepage_sz)continue;}
如果DPDK沒有指定--huge-dir參數,將當前遍歷行中的MOUNTPT掛載點字段拷貝到found中并結束遍歷。否則,如果指定了--huge-dir,而當前遍歷行的掛載點不是所指定目錄的前綴,繼續遍歷下一行。
最后,如果存在兩個滿足條件的掛載點,使用最長匹配的那一個。
/* If no --huge-dir option has been given, we're done.*/if (internal_conf->hugepage_dir == NULL) {strlcpy(found, splitstr[MOUNTPT], len);break;}/* Ignore any mount that doesn't contain the --huge-dir* directory.*/if (strncmp(internal_conf->hugepage_dir, splitstr[MOUNTPT],strlen(splitstr[MOUNTPT])) != 0) {continue;}/* We found a match, but only prefer it if it's a longer match* (so /mnt/1 is preferred over /mnt for matching /mnt/1/2)).*/if (strlen(splitstr[MOUNTPT]) > strlen(found))strlcpy(found, splitstr[MOUNTPT], len);} /* end while fgets */fclose(fd);優(yōu)先使用–huge-dir指定的掛載點(diǎn)目錄,其次使用以上找到的目錄。
if (found[0] != '\0') {/* If needed, return the requested dir, not the mount point. */strlcpy(hugedir, internal_conf->hugepage_dir != NULL ?internal_conf->hugepage_dir : found, len);return 0;}return -1;
獲取頁面數量(傳統方式)
根據以下2M大小頁面的配置,獲取可用的頁面數量。
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/resv_hugepages
0
$ cat /sys/kernel/mm/hugepages/hugepages-2048kB/surplus_hugepages
0
開始假定free_hugepages中的頁面都是可用的。首先,檢查保留頁面數量resv_hugepages,減去保留頁面數量,即為可用的頁面。
static uint32_t get_num_hugepages(const char *subdir, size_t sz, unsigned int reusable_pages) { unsigned long resv_pages, num_pages, over_pages, surplus_pages;const char *nr_hp_file = "free_hugepages";const char *nr_rsvd_file = "resv_hugepages";const char *nr_over_file = "nr_overcommit_hugepages";const char *nr_splus_file = "surplus_hugepages";/* first, check how many reserved pages kernel reports */if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)return 0;if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)over_pages = 0;if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)surplus_pages = 0;/* adjust num_pages */if (num_pages >= resv_pages)num_pages -= resv_pages;else if (resv_pages)num_pages = 0;可超限的頁面數(shù)量減去過剩頁面數(shù)量,等于可用的超限頁面數(shù)量。
if (over_pages >= surplus_pages)over_pages -= surplus_pages;elseover_pages = 0;if (num_pages == 0 && over_pages == 0 && reusable_pages)RTE_LOG(WARNING, EAL, "No available %zu kB hugepages reported\n",sz >> 10);
可用頁面數量加上可超限使用的頁面數量,如果結果溢出,使用uint32的最大值。最后,加上可重用的頁面數量,即為最終的可用頁面數量,不超過無符號32位的最大值。
num_pages += over_pages;if (num_pages < over_pages) /* overflow */num_pages = UINT32_MAX;num_pages += reusable_pages;if (num_pages < reusable_pages) /* overflow */num_pages = UINT32_MAX;/* we want to return a uint32_t and more than this looks suspicious* anyway ... */if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;
計算可重用頁面
在hugepage已經mount的情況下,遍歷掛載點目錄(walk_hugedir),由回調函數inspect_hugedir_cb計算可重用文件所占用的空間。
static void inspect_hugedir_cb(const struct walk_hugedir_data *whd) {uint64_t *total_size = whd->user_data;if (fstat(whd->file_fd, &st) < 0)RTE_LOG(DEBUG, EAL, "%s(): stat(\"%s\") failed: %s",__func__, whd->file_name, strerror(errno));else(*total_size) += st.st_size; }/* Count the total size in bytes of all files in the directory* not mapped by other DPDK process.*/ static int inspect_hugedir(const char *hugedir, uint64_t *total_size) {return walk_hugedir(hugedir, inspect_hugedir_cb, total_size);遍歷掛載點(diǎn)目錄,查找符合"map_"命名規(guī)則的文件,如果成功鎖定此文件,即認(rèn)為其為可重用的文件,調(diào)用回調(diào)cb計(jì)算其空間大小。
/* Search the hugepage directory for whatever hugepage files there are.* Check if the file is in use by another DPDK process.* If not, execute a callback on it.*/ static int walk_hugedir(const char *hugedir, walk_hugedir_t *cb, void *user_data) {const char filter[] = "*map_*"; /* matches hugepage files */dir = opendir(hugedir);dir_fd = dirfd(dir);dirent = readdir(dir);while (dirent != NULL) {/* skip files that don't match the hugepage pattern */if (fnmatch(filter, dirent->d_name, 0) > 0) {dirent = readdir(dir);continue;}/* try and lock the file */fd = openat(dir_fd, dirent->d_name, O_RDONLY);/* skip to next file */if (fd == -1) {dirent = readdir(dir);continue;}/* non-blocking lock */lck_result = flock(fd, LOCK_EX | LOCK_NB);/* if lock succeeds, execute callback */if (lck_result != -1)cb(&(struct walk_hugedir_data){.dir_fd = dir_fd,.file_fd = fd,.file_name = dirent->d_name,.user_data = user_data,});close (fd);dirent = readdir(dir);刪除不再使用頁面
遍歷mount掛載點目錄,對于不再使用的文件,由clear_hugedir_cb回調函數執行刪除操作。
static void clear_hugedir_cb(const struct walk_hugedir_data *whd) {unlinkat(whd->dir_fd, whd->file_name, 0); }/* Remove hugepage files not used by other DPDK processes from a directory. */ static int clear_hugedir(const char *hugedir) {return walk_hugedir(hugedir, clear_hugedir_cb, NULL);
可用頁面計算
static void calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent,unsigned int reusable_pages) {uint64_t total_pages = 0;const struct internal_config *internal_conf =eal_get_internal_configuration();/** first, try to put all hugepages into relevant sockets, but* if first attempts fails, fall back to collecting all pages* in one socket and sorting them later*/total_pages = 0;首先,使用numa節(jié)點(diǎn)獲取可用的頁面數(shù)量。total_pages保存所有節(jié)點(diǎn)頁面數(shù)量的總和。
/** We also don't want to do this for legacy init.* When there are hugepage files to reuse it is unknown* what NUMA node the pages are on.* This could be determined by mapping,* but it is precisely what hugepage file reuse is trying to avoid.*/if (!internal_conf->legacy_mem && reusable_pages == 0)for (i = 0; i < rte_socket_count(); i++) {int socket = rte_socket_id_by_idx(i);unsigned int num_pages =get_num_hugepages_on_node(dirent->d_name, socket, hpi->hugepage_sz);hpi->num_pages[socket] = num_pages;total_pages += num_pages;}
其次,如果以上沒有得到任何頁面,以下采用傳統的方式獲取可用頁面,此時,所有可用的頁面數量都保存在num_pages數組的0索引位置,之后再根據NUMA節點進行分配。
/* we failed to sort memory from the get go, so fall* back to old way*/if (total_pages == 0) {hpi->num_pages[0] = get_num_hugepages(dirent->d_name,hpi->hugepage_sz, reusable_pages);#ifndef RTE_ARCH_64/* for 32-bit systems, limit number of hugepages to* 1GB per page size */hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],RTE_PGSIZE_1G / hpi->hugepage_sz); #endif
計算NUMA可用頁面(新方式)
目錄sys_pages_numa_dir_path[]定義為"/sys/devices/system/node",例如,對于node0有如下的目錄結構:
$ ls -l /sys/devices/system/node/node0/hugepages/
total 0
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-1048576kB
drwxr-xr-x 2 root root 0 7月 7 13:51 hugepages-2048kB
$
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages
1003
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
1024
$ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages
0
對應于socketdir目錄,根據頁面大小子目錄(subdir),獲取free_hugepages的數值。
static uint32_t get_num_hugepages_on_node(const char *subdir, unsigned int socket, size_t sz) {unsigned long num_pages = 0;const char *nr_hp_file = "free_hugepages";snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",sys_pages_numa_dir_path, socket);socketdir = opendir(socketpath);if (socketdir) {/* Keep calm and carry on */closedir(socketdir);} else {/* Can't find socket dir, so ignore it */return 0;}snprintf(path, sizeof(path), "%s/%s/%s",socketpath, subdir, nr_hp_file);if (eal_parse_sysfs_value(path, &num_pages) < 0)return 0;
確保可用頁面的數量不大于uint32所能表示的最大值。
if (num_pages == 0)RTE_LOG(WARNING, EAL, "No free %zu kB hugepages reported on node %u\n",sz >> 10, socket);/* we want to return a uint32_t and more than this looks suspicious* anyway ...*/if (num_pages > UINT32_MAX)num_pages = UINT32_MAX;return num_pages;
總結
以上是生活随笔為你收集整理的DPDK初始化hugepages的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 宏观经济学 索洛模型
- 下一篇: Chrome谷歌浏览器不能输入中文问题