Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)
目錄
0. 引言 1. open() syscall 2. close() syscall
0. 引言
在linux的哲學中,所有的磁盤文件、目錄、外設設備、驅動設備全部被抽象為了"文件"這個概念,所以本文提到的"File IO"適用于linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系統調用對應的內核源代碼。linux下每一個系統調用都有對應的內核源代碼,而我們在ring3常用的glibc編程中所有的C庫API,只是對系統調用的一個封裝,最終還是要通過系統調用實現功能
0x1: SYSCALL_DEFINE宏定義
我們在學習內核源代碼的時候經常會遇到一個宏定義: SYSCALL_DEFINE,所有的系統調用的聲明都通過它來實現
\linux-2.6.32.63\include\linux\syscalls.h
#define SYSCALL_DEFINE0(sname) \SYSCALL_TRACE_ENTER_EVENT(_##sname); \SYSCALL_TRACE_EXIT_EVENT(_##sname); \static const struct syscall_metadata __used \__attribute__((__aligned__(4))) \__attribute__((section("__syscalls_metadata"))) \__syscall_meta_##sname = { \.name = "sys_"#sname, \.nb_args = 0, \.enter_event = &event_enter__##sname, \.exit_event = &event_exit__##sname, \}; \asmlinkage long sys_##sname(void) #else#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) #endif#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)...
#ifdef CONFIG_FTRACE_SYSCALLS#define SYSCALL_DEFINEx(x, sname, ...) \static const char *types_##sname[] = { \__SC_STR_TDECL##x(__VA_ARGS__) \}; \static const char *args_##sname[] = { \__SC_STR_ADECL##x(__VA_ARGS__) \}; \SYSCALL_METADATA(sname, x); \__SYSCALL_DEFINEx(x, sname, __VA_ARGS__) #else#define SYSCALL_DEFINEx(x, sname, ...) \__SYSCALL_DEFINEx(x, sname, __VA_ARGS__) #endif#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS#define SYSCALL_DEFINE(name) static inline long SYSC_##name#define __SYSCALL_DEFINEx(x, name, ...) \asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \{ \__SC_TEST##x(__VA_ARGS__); \return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \} \SYSCALL_ALIAS(sys##name, SyS##name); \static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */#define SYSCALL_DEFINE(name) asmlinkage long sys_##name#define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */所以對函數(shù)定義
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等于
asmlinkage long sys_socket(int family, int type, int protocol)
Relevant Link:
http://blog.csdn.net/p_panyuch/article/details/5648007
1. open() syscall
open()系統調用在kernel中對應的是sys_open()
\linux-2.6.32.63\fs\open.c
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) {long ret;if (force_o_largefile()){flags |= O_LARGEFILE;} //調(diào)用do_sys_open完成實際功能ret = do_sys_open(AT_FDCWD, filename, flags, mode);/* avoid REGPARM breakage on x86: */asmlinkage_protect(3, ret, filename, flags, mode);return ret; }繼續(xù)跟進(jìn)do_sys_open()函數(shù)
long do_sys_open(int dfd, const char __user *filename, int flags, int mode) {/*獲取文件名稱,由getname()函數(shù)完成,其內(nèi)部首先創(chuàng)建存取文件名稱的空間,然后從用戶空間把文件名拷貝過來*/char *tmp = getname(filename);int fd = PTR_ERR(tmp);if (!IS_ERR(tmp)) {/*獲取一個可用的fd,此函數(shù)調(diào)用alloc_fd()函數(shù)從fd_table中獲取一個可用fd,并進(jìn)行初始化*/fd = get_unused_fd_flags(flags);if (fd >= 0) {/*fd獲取成功則開始打開文件,此函數(shù)是主要完成打開功能的函數(shù)*/struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);if (IS_ERR(f)) {/*打開失敗,釋放fd*/put_unused_fd(fd);fd = PTR_ERR(f);} else {//文件如果已經(jīng)被打開了,調(diào)用fsnotify_open()函數(shù) fsnotify_open(f->f_path.dentry);//將文件指針安裝在fd數(shù)組中,每個進(jìn)程都會將打開的文件句柄保存在fd_array[]數(shù)組中 fd_install(fd, f);}}//釋放放置從用戶空間拷貝過來的文件名的存儲空間 putname(tmp);}return fd; }繼續(xù)跟進(jìn)do_file_open()函數(shù)
/** Note that the low bits of the passed in "open_flag"* are not the same as in the local variable "flag". See* open_to_namei_flags() for more details.*/ struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode) {/* 若干變量聲明 */struct file *filp;struct nameidata nd;int error;struct path path;struct dentry *dir;int count = 0;int will_write;/*改變參數(shù)flag的值,具體做法是flag+1*/int flag = open_to_namei_flags(open_flag);/*設(shè)置訪問權(quán)限*/if (!acc_mode){acc_mode = MAY_OPEN | ACC_MODE(flag);} /* O_TRUNC implies we need access checks for write permissions *//* 根據(jù)O_TRUNC標(biāo)志設(shè)置寫權(quán)限 */if (flag & O_TRUNC){acc_mode |= MAY_WRITE;} /* Allow the LSM permission hook to distinguish append access from general write access. *//* 設(shè)置O_APPEND標(biāo)志 */if (flag & O_APPEND){acc_mode |= MAY_APPEND;} /* The simplest case - just a plain lookup. *//* 如果不是創(chuàng)建文件 */if (!(flag & O_CREAT)) { /*當(dāng)內(nèi)核要訪問一個文件的時候,第一步要做的是找到這個文件,而查找文件的過程在vfs里面是由path_lookup或者path_lookup_open函數(shù)來完成的這兩個函數(shù)將用戶傳進(jìn)來的字符串表示的文件路徑轉(zhuǎn)換成一個dentry結(jié)構(gòu),并建立好相應(yīng)的inode和file結(jié)構(gòu),將指向file的描述符返回用戶用戶隨后通過文件描述符,來訪問這些數(shù)據(jù)結(jié)構(gòu)*/error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);if (error){return ERR_PTR(error);} goto ok;}/** Create - we need to know the parent.*///path-init為查找作準(zhǔn)備工作,path_walk真正上路查找,這兩個函數(shù)聯(lián)合起來根據(jù)一段路徑名找到對應(yīng)的dentry error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);if (error){return ERR_PTR(error);} /*這個函數(shù)相當(dāng)重要,是整個NFS的名字解析函數(shù),其實也是NFS得以構(gòu)筑的函數(shù)該函數(shù)采用一個for循環(huán),對name路徑根據(jù)目錄的層次,一層一層推進(jìn),直到終點或失敗。在推進(jìn)的過程中,一步步建立了目錄樹的dentry和對應(yīng)的inode*/error = path_walk(pathname, &nd);if (error) {if (nd.root.mnt){/*減少dentry和vsmount得計數(shù)*/path_put(&nd.root);} return ERR_PTR(error);}if (unlikely(!audit_dummy_context())){/*保存inode節(jié)點信息*/audit_inode(pathname, nd.path.dentry);} /** We have the parent and last component. 
First of all, check* that we are not asked to creat(2) an obvious directory - that* will not do.*/error = -EISDIR;/*父節(jié)點信息*/if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]){goto exit_parent;} error = -ENFILE;/* 返回特定的file結(jié)構(gòu)體指針 */filp = get_empty_filp();if (filp == NULL){goto exit_parent;} /* 填充nameidata結(jié)構(gòu) */nd.intent.open.file = filp;nd.intent.open.flags = flag;nd.intent.open.create_mode = mode;dir = nd.path.dentry;nd.flags &= ~LOOKUP_PARENT;nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;if (flag & O_EXCL){nd.flags |= LOOKUP_EXCL;} mutex_lock(&dir->d_inode->i_mutex);/*從哈希表中查找nd對應(yīng)的dentry*/path.dentry = lookup_hash(&nd);path.mnt = nd.path.mnt;do_last:error = PTR_ERR(path.dentry);if (IS_ERR(path.dentry)) {mutex_unlock(&dir->d_inode->i_mutex);goto exit;}if (IS_ERR(nd.intent.open.file)) {error = PTR_ERR(nd.intent.open.file);goto exit_mutex_unlock;}/* Negative dentry, just create the file *//*如果此dentry結(jié)構(gòu)沒有對應(yīng)的inode節(jié)點,說明是無效的,應(yīng)該創(chuàng)建文件節(jié)點 */if (!path.dentry->d_inode) {/** This write is needed to ensure that a* ro->rw transition does not occur between* the time when the file is created and when* a permanent write count is taken through* the 'struct file' in nameidata_to_filp().*//*write權(quán)限是必需的*/error = mnt_want_write(nd.path.mnt);if (error){goto exit_mutex_unlock;} /*按照namei格式的flag open*/error = __open_namei_create(&nd, &path, flag, mode);if (error) {mnt_drop_write(nd.path.mnt);goto exit;}/*根據(jù)nameidata 得到相應(yīng)的file結(jié)構(gòu)*/filp = nameidata_to_filp(&nd, open_flag);if (IS_ERR(filp)){ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));} /*放棄寫權(quán)限*/mnt_drop_write(nd.path.mnt);if (nd.root.mnt){/*計數(shù)減一*/path_put(&nd.root);} return filp;}/** It already exists.*//*要打開的文件已經(jīng)存在*/mutex_unlock(&dir->d_inode->i_mutex);/*保存inode節(jié)點*/audit_inode(pathname, path.dentry);error = -EEXIST;/*flag標(biāo)志檢查代碼*/if (flag & O_EXCL){goto exit_dput;} if 
(__follow_mount(&path)){error = -ELOOP;if (flag & O_NOFOLLOW){goto exit_dput;} }error = -ENOENT;if (!path.dentry->d_inode){goto exit_dput;} if (path.dentry->d_inode->i_op->follow_link){goto do_link;} /*路徑裝化為相應(yīng)的nameidata結(jié)構(gòu)*/path_to_nameidata(&path, &nd);error = -EISDIR;/*如果是文件夾*/if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)){goto exit;} ok:/** Consider:* 1. may_open() truncates a file* 2. a rw->ro mount transition occurs* 3. nameidata_to_filp() fails due to* the ro mount.* That would be inconsistent, and should* be avoided. Taking this mnt write here* ensures that (2) can not occur.*//*檢測是否截斷文件標(biāo)志*/will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);if (will_write) {/*要截斷的話就要獲取寫權(quán)限*/error = mnt_want_write(nd.path.mnt);if (error){goto exit;} }//may_open執(zhí)行權(quán)限檢測、文件打開和truncate的操作error = may_open(&nd.path, acc_mode, flag);if (error) {if (will_write){mnt_drop_write(nd.path.mnt);} goto exit;}filp = nameidata_to_filp(&nd, open_flag);if (IS_ERR(filp)){ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));}/** It is now safe to drop the mnt write* because the filp has had a write taken* on its behalf.*///安全的放棄寫權(quán)限if (will_write){mnt_drop_write(nd.path.mnt);} if (nd.root.mnt){path_put(&nd.root);} return filp;exit_mutex_unlock:mutex_unlock(&dir->d_inode->i_mutex); exit_dput:path_put_conditional(&path, &nd); exit:if (!IS_ERR(nd.intent.open.file)){release_open_intent(&nd);}exit_parent:if (nd.root.mnt){path_put(&nd.root);} path_put(&nd.path);return ERR_PTR(error);do_link: //允許遍歷連接文件,則手工找到連接文件對應(yīng)的文件error = -ELOOP;if (flag & O_NOFOLLOW){//不允許遍歷連接文件,返回錯誤goto exit_dput;} /** This is subtle. Instead of calling do_follow_link() we do the* thing by hands. 
The reason is that this way we have zero link_count* and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.* After that we have the parent and last component, i.e.* we are in the same situation as after the first path_walk().* Well, almost - if the last component is normal we get its copy* stored in nd->last.name and we will have to putname() it when we* are done. Procfs-like symlinks just set LAST_BIND.*//* 以下是手工找到鏈接文件對應(yīng)的文件dentry結(jié)構(gòu)代碼 *///設(shè)置查找LOOKUP_PARENT標(biāo)志nd.flags |= LOOKUP_PARENT;//判斷操作是否安全error = security_inode_follow_link(path.dentry, &nd);if (error){goto exit_dput;} //處理符號鏈接error = __do_follow_link(&path, &nd);if (error) {/* Does someone understand code flow here? Or it is only* me so stupid? Anathema to whoever designed this non-sense* with "intent.open".*/release_open_intent(&nd);if (nd.root.mnt){path_put(&nd.root);} return ERR_PTR(error);}nd.flags &= ~LOOKUP_PARENT;//檢查最后一段文件或目錄名的屬性情況if (nd.last_type == LAST_BIND){goto ok;} error = -EISDIR;if (nd.last_type != LAST_NORM){goto exit;} if (nd.last.name[nd.last.len]) {__putname(nd.last.name);goto exit;}error = -ELOOP;//出現(xiàn)回環(huán)標(biāo)志: 循環(huán)超過32次if (count++==32) {__putname(nd.last.name);goto exit;}dir = nd.path.dentry;mutex_lock(&dir->d_inode->i_mutex);//更新路徑的掛接點和dentrypath.dentry = lookup_hash(&nd);path.mnt = nd.path.mnt;__putname(nd.last.name);goto do_last; }總結(jié)一下流程
1. open系統調用進入由SYSCALL_DEFINE3(open, ...)定義的sys_open函數
2. 在open系統調用中,調用do_sys_open函數完成主要功能
3. 在do_sys_open函數中,調用函數do_filp_open完成主要的打開功能
4. 在內核中要打開一個文件,首先應該找到這個文件,而查找文件的過程在vfs里面是由do_path_lookup或者path_lookup_open函數來完成的
4.1 設置nd->root=根路徑(絕對地址)或者當前工作目錄(相對地址)
4.2 這一步做完了后,內核會建立一些數據結構(dentry,inode)來初始化查找的起點
if(!retval){ retval = path_walk(name,nd); }
4.3 path_walk會遍歷路徑的每一節點分量,也就是用"/"分隔開的每一部分,最終找到name指向的文件
int path_walk(const char *name,struct nameidata *nd){
    return link_path_walk(name,nd); //path_walk其實相當于直接調用link_path_walk來完成工作
}
4.4 link_path_walk的主要工作是由其內部函數__link_path_walk來完成的
result = __link_path_walk(name,nd)
4.5 __link_path_walk,該函數把傳進來的字符串name,也就是用戶指定的路徑,按路徑分隔符分解成一系列小的component。比如用戶說,我要找"/path/to/dest"這個文件,那么我們的文件系統就會按path、to、dest一個一個來找,直到最后一個分量是文件或者查找完成。它找的時候,會先用path_init初始化過的根路徑去找第一個分量,也就是path。然后用path的dentry->d_inode去找to,這樣循環到最后一個。注意,內核會緩存找到的路徑分量,所以往往只有第一次訪問一個路徑的時候,才會去訪問磁盤,后面的訪問會直接從緩存里找,下面會看到很多與頁高速緩存打交道的代碼。但不管怎樣,第一遍查找總是會訪問磁盤的
static int __link_path_walk(const char *name,struct nameidata *nd){..}
至此,按照每一個component查找完成之后,就會找到相應的文件,然后相應的打開工作就基本完成了
Relevant Link:
http://oss.org.cn/kernel-book/ http://blog.csdn.net/f413933206/article/details/5701913
2. close() syscall
close()系統調用對應內核中的函數為: sys_close()
\linux-2.6.32.63\fs\open.c
/** Careful here! We test whether the file pointer is NULL before* releasing the fd. This ensures that one clone task can't release* an fd while another clone is opening it.*/ SYSCALL_DEFINE1(close, unsigned int, fd) {struct file * filp;struct files_struct *files = current->files;struct fdtable *fdt;int retval;spin_lock(&files->file_lock);/*獲取指向struct fdtable結(jié)構(gòu)體的指針\linux-2.6.32.63\include\linux\fdtable.h#define files_fdtable(files) (rcu_dereference((files)->fdt))*/fdt = files_fdtable(files);if (fd >= fdt->max_fds){goto out_unlock;} //獲取需要關(guān)閉的文件描述符編號filp = fdt->fd[fd];if (!filp){goto out_unlock;} /*將fd_array[]中的的指定元素值置null */rcu_assign_pointer(fdt->fd[fd], NULL);FD_CLR(fd, fdt->close_on_exec); /*調(diào)用__put_unused_fd函數(shù),將當(dāng)前fd回收,則下一次打開新的文件又可以用這個fd了static void __put_unused_fd(struct files_struct *files, unsigned int fd){struct fdtable *fdt = files_fdtable(files);__FD_CLR(fd, fdt->open_fds);if (fd < files->next_fd){files->next_fd = fd;} }*/__put_unused_fd(files, fd);spin_unlock(&files->file_lock);retval = filp_close(filp, files);/* can't restart close syscall because file table entry was cleared */if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)){retval = -EINTR;} return retval;out_unlock:spin_unlock(&files->file_lock);return -EBADF; } EXPORT_SYMBOL(sys_close);對于,我們需要重點跟進(jìn)2個函數(shù): rcu_assign_pointer(fdt->fd[fd], NULL);、retval = filp_close(filp, files);
\linux-2.6.32.63\include\linux\rcupdate.h
/*** rcu_assign_pointer - assign (publicize) a pointer to a newly* initialized structure that will be dereferenced by RCU read-side* critical sections. Returns the value assigned.** Inserts memory barriers on architectures that require them* (pretty much all of them other than x86), and also prevents* the compiler from reordering the code that initializes the* structure after the pointer assignment. More importantly, this* call documents which pointers will be dereferenced by RCU read-side* code.*/#define rcu_assign_pointer(p, v) \({ \if (!__builtin_constant_p(v) || \((v) != NULL)) \smp_wmb(); \(p) = (v); \})我們知道,每個進(jìn)程在kernel中都有一個對應(yīng)的task_struct與之對應(yīng),而通過task_struct可以間接地獲得一個fd_array[]數(shù)組,表示當(dāng)前進(jìn)程已經(jīng)打開的文件,每一個元素都是一個文件描述符的值,只有通過這個fd_array[x]才能獲取當(dāng)前進(jìn)程打開的文件的struc file*,而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在于將將這個數(shù)組的指定元素置空,即斷開了這個引用的關(guān)系,至于之后內(nèi)核棧中的那個struct file*是否釋放,那內(nèi)存回收的事,至少現(xiàn)在進(jìn)程想通過task_stuct是無法再引用到之前打開過的文件了,這里面的關(guān)系圖可以參閱:
http://www.cnblogs.com/LittleHann/p/3865490.html //搜索: 用一張圖表示task_struct、fs_struct、files_struct、fdtable、file的關系
我們繼續分析 retval = filp_close(filp, files);
\linux-2.6.32.63\fs\open.c
/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 *
 * Close one reference to @filp on behalf of owner @id.
 * Returns 0 when the file count is already zero or when there is no
 * ->flush method; otherwise returns whatever ->flush returned.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	/* A zero file count here is unexpected: log it and do nothing else. */
	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}
	/* Call the file's ->flush method when it has one; its result is the return value. */
	if (filp->f_op && filp->f_op->flush){
		retval = filp->f_op->flush(filp, id);
	}
	/* Per-owner cleanup: dnotify state and POSIX locks for @id. */
	dnotify_flush(filp, id);
	locks_remove_posix(filp, id);
	/* Drop this reference to the struct file. */
	fput(filp);
	return retval;
}
filp_close()負(fù)責(zé)將表示打開的文件的struct file*內(nèi)存空間進(jìn)行釋放,至此,內(nèi)核棧中就再也沒有之前打開過的文件的任何痕跡了
Relevant Link:
http://blog.csdn.net/ce123_zhouwei/article/details/8459794
Copyright (c) 2014 LittleHann All rights reserved
?
轉(zhuǎn)載于:https://www.cnblogs.com/LittleHann/p/3932624.html
總結(jié)
以上是生活随笔為你收集整理的Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 链接详解--共享库命名
- 下一篇: 栈的应用--中序表达式转后序表达式