tcp/ip 协议栈Linux内核源码分析12 udp套接字发送流程一
內核版本:3.4.39
因為過往的開發工作中既包括內核網絡層模塊的開發,又包括應用層程序的開發,所以對于網絡數據的通信有那么一些了解。但是對于網絡通信過程中,內核和應用層之間接口是如何運作的不是很清楚,很多問題無從回答,比如應用層數據如何傳遞給內核協議棧,網卡硬件收到報文后傳遞給網絡協議棧,協議棧又是如何傳遞給用戶層的?多線程共用同一個UDP套接字發送,數據會錯亂嗎?那么多套接字,內核如何區分?UDP有發送隊列嗎等等。本篇文章主要分析UDP套接字發送數據過程中應用層和內核層主要做了哪些工作。
通常我們開發網絡通信程序的時候只需要調用glibc封裝的庫函數就可以了,比如說UDP通信,標準流程大概如下:
1. socket()函數創建套接字
2. bind()綁定本地地址,或者connect()連接對端
3. send()、sendto()、sendmsg()發送數據
下面順著函數調用順序來分析:
首先是socket()調用,socket()創建一個套接字,成功后返回一個文件描述符。該調用由glibc封裝,實際會調用內核的socket函數進行處理,簡單的流程圖如下:
socket內核實現
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) {int retval;struct socket *sock; //通用套接字結構int flags;.............//創建一個套接字retval = sock_create(family, type, protocol, &sock);if (retval < 0)goto out;retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));if (retval < 0)goto out_release;........ }
在成功創建套接字后,該套接字僅僅是一個文件描述符,并沒有任何地址與之關聯。使用該socket發送數據包時,由于該socket沒有任何IP地址,內核會根據策略自動選擇一個地址。但是,在某些情況下,我們需要手工指定socket使用哪個IP地址進行發送。這時,就需要使用bind系統調用了。
bind源碼入口位于net/socket.c中:
/*
 * Bind a name to a socket. Nothing much to do here since it's
 * the protocol's responsibility to handle the local address.
 *
 * We move the socket address to kernel space before we call
 * the protocol layer (having also checked the address is ok).
 *
 * Returns the value left in err: the result of the address copy, the
 * security hook, or the protocol's bind, whichever ran last.
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/*
	 * Look up the kernel struct socket behind the file descriptor.
	 * err is passed by pointer; presumably it is filled in when the
	 * lookup fails, since it is returned unchanged in that case.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		/* umyaddr is a user-space pointer; copy it into the kernel-side 'address'. */
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			/* Security check for the bind operation. */
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			if (!err)
				/* Dispatch to the bind operation of the socket's ops table. */
				err = sock->ops->bind(sock,
						      (struct sockaddr *)&address, addrlen);
		}
		/* Release the reference taken by the lookup above. */
		fput_light(sock->file, fput_needed);
	}
	return err;
}
?在bind的調用中,根據不同的協議調用不同的實現函數(Linux的內核代碼中,大量使用了這種面向對象的設計思路)。對于AF_INET協議族來說,無論是面向連接的SOCK_STREAM類型,還是SOCK_DGRAM協議類型,其實現函數均是inet_bind。下面來看一下inet_bind的具體實現:
/*
 * inet_bind - generic AF_INET bind: validate the requested local address
 * and port, then record them on the socket.
 * @sock:     socket being bound
 * @uaddr:    requested local address, interpreted as struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Returns 0 on success, or -EINVAL, -EAFNOSUPPORT, -EADDRNOTAVAIL,
 * -EACCES or -EADDRINUSE. If the protocol supplies its own bind hook,
 * that hook's return value is returned instead.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct inet_sock *inet = inet_sk(sk);
	unsigned short snum;
	int chk_addr_ret;
	int err;

	/* If the socket has its own bind function then use it. (RAW) */
	/*
	 * NOTE(review): the article states that within AF_INET only
	 * IPPROTO_ICMP and IPPROTO_IP provide their own bind hook, while
	 * IPPROTO_TCP and IPPROTO_UDP use this generic inet_bind path.
	 * That cannot be confirmed from this excerpt.
	 */
	if (sk->sk_prot->bind) {
		err = sk->sk_prot->bind(sk, uaddr, addr_len);
		goto out;
	}
	err = -EINVAL;
	/* Reject an address shorter than struct sockaddr_in. */
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	/*
	 * The address family is expected to be AF_INET. For compatibility,
	 * AF_UNSPEC is tolerated as long as the address is INADDR_ANY.
	 */
	if (addr->sin_family != AF_INET) {
		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
		 * only if s_addr is INADDR_ANY.
		 */
		err = -EAFNOSUPPORT;
		if (addr->sin_family != AF_UNSPEC ||
		    addr->sin_addr.s_addr != htonl(INADDR_ANY))
			goto out;
	}

	/* Classify the requested address (local / multicast / broadcast / ...). */
	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	/*
	 * Fail with -EADDRNOTAVAIL only when every escape hatch is off:
	 * sysctl_ip_nonlocal_bind is clear, neither inet->freebind nor
	 * inet->transparent is set, the address is not INADDR_ANY, and its
	 * type is not local, multicast or broadcast.
	 * Per the article, sysctl_ip_nonlocal_bind is a system-wide switch
	 * and freebind a per-socket option, both permitting a non-local bind.
	 */
	err = -EADDRNOTAVAIL;
	if (!sysctl_ip_nonlocal_bind &&
	    !(inet->freebind || inet->transparent) &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
	    chk_addr_ret != RTN_LOCAL &&
	    chk_addr_ret != RTN_MULTICAST &&
	    chk_addr_ret != RTN_BROADCAST)
		goto out;

	snum = ntohs(addr->sin_port);
	err = -EACCES;
	/*
	 * A non-zero port below PROT_SOCK requires CAP_NET_BIND_SERVICE.
	 * (The article gives PROT_SOCK as 1024; the value is not shown here.)
	 */
	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	lock_sock(sk);

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	/* Refuse if the socket is not in TCP_CLOSE or already has a local port. */
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out_release_sock;

	/* Record the requested address as both receive and send address. */
	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
	/*
	 * For a multicast or broadcast address, clear the send source
	 * address so that the device address is used when sending.
	 */
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->inet_saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	/*
	 * Ask the protocol's get_port hook whether the port is usable; a
	 * non-zero return is treated as "address in use".
	 * NOTE(review): per the article, get_port is not a pure query. On
	 * success it also sets inet_sk(sk)->inet_num = snum while holding its
	 * own lock, making the assignment atomic with the check. This is
	 * consistent with inet_num being read below, but get_port itself is
	 * not shown here.
	 */
	if (sk->sk_prot->get_port(sk, snum)) {
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}

	/* Flag that a bind address was set. */
	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	/* Flag that a source port was requested. */
	if (snum)
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	/* inet_sport holds the bound port in network byte order. */
	inet->inet_sport = htons(inet->inet_num);
	/* Clear destination address and port. */
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	/* Reset the socket's cached route. */
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	release_sock(sk);
out:
	return err;
}
EXPORT_SYMBOL(inet_bind);
connect的源碼入口位于socket.c,代碼如下:?
/*
 * connect system call entry: resolve the descriptor to a socket, copy the
 * peer address into kernel space, run the security hook, then hand off to
 * the connect operation in the socket's ops table.
 *
 * Returns the value left in err by the last step that ran.
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	/* Look up the struct socket behind the socket file descriptor. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;
	/* Copy the user-space address into the kernel-side 'address'. */
	err = move_addr_to_kernel(uservaddr, addrlen, &address);
	if (err < 0)
		goto out_put;

	/* Security check. */
	err =
	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
	if (err)
		goto out_put;

	/* As with bind, call the connect operation of the socket's ops table. */
	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
				 sock->file->f_flags);
out_put:
	/* Release the reference taken by the lookup above. */
	fput_light(sock->file, fput_needed);
out:
	return err;
}
對于AF_INET協議族來說,面向連接的協議類型是SOCK_STREAM,其連接函數為inet_stream_connect,而非面向連接的協議類型SOCK_DGRAM,其連接函數為inet_dgram_connect。這很合理,因為從connect的功能實現上看,兩者的實現效果完全不同。
看下UDP的inet_dgram_connect:
/*
 * inet_dgram_connect - connect entry for datagram sockets.
 * @sock:     socket to connect
 * @uaddr:    destination address
 * @addr_len: length of @uaddr in bytes
 * @flags:    passed through to the protocol's disconnect hook
 *
 * Returns -EINVAL for a too-short address, -EAGAIN if automatic binding of
 * a local port fails, otherwise the result of the protocol's disconnect
 * (for AF_UNSPEC) or connect hook.
 */
int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
		       int addr_len, int flags)
{
	struct sock *sk = sock->sk;

	/* The address must at least contain the sa_family field. */
	if (addr_len < sizeof(uaddr->sa_family))
		return -EINVAL;
	/* AF_UNSPEC requests a disconnect: return the protocol's disconnect result. */
	if (uaddr->sa_family == AF_UNSPEC)
		return sk->sk_prot->disconnect(sk, flags);

	/* No local port yet: try to auto-bind one; fail with -EAGAIN if that fails. */
	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
		return -EAGAIN;
	/* Hand off to the protocol-specific connect implementation. */
	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
udp_prot是UDP協議中所有自定義操作函數的集合。其connect的實現函數為ip4_datagram_connect/net/ipv4/datagram.c。
/*
 * ip4_datagram_connect - record a destination on an IPv4 datagram socket.
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as struct sockaddr_in
 * @addr_len: length of @uaddr in bytes
 *
 * Looks up a route to the destination, fills in any unset local addresses
 * from the route lookup, stores the destination address and port, and
 * caches the route on the socket.
 *
 * Returns 0 on success, -EINVAL / -EAFNOSUPPORT for a bad address,
 * -EACCES for a broadcast route on a socket without SOCK_BROADCAST, or the
 * error carried by the route lookup.
 */
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
	struct flowi4 *fl4;
	struct rtable *rt;
	__be32 saddr;
	int oif;
	int err;

	/* Address length check. */
	if (addr_len < sizeof(*usin))
		return -EINVAL;

	/* Only AF_INET destinations are accepted. */
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * connect changes the destination, so the route cached on the
	 * socket is no longer useful and is reset.
	 */
	sk_dst_reset(sk);

	lock_sock(sk);

	/* Start from the interface the socket is bound to and its send address. */
	oif = sk->sk_bound_dev_if;
	saddr = inet->inet_saddr;
	/*
	 * For a multicast destination, fall back to the socket's multicast
	 * settings: mc_index if no interface is bound, mc_addr if no source
	 * address is set.
	 */
	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
		if (!oif)
			oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
	}
	fl4 = &inet->cork.fl.u.ip4;
	/* Look up a route to the requested destination. */
	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
			      RT_CONN_FLAGS(sk), oif,
			      sk->sk_protocol,
			      inet->inet_sport, usin->sin_port, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Count "no route" failures in the MIB statistics. */
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		goto out;
	}

	/* A broadcast route is only allowed if the socket has SOCK_BROADCAST set. */
	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
		ip_rt_put(rt);
		err = -EACCES;
		goto out;
	}
	/* Fill in unset send/receive addresses from the flow's source address. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;	/* Update source address */
	if (!inet->inet_rcv_saddr) {
		inet->inet_rcv_saddr = fl4->saddr;
		/* The receive address changed; let the protocol rehash if it supports it. */
		if (sk->sk_prot->rehash)
			sk->sk_prot->rehash(sk);
	}
	/* Store destination address and port. */
	inet->inet_daddr = fl4->daddr;
	inet->inet_dport = usin->sin_port;
	sk->sk_state = TCP_ESTABLISHED;
	inet->inet_id = jiffies;

	/* Cache the new route on the socket. */
	sk_dst_set(sk, &rt->dst);
	err = 0;
out:
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(ip4_datagram_connect);
UDP套接字創建、綁定或者連接后就可以發送數據了。
Linux提供了如下發送接口:
#include <sys/types.h> #include <sys/socket.h> ssize_t send(int sockfd, const void *buf, size_t len, int flags); ssize_t sendto(int sockfd, const void *buf, size_t len, int flags,const struct sockaddr *dest_addr, socklen_t addrlen); ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags);
send只能用于處理已連接狀態的套接字。而sendto可以在調用時,指定目的地址。這樣的話,如果套接字已經是連接狀態,那么目的地址dest_addr與地址長度就應該為NULL和0,不然就可能會返回錯誤。sendmsg則比較特殊,無論是要發送的數據還是目的地址,都保存在msg中。其中msg->msg_name和msg->msg_namelen用于指明目的地址,而msg->msg_iov則用于保存要發送的數據。這三個系統調用都支持設置指示標志位flags。
send的內核實現代碼如下:
/*
 * send system call entry.
 *
 * send is the special case of sendto with no destination address, so it
 * simply forwards to sys_sendto with a NULL address and zero length and
 * returns its result.
 */
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
		unsigned, flags)
{
	return sys_sendto(fd, buff, len, flags, NULL, 0);
}
既然其內核實現是讓send直接調用sendto,那么,下面我們就來看一下sendto的內核實現,代碼如下:
/*
 * sendto system call entry.
 *
 * Wraps the user buffer in a single-element iovec inside a struct msghdr,
 * optionally attaches a destination address copied from user space, and
 * passes the message to sock_sendmsg.
 *
 * Returns the result of sock_sendmsg, or the error from the descriptor
 * lookup or the address copy.
 */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
		unsigned, flags, struct sockaddr __user *, addr,
		int, addr_len)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err;
	struct msghdr msg;
	struct iovec iov;
	int fput_needed;

	/* Clamp the length to INT_MAX. */
	if (len > INT_MAX)
		len = INT_MAX;
	/* Look up the struct socket behind the file descriptor. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	/* Describe the buffer as a one-element iovec so the msghdr path can be used. */
	iov.iov_base = buff;
	iov.iov_len = len;
	msg.msg_name = NULL;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	/* If a destination address was supplied, attach it as msg_name. */
	if (addr) {
		/* Copy the address argument into the kernel-side 'address'. */
		err = move_addr_to_kernel(addr, addr_len, &address);
		if (err < 0)
			goto out_put;
		msg.msg_name = (struct sockaddr *)&address;
		msg.msg_namelen = addr_len;
	}
	/* A non-blocking socket file implies MSG_DONTWAIT for this message. */
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;
	/* Send the message through sock_sendmsg. */
	err = sock_sendmsg(sock, &msg, len);

out_put:
	/* Release the reference taken by the lookup above. */
	fput_light(sock->file, fput_needed);
out:
	return err;
}
?這里最終調用sock_sendmsg,我們先看下sendmsg調用,看看最后是不是也是調用sock_sendmsg:
/*
 * sendmsg system call entry.
 *
 * Resolves the descriptor to a socket and delegates to __sys_sendmsg,
 * passing a stack msghdr (msg_sys) for the kernel copy of the header and
 * NULL for used_address. Returns __sys_sendmsg's result, or the lookup
 * error if the descriptor does not resolve to a socket.
 */
SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
{
	int fput_needed, err;
	struct msghdr msg_sys;
	/* Look up the struct socket behind the file descriptor. */
	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);

	if (!sock)
		goto out;

	/* Do the actual send in __sys_sendmsg. */
	err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL);

	/* Release the reference taken by the lookup above. */
	fput_light(sock->file, fput_needed);
out:
	return err;
}
?接下來進入__sys_sendmsg,代碼如下:
/*
 * __sys_sendmsg - common body of the sendmsg path.
 * @sock:         socket to send on
 * @msg:          user-space message header
 * @msg_sys:      kernel-side copy of the header, filled in here
 * @flags:        send flags; MSG_CMSG_COMPAT selects the compat header layout
 * @used_address: optional record of the last destination that was sent to
 *                successfully (may be NULL); when it matches the current
 *                destination the send uses sock_sendmsg_nosec
 *
 * Copies the header, the iovec array and the control data into kernel
 * space, then sends via sock_sendmsg or sock_sendmsg_nosec.
 *
 * Returns the send result, or -EFAULT, -EMSGSIZE, -ENOMEM, -ENOBUFS or
 * another negative error from one of the copy/verify steps.
 */
static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
			 struct msghdr *msg_sys, unsigned flags,
			 struct used_address *used_address)
{
	struct compat_msghdr __user *msg_compat =
	    (struct compat_msghdr __user *)msg;
	struct sockaddr_storage address;
	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
	unsigned char ctl[sizeof(struct cmsghdr) + 20]
	    __attribute__ ((aligned(sizeof(__kernel_size_t))));
	/* 20 is size of ipv6_pktinfo */
	unsigned char *ctl_buf = ctl;
	int err, ctl_len, iov_size, total_len;

	err = -EFAULT;
	/* Fetch the message header from user space. */
	if (MSG_CMSG_COMPAT & flags) {
		/*
		 * Compat-layout header (struct compat_msghdr); presumably the
		 * 32-bit userspace ABI — confirm against get_compat_msghdr.
		 */
		if (get_compat_msghdr(msg_sys, msg_compat))
			return -EFAULT;
	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
		return -EFAULT;

	/* do not move before msg_sys is valid */
	err = -EMSGSIZE;
	/* Limit the number of iovec entries. */
	if (msg_sys->msg_iovlen > UIO_MAXIOV)
		goto out;

	/* Check whether to allocate the iovec area */
	err = -ENOMEM;
	/* Size of the iovec array; use the on-stack array unless it is too small. */
	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
		if (!iov)
			goto out;
	}

	/* This will also move the address data into kernel space */
	/*
	 * So far only the header structure has been copied. This step passes
	 * iov and address to verify_iovec / verify_compat_iovec; a
	 * non-negative result is used as total_len.
	 * NOTE(review): the article describes this as copying the message
	 * contents; the helpers are not shown here, so what exactly is copied
	 * should be confirmed against their source.
	 */
	if (MSG_CMSG_COMPAT & flags) {
		err = verify_compat_iovec(msg_sys, iov, &address, VERIFY_READ);
	} else
		err = verify_iovec(msg_sys, iov, &address, VERIFY_READ);
	if (err < 0)
		goto out_freeiov;
	total_len = err;

	err = -ENOBUFS;

	/* Control (ancillary) data is brought into kernel space next. */
	if (msg_sys->msg_controllen > INT_MAX)
		goto out_freeiov;
	ctl_len = msg_sys->msg_controllen;
	if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
		err =
		    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
						     sizeof(ctl));
		if (err)
			goto out_freeiov;
		/* Re-read buffer and length from msg_sys after the conversion. */
		ctl_buf = msg_sys->msg_control;
		ctl_len = msg_sys->msg_controllen;
	} else if (ctl_len) {
		/* Use the on-stack ctl buffer unless the control data is larger. */
		if (ctl_len > sizeof(ctl)) {
			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
			if (ctl_buf == NULL)
				goto out_freeiov;
		}
		err = -EFAULT;
		/*
		 * Careful! Before this, msg_sys->msg_control contains a user pointer.
		 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
		 * checking falls down on this.
		 */
		if (copy_from_user(ctl_buf,
				   (void __user __force *)msg_sys->msg_control,
				   ctl_len))
			goto out_freectl;
		msg_sys->msg_control = ctl_buf;
	}
	/* Set the message flags. */
	msg_sys->msg_flags = flags;

	/* A non-blocking socket file implies MSG_DONTWAIT. */
	if (sock->file->f_flags & O_NONBLOCK)
		msg_sys->msg_flags |= MSG_DONTWAIT;
	/*
	 * If this is sendmmsg() and current destination address is same as
	 * previously succeeded address, omit asking LSM's decision.
	 * used_address->name_len is initialized to UINT_MAX so that the first
	 * destination address never matches.
	 */
	if (used_address && msg_sys->msg_name &&
	    used_address->name_len == msg_sys->msg_namelen &&
	    !memcmp(&used_address->name, msg_sys->msg_name,
		    used_address->name_len)) {
		/* Same destination as the last success: send without the security check. */
		err = sock_sendmsg_nosec(sock, msg_sys, total_len);
		goto out_freectl;
	}
	/*
	 * Normal path, with the security check. Per the article this ends up
	 * in sock_sendmsg_nosec as well; sock_sendmsg is not shown here.
	 */
	err = sock_sendmsg(sock, msg_sys, total_len);
	/*
	 * If this is sendmmsg() and sending to current destination address was
	 * successful, remember it.
	 */
	if (used_address && err >= 0) {
		used_address->name_len = msg_sys->msg_namelen;
		if (msg_sys->msg_name)
			memcpy(&used_address->name, msg_sys->msg_name,
			       used_address->name_len);
	}

out_freectl:
	/* Free the control buffer only if it is not the on-stack ctl array. */
	if (ctl_buf != ctl)
		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
out_freeiov:
	/* Free the iovec array only if it is not the on-stack iovstack. */
	if (iov != iovstack)
		sock_kfree_s(sock->sk, iov, iov_size);
out:
	return err;
}
看完了__sys_sendmsg,我們可以確定,無論是哪個發送數據的系統調用,最終都會調用到sock_sendmsg。
文章有點長,sock_sendmsg放到下篇分析。
參考文檔:
1. 《Linux環境編程:從應用到內核》
2. 淺析Linux網絡子系統(一)
總結
以上是生活随笔為你收集整理的tcp/ip 协议栈Linux内核源码分析12 udp套接字发送流程一的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 添加一个hello wrold系统调用到
- 下一篇: 社会抚养费是什么意思