linux内核tcp_hdr,TCP的URG标志和内核实现
TCP的URG標志和內核實現之一:協議
定義urgent數據的目的:
urgent機制,是用于通知應用層需要接收urgent data,在urgent data接收完成后,通知應用層urgent data數據接收完畢。相關協議文本RFC793 RFC1122 RFC6093
哪些數據是urgent data?
協議規定
在TCP報頭的URG位有效的時候,通過TCP報頭中的urgent pointer來標識urgent data的位置,但是在urgent pointer的解析方式上各個協議文本的描述有差異:
解讀一:RFC793 P17,描述是“The urgent pointer points to the sequence number of the octet following the urgent data.”,在P41有描述“This mechanism permits a point in the data stream to be designated as the end of urgent information. Whenever this point is in advance of the receive sequence number (RCV.NXT) at the receiving TCP, that TCP must tell the user to go into "urgent mode"; when the receive sequence number catches up to the urgent pointer, the TCP must tell user to go out of "urgent mode"”,可以認為是:當前接收的報文中SEQ在SEG.SEQ+Urgent Pointer之前的都是urgent data,而urgent pointer指向第一個非urgent data的字節(TCP已經接收,但是還沒有提交給應用的數據是不是呢?)
解讀二:在P56的描述是“If the urgent flag is set, then SND.UP <- SND.NXT-1”,即urgent pointer指向urgent data的最後一個字節;RFC1122 P84明確採用的正是這種解讀(urgent pointer points to the sequence number of the LAST octet in a sequence of urgent data)。
linux實現
雖然在RFC1122中消除了這一歧義,linux仍然使用了解讀一的解析方式,如果要使用解讀二定義的方式,需要使用tcp_stdurg這個配置項。
urgent data數據能有多長?
協議規定
按照RFC793 P41的描述,長度不受限,RFC1122 P84中,更是明確了“A TCP MUST support a sequence of urgent data of any length”
linux實現
其實,linux只支持1BYTE的urgent data
urgent data與OOB數據
OOB數據說的是帶外數據,也就是這些數據不是放到TCP流供讀取的,而是通過額外的接口來獲取,linux默認把urgent data實現為OOB數據;而按照協議的規定,urgent data不是out of band data
由于OOB數據的協議和實現上存在很多不確定因素,因此現在已經不建議使用了
TCP的URG標志和內核實現之二:發送的實現
Linux內核在默認情況下,把urgent data實現為OOB數據
發送URG數據的接口
在內核態,使用kernel_sendmsg/kernel_sendpage完成發送,只不過需要加上MSG_OOB標志,表示要發送的URG數據。
URG數據發送接口的實現
分片主要在kernel_sendmsg中完成,在OOB數據的處理上,它和kernel_sendpage是一致的
/*
 * tcp_sendmsg - copy data from user space into the socket write queue
 * and (possibly) transmit it.
 *
 * Excerpt quoted from the kernel source; the "。。。" runs mark code
 * elided by the article's author.  The MSG_OOB-relevant spots are:
 * the mss/size_goal selection (TSO disabled for urgent data) and the
 * decision to keep accumulating full segments instead of pushing them.
 */
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
。。。。。。。。。。。。。。
	/* When flags contains MSG_OOB, the mss_now returned here has TSO
	 * disabled (tcp_send_mss passes !(flags & MSG_OOB) as
	 * large_allowed to tcp_xmit_size_goal). */
	mss_now = tcp_send_mss(sk, &size_goal, flags);
。。。。。。。。。。。。。。
	while (--iovlen >= 0) {
		size_t seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy = 0;
			int max = size_goal;

			skb = tcp_write_queue_tail(sk);
			if (tcp_send_head(sk)) {
				if (skb->ip_summed == CHECKSUM_NONE)
					max = mss_now;
				copy = max - skb->len;
			}

			if (copy <= 0) {
new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_skb(sk,
							  select_size(sk, sg),
							  sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

				skb_entail(sk, skb);
				copy = size_goal;
				max = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_availroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				copy = min_t(int, copy, skb_availroom(skb));
				err = skb_add_data_nocache(sk, skb, from, copy);
				if (err)
					goto do_fault;
			} else {
				int merge = 0;
				int i = skb_shinfo(skb)->nr_frags;
				struct page *page = sk->sk_sndmsg_page;
				int off;

				if (page && page_count(page) == 1)
					sk->sk_sndmsg_off = 0;

				off = sk->sk_sndmsg_off;

				if (skb_can_coalesce(skb, i, page, off) &&
				    off != PAGE_SIZE) {
					/* We can extend the last page
					 * fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS || !sg) {
					/* Need to add new fragment and cannot
					 * do this because interface is non-SG,
					 * or because all the page slots are
					 * busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					if (off == PAGE_SIZE) {
						put_page(page);
						sk->sk_sndmsg_page = page = NULL;
						off = 0;
					}
				} else
					off = 0;

				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;

				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_memory;

				if (!page) {
					/* Allocate new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
				}

				/* Time to copy data. We are close to
				 * the end! */
				err = skb_copy_to_page_nocache(sk, from, skb,
							       page, off, copy);
				if (err) {
					/* If this page was new, give it to the
					 * socket so it does not get leaked.
					 */
					if (!sk->sk_sndmsg_page) {
						sk->sk_sndmsg_page = page;
						sk->sk_sndmsg_off = 0;
					}
					goto do_error;
				}

				/* Update the skb. */
				if (merge) {
					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				} else {
					skb_fill_page_desc(skb, i, page, off, copy);
					if (sk->sk_sndmsg_page) {
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						get_page(page);
						sk->sk_sndmsg_page = page;
					}
				}

				sk->sk_sndmsg_off = off + copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->gso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			/* For OOB data: even when one full segment has been
			 * filled, keep accumulating segments as long as send
			 * buffer and OOB data remain, instead of pushing the
			 * way ordinary data is pushed. */
			if (skb->len < max || (flags & MSG_OOB))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_send_mss(sk, &size_goal, flags);
		}
	}

out:
	if (copied)
		tcp_push(sk, flags, mss_now, tp->nonagle);
	release_sock(sk);
	return copied;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	release_sock(sk);
	return err;
}
tcp_sendmsg中,涉及對OOB數據的處理主要有:
1、在調用tcp_send_mss確定分片大小的時候:
/*
 * Compute the current MSS and the per-skb size goal for a send.
 *
 * @sk:        socket being sent on
 * @size_goal: out parameter - how many bytes one skb may accumulate
 * @flags:     sendmsg flags
 *
 * Returns the current MSS.  When MSG_OOB is set, large_allowed is 0 and
 * tcp_xmit_size_goal() caps the goal at one MSS, effectively disabling
 * TSO for urgent data.
 *
 * Fix vs. quoted text: the scrape had fused tokens ("intmss_now",
 * "returnmss_now") that made this invalid C; spacing restored.
 */
static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	/* MSG_OOB => large_allowed == 0 => TSO disabled for this send */
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}
如果是OOB數據,其實是關閉了TSO功能,這樣做的原因是:天知道各個網卡芯片在執行分片的時候咋個處理TCP報頭中的URG標志和urgent point
2、在確定何時開始執行分片的發送的時候:
如果是OOB數據,即使當前已經積累了一整個分片,也不會像普通的數據一樣執行發送(tcp_push),而是繼續積累直到用戶下發的數據全部分片或者snd_buf/內存用盡。
3、執行tcp_push的時候:
在用戶下發的數據全部分片或者snd_buf/內存用盡后,進入tcp_push執行發送操作(所有的OOB數據,都會通過這個接口來執行發送)
/*
 * Push pending write-queue data out to the network.
 *
 * All urgent (MSG_OOB) data is sent through this path: tcp_mark_urg()
 * records tp->snd_up (the urgent pointer, one past the last urgent
 * byte), which switches the connection into urgent-data send mode so
 * tcp_transmit_skb() will set the URG bit and pointer.
 *
 * Fix vs. quoted text: the scrape had the fused token "structtcp_sock",
 * which made this invalid C; spacing restored.
 */
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
			    int nonagle)
{
	if (tcp_send_head(sk)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, tcp_write_queue_tail(sk));

		/* For MSG_OOB: set tp->snd_up to mark urgent-send mode;
		 * the urgent pointer addresses the byte following the
		 * urgent data. */
		tcp_mark_urg(tp, flags);

		__tcp_push_pending_frames(sk, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK :
							       nonagle);
	}
}
發送處理
使用struct tcp_sock中的snd_up來標識當前的urgent point,同時也使用該數據來判斷當前是否處于urgent data發送模式,在普通數據的發送模式中tcp_sock::snd_up總是和tcp_sock::snd_una相等,只有在有urgent data發送的時候,才在tcp_push—>tcp_mark_urg中設置為urgentpoint,進入到urgent data的處理模式
在tcp_transmit_skb中的以下代碼段負責urgent data相關的處理:
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {if (before(tp->snd_up, tcb->seq + 0x10000)) {th->urg_ptr = htons(tp->snd_up - tcb->seq);th->urg = 1;} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {th->urg_ptr = htons(0xFFFF);th->urg = 1;}}
只要當前待發送的skb的seq在tcp_sock記錄的urgent point前面,就需要在報頭中對URG標志置位;如果tcp_sock記錄的urgent point距離該報文的seq大於16位能表示的最大值,就置TCP報頭中的urgent point為65535。
切換回普通模式:
在收到對方ACK的處理流程tcp_ack—>tcp_clean_rtx_queue中:
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))tp->snd_up = tp->snd_una;
報文體現
根據對發送代碼的分析,可以看到:如果用戶使用MSG_OOB數據發送一段比較長(若干個MSS)的數據,那么線路上的報文應該是分成了若干組,每組由若干個長度為MSS的報文構成,組內的每個報文有一樣的urgent pointer,指向下一組報文的起始seq,每一組的長度最長為65535。
TCP的URG標志和內核實現之三:接收的實現
大致的處理過程
TCP的接收流程:在tcp_v4_do_rcv中的相關處理(網卡收到報文觸發)中,會首先通過tcp_check_urg設置tcp_sock的urg_data為TCP_URG_NOTYET(urgent point指向的可能不是本報文,而是後續報文或者前面收到的亂序報文),並保存最新的urgent data的sequence和對應的1 BYTE urgent data到tcp_sock的urg_data(如果之前的urgent data沒有讀取,就會被覆蓋)。
用戶接收流程:在tcp_recvmsg流程中,如果發現當前的skb的數據中有urgent data,首先拷貝urgent data之前的數據,然后tcp_recvmsg退出,提示用戶來接收OOB數據;在用戶下一次調用tcp_recvmsg來接收數據的時候,會跳過urgent data,并設置urgent data數據接收完成。
相關的數據結構和定義
tcp_sock結構:
1、 urg_data成員,其高8bit為urgent data的接收狀態;其低8位為保存的1BYTE urgent數據。urgent data的接收狀態對應的宏的含義描述:
/* High byte of tcp_sock::urg_data encodes the receive state of the
 * urgent byte; the low byte holds the single byte of urgent data.
 * Fix vs. quoted text: the scrape dropped the space after "#define",
 * making each directive invalid. */
#define TCP_URG_VALID	0x0100	/* urgent byte stored in tcp_sock::urg_data */
#define TCP_URG_NOTYET	0x0200	/* URG announced, byte not yet captured */
#define TCP_URG_READ	0x0400	/* urgent byte already consumed via MSG_OOB */
2、 urg_seq成員,為當前的urgent data的sequence
流程詳情
TCP的接收過程
在tcp_rcv_established的slow_path中
slow_path:if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))goto csum_error;/** Standard slow path.*/if (!tcp_validate_incoming(sk, skb, th, 1))return 0;step5:if (th->ack &&tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)goto discard;tcp_rcv_rtt_measure_ts(sk, skb);/* 處理緊急數據. */tcp_urg(sk, skb, th);
也就是在報文的CRC驗證和sequence驗證完成后,就會通過tcp_urg來處理接收到的urgent data :
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th){struct tcp_sock *tp = tcp_sk(sk);/*收到了urgent data,則檢查和設置urg_data和urg_seq成員*/if (th->urg)tcp_check_urg(sk, th);/* Do we wait for any urgent data? - normally not...發現了有urgent data,但是還沒有保存到tp->urg_data*/if (tp->urg_data == TCP_URG_NOTYET) {u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -th->syn;/* Is the urgent pointer pointing into this packet? */if (ptr < skb->len) {u8 tmp;if (skb_copy_bits(skb, ptr, &tmp, 1))BUG();tp->urg_data = TCP_URG_VALID | tmp;if (!sock_flag(sk, SOCK_DEAD))sk->sk_data_ready(sk, 0);}}}
檢查和設置urg_data和urg_seq成員的處理函數tcp_check_urg的具體流程
/*
 * Validate a received urgent pointer and arm urgent-data reception.
 *
 * Converts th->urg_ptr into an absolute sequence number, rejects stale
 * or duplicate urgent pointers, signals SIGURG to the socket owner, and
 * records the new urgent point in tp->urg_seq with
 * tp->urg_data = TCP_URG_NOTYET (the urgent byte itself may arrive in
 * this segment or a later one; tcp_urg() captures it).
 */
static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 ptr = ntohs(th->urg_ptr);

	/* Two interpretations of the urgent pointer:
	 *  1) it points to the first byte AFTER the urgent data (RFC 793);
	 *  2) it points to the LAST byte of the urgent data (RFC 1122).
	 * When sysctl_tcp_stdurg selects mode 2, ptr already addresses
	 * the final urgent byte and must not be decremented. */
	if (ptr && !sysctl_tcp_stdurg)
		ptr--;
	ptr += ntohl(th->seq);

	/* Ignore urgent data that we've already seen and read.
	 * NOTE(review): for segments arriving via tcp_rcv_established,
	 * tcp_validate_incoming has already rejected out-of-window data,
	 * so which case this guards was an open question for the
	 * article's author - confirm against the call sites. */
	if (after(tp->copied_seq, ptr))
		return;

	/* Do not replay urg ptr.
	 *
	 * NOTE: interesting situation not covered by specs.
	 * Misbehaving sender may send urg ptr, pointing to segment,
	 * which we already have in ofo queue. We are not able to fetch
	 * such data and will stay in TCP_URG_NOTYET until will be eaten
	 * by recvmsg(). Seems, we are not obliged to handle such wicked
	 * situations. But it is worth to think about possibility of some
	 * DoSes using some hypothetical application level deadlock.
	 */
	if (before(ptr, tp->rcv_nxt))
		return;

	/* Do we already have a newer (or duplicate) urgent pointer?
	 * If urgent mode is already active and this pointer does not
	 * advance past the recorded one, tp->urg_seq is already being
	 * handled - nothing to do. */
	if (tp->urg_data && !after(ptr, tp->urg_seq))
		return;

	/* Tell the world about our new urgent pointer. */
	sk_send_sigurg(sk);

	/* We may be adding urgent data when the last byte read was
	 * urgent. To do this requires some care. We cannot just ignore
	 * tp->copied_seq since we would read the last urgent byte again
	 * as data, nor can we alter copied_seq until this data arrives
	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
	 *
	 * NOTE. Double Dutch. Rendering to plain English: author of comment
	 * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
	 * and expect that both A and B disappear from stream. This is _wrong_.
	 * Though this happens in BSD with high probability, this is occasional.
	 * Any application relying on this is buggy. Note also, that fix "works"
	 * only in this artificial test. Insert some normal data between A and B
	 * and we will decline of BSD again. Verdict: it is better to remove to
	 * trap buggy users.
	 */
	/* The next byte the user would read is exactly the still-unread
	 * urgent byte, and newer unread data exists: skip over the stale
	 * urgent byte (freeing its skb if fully consumed). */
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
		tp->copied_seq++;
		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			__kfree_skb(skb);
		}
	}

	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = ptr;

	/* Disable header prediction. */
	tp->pred_flags = 0;
}
用戶接收數據接口
用戶接收URG數據的接口
在用戶接收數據的tcp_recvmsg函數中,如果用戶通過MSG_OOB來接收數據,會進入tcp_recv_urg處理
/*
 * Service a recvmsg(..., MSG_OOB) request: hand the single stored
 * urgent byte (low byte of tp->urg_data) to the user.
 *
 * Returns the number of bytes copied (1), 0 on closed/shutdown states,
 * or a negative errno: -EINVAL when there is no unread urgent byte
 * (inline mode, nothing pending, or already read), -ENOTCONN when the
 * socket was never connected, -EFAULT on user-copy failure, -EAGAIN
 * when urgent data was announced but the byte has not arrived yet.
 */
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read: inline mode, nothing pending, or the user
	 * has already consumed it. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	/* tp->urg_data holds a valid urgent byte - copy it out. */
	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		/* Mark the urgent byte consumed (unless only peeking). */
		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike */
	return -EAGAIN;
}
用戶接收普通數據的接口中的相關處理
在用戶接收數據的tcp_recvmsg函數中,在查找到待拷貝的skb后,首先拷貝urgent data數據前的數據,然后退出接收過程,在用戶下一次執行tcp_recvmsg的時候跳過urgent data,設置urgent data讀取結束
查找到準備拷貝的skb后的處理:
found_ok_skb:/* Ok so how much can we use? */used = skb->len - offset;if (len < used)used = len;/* 當前有urg_data數據*/if (tp->urg_data) {u32 urg_offset = tp->urg_seq - *seq;/*urgent data在當前待拷貝的數據范圍內*/if (urg_offset < used) {if (!urg_offset) {/*待拷貝的數據就是urgent data,跨過該urgent data,只給用戶讀取后面的數據*/if (!sock_flag(sk, SOCK_URGINLINE)) {++*seq;urg_hole++;offset++;used--;if (!used)goto skip_copy;}}} else/*指定只拷貝urgent data數據之前的,完成后在下一次循環開始的位置,會退出循環,返回用戶;下一次用戶調用tcp_recvmsg就進入到上面的分支了*/used = urg_offset;}}
skip_copy:/*用戶讀取的數據跨過了urgent point,設置讀取結束開啟fast path*/if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {tp->urg_data = 0;tcp_fast_path_check(sk);}if (used + offset < skb->len)continue;
在接收完urgent data數據前的所有數據之后, tcp_recvmsg的以下代碼片段得到執行,這段代碼退出當前接收過程,提示用戶有urgent data數據到來,需要用MSG_OOB來接收
if (tp->urg_data && tp->urg_seq == *seq) {if (copied)break;if (signal_pending(current)) {copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;break;}}
后記
TCP的urg數據,由于定義和實現上的混亂,當前已經不建議使用,但是為了兼容之前已經存在的實現,該機制會長期在內核中存在,如果不了解該機制及其內核行為,有可能就很難解釋一些奇怪的問題:比如某段代碼不小心地造成send接口事實上設置了MSG_OOB,就會造成接收端少了一個BYTE。
總結
以上是生活随笔為你收集整理的linux内核tcp_hdr,TCP的URG标志和内核实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: oppo android多大内存,OPP
- 下一篇: linux mysql max_allo