ip_vs实现分析(2)
生活随笔
收集整理的这篇文章主要介绍了
ip_vs实现分析(2)
小编觉得挺不错的,现在分享给大家,帮大家做个参考。
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
4. 模块初始化
初始化函数先初始化ipvs的各种处理机制,然后将ipvs的处理函数挂接到netfilter架构中。
/* net/ipv4/ipvs/ip_vs_core.c */

/*
 * Module initialisation: set up the IPVS subsystems first, then attach the
 * IPVS handlers to the netfilter hooks.
 *
 * Returns the result of the last step executed: the negative value of the
 * first step that failed, or the result of the final nf_register_hook()
 * call when every step succeeded.  On failure, the steps that had already
 * completed are undone by falling through the cleanup labels.
 */
static int __init ip_vs_init(void)
{
	int ret;

	/* control interface initialisation (the article labels it "ioctl") */
	ret = ip_vs_control_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup control.\n");
		goto cleanup_nothing;
	}

	/* protocol initialisation; its return value is not checked here */
	ip_vs_protocol_init();

	/* application-layer helper initialisation */
	ret = ip_vs_app_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup application helper.\n");
		goto cleanup_protocol;
	}

	/* IPVS connection initialisation */
	ret = ip_vs_conn_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup connection table.\n");
		goto cleanup_app;
	}

	/* attach each handler to its hook point in the netfilter framework */
	ret = nf_register_hook(&ip_vs_in_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register in hook.\n");
		goto cleanup_conn;
	}

	ret = nf_register_hook(&ip_vs_out_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register out hook.\n");
		goto cleanup_inops;
	}

	ret = nf_register_hook(&ip_vs_post_routing_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register post_routing hook.\n");
		goto cleanup_outops;
	}

	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register forward_icmp hook.\n");
		goto cleanup_postroutingops;
	}

	IP_VS_INFO("ipvs loaded.\n");
	return ret;

	/* on failure, release what was set up, in reverse order */
  cleanup_postroutingops:
	nf_unregister_hook(&ip_vs_post_routing_ops);
  cleanup_outops:
	nf_unregister_hook(&ip_vs_out_ops);
  cleanup_inops:
	nf_unregister_hook(&ip_vs_in_ops);
  cleanup_conn:
	ip_vs_conn_cleanup();
  cleanup_app:
	ip_vs_app_cleanup();
  cleanup_protocol:
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
  cleanup_nothing:
	return ret;
}
4.1 ip_vs_control_init /* net/ipv4/ipvs/ip_vs_ctl.c */
int ip_vs_control_init(void)
{
?int ret;
?int idx; EnterFunction(2); // 登記ipvs的sockopt控制,這樣用戶空間可通過setsockopt函數(shù)來和ipvs進(jìn)行通信
?ret = nf_register_sockopt(&ip_vs_sockopts);
?if (ret) {
??IP_VS_ERR("cannot register sockopt.\n");
??return ret;
?} // 建立/proc/net/ip_vs和/proc/net/ip_vs_stats只讀項(xiàng)
?proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
?proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops); // 建立/proc/sys/net/ipv4/vs目錄下的各可讀寫控制參數(shù)
?sysctl_header = register_sysctl_table(vs_root_table, 0); // 初始化各種雙向鏈表
// svc_table是根據(jù)協(xié)議地址端口等信息進(jìn)行服務(wù)結(jié)構(gòu)struct ip_vs_service查找的HASH表
// svc_fwm_table是根據(jù)數(shù)據(jù)包的nfmark信息進(jìn)行服務(wù)結(jié)構(gòu)struct ip_vs_service查找的HASH表
?/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
?for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)? {
??INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
??INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
?}
// rtable是目的結(jié)構(gòu)struct ip_vs_dest的HASH鏈表
?for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)? {
??INIT_LIST_HEAD(&ip_vs_rtable[idx]);
?} // ipvs統(tǒng)計(jì)信息
?memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
// 統(tǒng)計(jì)鎖
?spin_lock_init(&ip_vs_stats.lock);
// 對(duì)當(dāng)前統(tǒng)計(jì)信息建立一個(gè)預(yù)估器,可用于計(jì)算服務(wù)器的性能參數(shù)
?ip_vs_new_estimator(&ip_vs_stats); /* Hook the defense timer */
// 掛一個(gè)定時(shí)操作,根據(jù)系統(tǒng)當(dāng)前負(fù)載情況定時(shí)調(diào)整系統(tǒng)參數(shù)
?schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); LeaveFunction(2);
?return 0;
}
4.2 ip_vs_protocol_init
/* net/ipv4/ipvs/ip_vs_proto.c */
int ip_vs_protocol_init(void)
{
// 掛接ipvs能進(jìn)行均衡處理的各種協(xié)議,目前支持TCP/UDP/AH/ESP
// 最好還要增加GRE,在PPTP服務(wù)器中使用
?char protocols[64];
#define REGISTER_PROTOCOL(p)???\
?do {?????\
??register_ip_vs_protocol(p);?\
??strcat(protocols, ", ");?\
??strcat(protocols, (p)->name);?\
?} while (0) // 0,1字符是給", "預(yù)留的
?protocols[0] = '\0';
?protocols[2] = '\0'; // 登記各種協(xié)議
#ifdef CONFIG_IP_VS_PROTO_TCP
?REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
?REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
?REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
?REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
// 第0,1字符分別為逗號(hào)','和空格' ',從第2字符起才是真正數(shù)據(jù)串
?IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); return 0;
} register_ip_vs_protocol()函數(shù)就是把ip_vs_protocol結(jié)構(gòu)掛接到協(xié)議HASH表中,不過其實(shí)沒幾個(gè)協(xié)議,沒必要用HASH,直接數(shù)組就行了,Linux內(nèi)核中缺省好象也只支持32種IP協(xié)議。 /*
?*?register an ipvs protocol
?*/
static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
?unsigned hash = IP_VS_PROTO_HASH(pp->protocol); // 把新協(xié)議節(jié)點(diǎn)掛接到HASH鏈表頭
?pp->next = ip_vs_proto_table[hash];
?ip_vs_proto_table[hash] = pp; // 調(diào)用該協(xié)議的初始化函數(shù)
?if (pp->init != NULL)
??pp->init(pp); return 0;
}
4.3 ip_vs_app_init IPVS应用初始化 /* net/ipv4/ipvs/ip_vs_app.c */
/*
 * Initialise the IPVS application-helper layer.  All this does is create
 * the /proc/net/ip_vs_app entry; the result of that call is not checked
 * and the function always returns 0.
 */
int ip_vs_app_init(void)
{
	/* we will replace it with proc_net_ipvs_create() soon */
	proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
	return 0;
}
4.4 ip_vs_conn_init IPVS连接初始化 /* net/ipv4/ipvs/ip_vs_conn.c */
int ip_vs_conn_init(void)
{
?int idx; /*
? * Allocate the connection hash table and initialize its list heads
? */
// ipvs連接HASH表
?ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
?if (!ip_vs_conn_tab)
??return -ENOMEM; /* Allocate ip_vs_conn slab cache */
// ipvs連接cache,由于使用cache在內(nèi)存塊釋放時(shí)并不真正釋放,而是cache起來,
// 因此重新分配時(shí)速度更快
?ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
?????????? sizeof(struct ip_vs_conn), 0,
?????????? SLAB_HWCACHE_ALIGN, NULL, NULL);
?if (!ip_vs_conn_cachep) {
??vfree(ip_vs_conn_tab);
??return -ENOMEM;
?} IP_VS_INFO("Connection hash table configured "
???? "(size=%d, memory=%ldKbytes)\n",
???? IP_VS_CONN_TAB_SIZE,
???? (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
?IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
??? sizeof(struct ip_vs_conn));
// 初始化各HASH鏈表頭
?for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
??INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
?}
// 初始化各讀寫鎖
?for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)? {
??rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
?} // 建立/proc/net/ip_vs_conn項(xiàng)
?proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); /* calculate the random value for connection hash */
// 初始隨機(jī)數(shù)
?get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); return 0;
}
4.5 netfilter挂接点
nf_hook_ops分别在FORWARD点挂2个, INPUT点和POST_ROUTING点各挂一个
/* net/ipv4/ipvs/ip_vs_core.c */ 4.5.1 ip_vs_in_ops
/* After packet filtering, forward packet through VS/DR, VS/TUN,
?? or VS/NAT(change destination), so that filtering rules can be
?? applied to IPVS. */
static struct nf_hook_ops ip_vs_in_ops = {
?.hook??= ip_vs_in,
?.owner??= THIS_MODULE,
?.pf??= PF_INET,
// INPUT點(diǎn)
?.hooknum??????? = NF_IP_LOCAL_IN,
// 此優(yōu)先級(jí)低于filter
?.priority?????? = 100,
};
ip_vs_in()这个函数对进入本机的包进行处理. /* net/ipv4/ipvs/ip_vs_core.c */ /*
?*?Check if it's for virtual services, look it up,
?*?and send it on its way...
?*/
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
? const struct net_device *in, const struct net_device *out,
? int (*okfn)(struct sk_buff *))
{
?struct sk_buff?*skb = *pskb;
?struct iphdr?*iph;
?struct ip_vs_protocol *pp;
?struct ip_vs_conn *cp;
?int ret, restart;
?int ihl; /*
? *?Big tappo: only PACKET_HOST (neither loopback nor mcasts)
? *?... don't know why 1st test DOES NOT include 2nd (?)
? */
?if (unlikely(skb->pkt_type != PACKET_HOST
?????? || skb->dev == &loopback_dev || skb->sk)) {
// input不處理目的非本機(jī)的包
??IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
???? skb->pkt_type,
???? skb->nh.iph->protocol,
???? NIPQUAD(skb->nh.iph->daddr));
??return NF_ACCEPT;
?} iph = skb->nh.iph;
?if (unlikely(iph->protocol == IPPROTO_ICMP)) {
// 如果是ICMP,可能是指示連接錯(cuò)誤的ICMP信息,調(diào)用ip_vs_in_icmp進(jìn)行檢查
// 是否是相關(guān)的ICMP信息
??int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); if (related)
???return verdict;
// 非相關(guān)ICMP,恢復(fù)處理流程
// 但其實(shí)ipvs是不均衡ICMP信息的,后面就返回了
??skb = *pskb;
??iph = skb->nh.iph;
?} /* Protocol supported? */
// 獲取協(xié)議支持模塊,由于只支持TCP、UDP、AH和ESP,如果是ICMP,返回為NULL
?pp = ip_vs_proto_get(iph->protocol);
?if (unlikely(!pp))
??return NF_ACCEPT; ihl = iph->ihl << 2; /*
? * Check if the packet belongs to an existing connection entry
? */
// 找到和該skb相關(guān)的ipvs連接,類似netfilter的根據(jù)tuple查找連接,
// 不過sk_buff結(jié)構(gòu)中沒有增加nfct那樣能直接指向連接的成員
// 對(duì)TCP協(xié)議來說是tcp_conn_in_get()
?cp = pp->conn_in_get(skb, pp, iph, ihl, 0); if (unlikely(!cp)) {
??int v;
// 如果沒有連接, 表明是新連接, 調(diào)用IPVS連接的conn_schedule調(diào)度連接分配和處理
// 連接調(diào)度要根據(jù)調(diào)度算法選擇一個(gè)真實(shí)目的服務(wù)器,然后建立新的IPVS連接
// 對(duì)TCP協(xié)議來說是tcp_conn_schedule()
??if (!pp->conn_schedule(skb, pp, &v, &cp))
???return v;
?} if (unlikely(!cp)) {
// 這種情況主要是沒內(nèi)存空間了,IPVS沒提供主動(dòng)刪除連接的機(jī)制
??/* sorry, all this trouble for a no-hit :) */
??IP_VS_DBG_PKT(12, pp, skb, 0,
???????? "packet continues traversal as normal");
??return NF_ACCEPT;
?} IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); /* Check the server status */
?if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
??/* the destination server is not available */
// 對(duì)于目的服務(wù)器失效的包丟棄
??if (sysctl_ip_vs_expire_nodest_conn) {
???/* try to expire the connection immediately */
???ip_vs_conn_expire_now(cp);
??}
??/* don't restart its timer, and silently
???? drop the packet. */
??__ip_vs_conn_put(cp);
??return NF_DROP;
?}
// 連接信息統(tǒng)計(jì)
?ip_vs_in_stats(cp, skb); // 進(jìn)行連接狀態(tài)的遷移, restart這個(gè)參數(shù)其實(shí)沒用
// 對(duì)TCP協(xié)議來說是調(diào)用tcp_state_transition
?restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); if (cp->packet_xmit)
// 將包發(fā)送出去, 具體xmit的實(shí)現(xiàn)在ip_vs_xmit.c中實(shí)現(xiàn),
// NAT模式下為 ip_vs_nat_xmit;
// 通道模式下為 ip_vs_tunnel_xmit;
// 直接路由模式下為:? ip_vs_dr_xmit;
// 本機(jī)數(shù)據(jù)為: ip_vs_null_xmit;
// 旁路模式下為: ip_vs_bypass_xmit;
// 函數(shù)成功時(shí)基本都返回NF_STOLEN使netfilter不再處理該包
// 所以對(duì)于NAT模式,應(yīng)該是不需要配置DNAT規(guī)則的,請(qǐng)求方向數(shù)據(jù)也不經(jīng)過FORWARD鏈
??ret = cp->packet_xmit(skb, cp, pp);
??/* do not touch skb anymore */
?else {
??IP_VS_DBG_RL("warning: packet_xmit is null");
??ret = NF_ACCEPT;
?} /* increase its packet counter and check if it is needed
??? to be synchronized */
?atomic_inc(&cp->in_pkts); // 在進(jìn)行均衡器熱備時(shí)將連接信息要從MASTER傳遞到SLAVE,使系統(tǒng)切換時(shí)
// 連接不丟棄,但還是要有一定條件才進(jìn)行同步
?if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
// 同步狀態(tài)類型為主機(jī)
???? (cp->protocol != IPPROTO_TCP ||
????? cp->state == IP_VS_TCP_S_ESTABLISHED) &&
// 非TCP連接或是已經(jīng)建立的連接
???? (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
????? == sysctl_ip_vs_sync_threshold[0]))
// 當(dāng)前連接的包數(shù)為N*thres[1]+thres[0]時(shí)
// 進(jìn)行連接的同步
??ip_vs_sync_conn(cp);
// 調(diào)整連接超時(shí),釋放連接計(jì)數(shù)
?ip_vs_conn_put(cp);
?return ret;
}
4.5.2 ip_vs_out_ops
/* After packet filtering, change source only for VS/NAT */
static struct nf_hook_ops ip_vs_out_ops = {
?.hook??= ip_vs_out,
?.owner??= THIS_MODULE,
?.pf??= PF_INET,
// FORWARD點(diǎn)
?.hooknum??????? = NF_IP_FORWARD,
// 此優(yōu)先級(jí)低于filter
?.priority?????? = 100,
}; ip_vs_out()這個(gè)函數(shù)對(duì)轉(zhuǎn)發(fā)包進(jìn)行處理, 只用在NAT模式的均衡處理,TUNNEL和DR方式下都是直接發(fā)送了,實(shí)際處理的只是服務(wù)器返回的回應(yīng)包,而客戶端請(qǐng)求的包是不經(jīng)過這里的,但如果設(shè)置了DNAT規(guī)則,數(shù)據(jù)包在PREROUTING點(diǎn)進(jìn)行了目的地址修改,這樣就不會(huì)再進(jìn)入INPUT點(diǎn)而是直接轉(zhuǎn)到FORWARD點(diǎn)處理,這時(shí)時(shí)針對(duì)該包的IPVS連接是沒有建立的。
/* net/ipv4/ipvs/ip_vs_core.c */ /*
?*?It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
?*?Check if outgoing packet belongs to the established ip_vs_conn,
?*????? rewrite addresses of the packet and send it on its way...
?*/
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
?? const struct net_device *in, const struct net_device *out,
?? int (*okfn)(struct sk_buff *))
{
?struct sk_buff? *skb = *pskb;
?struct iphdr?*iph;
?struct ip_vs_protocol *pp;
?struct ip_vs_conn *cp;
?int ihl; EnterFunction(11);
// 這個(gè)標(biāo)志只占一位
// 標(biāo)志設(shè)上就是已經(jīng)經(jīng)過IPVS處理了,直接返回
?if (skb->ipvs_property)
??return NF_ACCEPT; iph = skb->nh.iph;
?if (unlikely(iph->protocol == IPPROTO_ICMP)) {
// 處理可能的連接相關(guān)ICMP錯(cuò)誤信息,如地址端口不可達(dá)等
??int related, verdict = ip_vs_out_icmp(pskb, &related); if (related)
???return verdict;
??skb = *pskb;
??iph = skb->nh.iph;
?}
// 取得IPVS協(xié)議, tcp/udp/ah/esp之一
?pp = ip_vs_proto_get(iph->protocol);
?if (unlikely(!pp))
??return NF_ACCEPT; /* reassemble IP fragments */
?if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
?????? !pp->dont_defrag)) {
// 如果是碎片包進(jìn)行重組,基本不可能,因?yàn)閿?shù)據(jù)包進(jìn)入netfilter時(shí)就要進(jìn)行碎片重組
??skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
??if (!skb)
???return NF_STOLEN;
??iph = skb->nh.iph;
??*pskb = skb;
?} ihl = iph->ihl << 2; /*
? * Check if the packet belongs to an existing entry
? */
// 查找IPVS連接
?cp = pp->conn_out_get(skb, pp, iph, ihl, 0); if (unlikely(!cp)) {
// 沒找到IPVS連接,可能是請(qǐng)求方向的包經(jīng)過DNAT過來的
??if (sysctl_ip_vs_nat_icmp_send &&
????? (pp->protocol == IPPROTO_TCP ||
?????? pp->protocol == IPPROTO_UDP)) {
???__u16 _ports[2], *pptr; pptr = skb_header_pointer(skb, ihl,
??????? sizeof(_ports), _ports);
???if (pptr == NULL)
????return NF_ACCEPT;?/* Not for me */
// 用源地址,源端口來查真實(shí)服務(wù)器結(jié)構(gòu),如果是請(qǐng)求方向是找不到的
// 這種情況下數(shù)據(jù)包就不再被IPVS處理
???if (ip_vs_lookup_real_service(iph->protocol,
??????????? iph->saddr, pptr[0])) {
????/*
???? * Notify the real server: there is no
???? * existing entry if it is not RST
???? * packet or not TCP packet.
???? */
????if (iph->protocol != IPPROTO_TCP
??????? || !is_tcp_reset(skb)) {
?????icmp_send(skb,ICMP_DEST_UNREACH,
??????? ICMP_PORT_UNREACH, 0);
?????return NF_DROP;
????}
???}
??}
??IP_VS_DBG_PKT(12, pp, skb, 0,
???????? "packet continues traversal as normal");
??return NF_ACCEPT;
?}
// 找到連接,該包是服務(wù)器的回應(yīng)包
?IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
// skb數(shù)據(jù)包要求是可寫的
?if (!ip_vs_make_skb_writable(pskb, ihl))
??goto drop; /* mangle the packet */
// 修改協(xié)議部分信息,如TCP、UDP的端口
?if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
??goto drop;
// 修改源地址, 由于是服務(wù)器的返回包,只修改源地址
?skb = *pskb;
?skb->nh.iph->saddr = cp->vaddr;
?ip_send_check(skb->nh.iph); IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); // IPVS輸出統(tǒng)計(jì)
?ip_vs_out_stats(cp, skb);
?ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
?ip_vs_conn_put(cp); // 對(duì)該包設(shè)置標(biāo)志表示IPVS處理過了
?skb->ipvs_property = 1; LeaveFunction(11);
?return NF_ACCEPT; drop:
?ip_vs_conn_put(cp);
?kfree_skb(*pskb);
?return NF_STOLEN;
}
4.5.3 ip_vs_post_routing_ops
/* Before the netfilter connection tracking, exit from POST_ROUTING */
static struct nf_hook_ops ip_vs_post_routing_ops = {
?.hook??= ip_vs_post_routing,
?.owner??= THIS_MODULE,
?.pf??= PF_INET,
// POSTROUTING點(diǎn)
?.hooknum??????? = NF_IP_POST_ROUTING,
// 在源NAT之前進(jìn)行
?.priority?????? = NF_IP_PRI_NAT_SRC-1,
};
ip_vs_post_routing()这个函数对最后要发出的包进行检查,这个包是经过FORWARD链的,源地址已经被IPVS修改过了,不用再被netfilter进行修改。如果是IPVS处理过的包,直接跳出POSTROUTING点, 不再继续可能的该点的更低优先级的hook点操作,即不用进行netfilter标准的SNAT操作。
/* net/ipv4/ipvs/ip_vs_core.c */ /*
?*????? It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
?*????? chain, and is used for VS/NAT.
?*????? It detects packets for VS/NAT connections and sends the packets
?*????? immediately. This can avoid that iptable_nat mangles the packets
?*????? for VS/NAT.
?*/
static unsigned int ip_vs_post_routing(unsigned int hooknum,
?????????? struct sk_buff **pskb,
?????????? const struct net_device *in,
?????????? const struct net_device *out,
?????????? int (*okfn)(struct sk_buff *))
{
// 如果沒被IPVS處理過,繼續(xù)后續(xù)hook點(diǎn)操作
?if (!((*pskb)->ipvs_property))
??return NF_ACCEPT;
?/* The packet was sent from IPVS, exit this chain */
// NF_STOP和NF_ACCEPT的區(qū)別就是STOP就不繼續(xù)后面的低優(yōu)先級(jí)的hook_ops的操作了
?return NF_STOP;
}
4.5.4 ip_vs_forward_icmp_ops
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
?? destined for 0.0.0.0/0, which is for incoming IPVS connections */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
?.hook??= ip_vs_forward_icmp,
?.owner??= THIS_MODULE,
?.pf??= PF_INET,
// FORWARD點(diǎn)
?.hooknum??????? = NF_IP_FORWARD,
// 在ip_vs_out_ops之前進(jìn)行
?.priority?????? = 99,
}; ip_vs_forward_icmp()這個(gè)函數(shù)對(duì)轉(zhuǎn)發(fā)的ICMP包進(jìn)行處理, 處理由于服務(wù)器失效而引起的網(wǎng)絡(luò)或端口不可達(dá)的ICMP信息,其他和服務(wù)器無關(guān)的ICMP信息不處理
/* net/ipv4/ipvs/ip_vs_core.c */ /*
?*?It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
?*????? related packets destined for 0.0.0.0/0.
?*????? When fwmark-based virtual service is used, such as transparent
?*????? cache cluster, TCP packets can be marked and routed to ip_vs_in,
?*????? but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
?*????? sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
?*????? and send them to ip_vs_in_icmp.
?*/
static unsigned int
ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
???? const struct net_device *in, const struct net_device *out,
???? int (*okfn)(struct sk_buff *))
{
?int r; if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
??return NF_ACCEPT;
// 實(shí)際調(diào)用ip_vs_in_icmp()來處理數(shù)據(jù)包
?return ip_vs_in_icmp(pskb, &r, hooknum);
}
/*
?*?Handle ICMP messages in the outside-to-inside direction (incoming).
?*?Find any that might be relevant, check against existing connections,
?*?forward to the right destination host if relevant.
?*?Currently handles error types - unreachable, quench, ttl exceeded.
?*/
static int?
ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
{
?struct sk_buff *skb = *pskb;
?struct iphdr *iph;
?struct icmphdr?_icmph, *ic;
?struct iphdr?_ciph, *cih;?/* The ip header contained within the ICMP */
?struct ip_vs_conn *cp;
?struct ip_vs_protocol *pp;
?unsigned int offset, ihl, verdict; // 這個(gè)參數(shù)指示該ICMP包是否和IPVS的連接相關(guān)
?*related = 1; /* reassemble IP fragments */
?if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
// 進(jìn)行碎片重組
??skb = ip_vs_gather_frags(skb,
?????????????????????????? hooknum == NF_IP_LOCAL_IN ?
????? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
??if (!skb)
???return NF_STOLEN;
??*pskb = skb;
?} iph = skb->nh.iph;
?offset = ihl = iph->ihl * 4;
?ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
?if (ic == NULL)
??return NF_DROP; IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
??? ic->type, ntohs(icmp_id(ic)),
??? NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); /*
? * Work through seeing if this is for us.
? * These checks are supposed to be in an order that means easy
? * things are checked first to speed up processing.... however
? * this means that some packets will manage to get a long way
? * down this stack and then be rejected, but that's life.
? */
?if ((ic->type != ICMP_DEST_UNREACH) &&
???? (ic->type != ICMP_SOURCE_QUENCH) &&
???? (ic->type != ICMP_TIME_EXCEEDED)) {
// 如果不是這三種ICMP信息,則該skb與IPVS無關(guān)
??*related = 0;
??return NF_ACCEPT;
?} /* Now find the contained IP header */
?offset += sizeof(_icmph);
?cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
?if (cih == NULL)
??return NF_ACCEPT; /* The packet looks wrong, ignore */
// 找的是ICMP信息中包含的原始包中的協(xié)議,而不是ICMP
?pp = ip_vs_proto_get(cih->protocol);
?if (!pp)
??return NF_ACCEPT; /* Is the embedded protocol header present? */
// 如果是碎片不處理直接返回
?if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
?????? pp->dont_defrag))
??return NF_ACCEPT; IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); offset += cih->ihl * 4; /* The embedded headers contain source and dest in reverse order */
// 查找IPVS連接
?cp = pp->conn_in_get(skb, pp, cih, offset, 1);
?if (!cp)
??return NF_ACCEPT; // 缺省的裁定結(jié)果是丟棄包
?verdict = NF_DROP; /* Ensure the checksum is correct */
?if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
// 檢查一下IP頭的校驗(yàn)和
???? ip_vs_checksum_complete(skb, ihl)) {
??/* Failed checksum! */
??IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
???? NIPQUAD(iph->saddr));
??goto out;
?} /* do the statistics and put it back */
// 進(jìn)行輸入統(tǒng)計(jì)
?ip_vs_in_stats(cp, skb);
// 如果內(nèi)部協(xié)議是TCP/UDP,發(fā)送偏移量要包括前4個(gè)字節(jié): 源端口和目的端口
?if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
??offset += 2 * sizeof(__u16);
// 發(fā)送ICMP
?verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
?/* do not touch skb anymore */ out:
?__ip_vs_conn_put(cp); return verdict;
} /* net/ipv4/ipvs/ip_vs_xmit.c */ /*
?*?ICMP packet transmitter
?*?called by the ip_vs_in_icmp
?*/
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
??struct ip_vs_protocol *pp, int offset)
{
?struct rtable?*rt;?/* Route to the other host */
?int mtu;
?int rc; EnterFunction(10); /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
??? forwarded directly here, because there is no need to
??? translate address/port back */
?if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
// 如果不是NAT情況的IPVS連接, 即是TUNNEL或DR,直接調(diào)用連接的發(fā)送函數(shù)發(fā)送
??if (cp->packet_xmit)
???rc = cp->packet_xmit(skb, cp, pp);
??else
???rc = NF_ACCEPT;
??/* do not touch skb anymore */
??atomic_inc(&cp->in_pkts);
??goto out;
?} /*
? * mangle and send the packet here (only for VS/NAT)
? */
// 查找路由
?if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
??goto tx_error_icmp; /* MTU checking */
?mtu = dst_mtu(&rt->u.dst);
?if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
// 數(shù)據(jù)包過長超過MTU,但又是不允許分片的,發(fā)送ICMP出錯(cuò)包
??ip_rt_put(rt);
??icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
??IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
??goto tx_error;
?} /* copy-on-write the packet before mangling it */
// 讓skb可寫
?if (!ip_vs_make_skb_writable(&skb, offset))
??goto tx_error_put; // skb留出足夠的硬件頭空間
?if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
??goto tx_error_put; /* drop the old route when skb is not shared */
?dst_release(skb->dst);
?skb->dst = &rt->u.dst;
// 修改ICMP包
?ip_vs_nat_icmp(skb, pp, cp, 0); /* Another hack: avoid icmp_send in ip_fragment */
?skb->local_df = 1; // 將該包用OUTPUT點(diǎn)的hook_ops進(jìn)行處理
?IP_VS_XMIT(skb, rt); // NF_STOLEN表示該skb不用返回到正常的IP棧了
?rc = NF_STOLEN;
?goto out; tx_error_icmp:
?dst_link_failure(skb);
? tx_error:
?dev_kfree_skb(skb);
?rc = NF_STOLEN;
? out:
?LeaveFunction(10);
?return rc;
? tx_error_put:
?ip_rt_put(rt);
?goto tx_error;
}
总结
以上是生活随笔为你收集整理的ip_vs实现分析(2)的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: zk 08之:Curator之一:zk客
- 下一篇: HTML+CSS实现个人简历