Linux内核：从上向下分析网络层IP协议

Sally ·

更新时间:2024-11-14

· 598 次阅读

　　上篇博文分析到，传输层最终会调用ip_queue_xmit()函数，将发送数据的任务交给网络层。下面分析一下该函数：

　　该函数的主要函数调用关系图如下：

/* * Queues a packet to be sent， and starts the transmitter * if necessary. if free = 1 then we free the block after * transmit， otherwise we don't. If free==2 we not only * free the block but also don't assign a new ip seq number. * This routine also needs to put in the total length， * and compute the checksum */

void ip_queue_xmit(struct sock *sk， //发送数据的队列所对应的sock结构 struct device *dev，//发送该数据包的网卡设备 struct sk_buff *skb，//封装好的sk_buff结构，要发送的数据在该结构中 int free)//主要配合TCP协议使用，用于数据包的重发，UDP等协议调用是free=1 { struct iphdr *iph;//IP数据报首部指针 unsigned char *ptr;

/* Sanity check */ if (dev == NULL) { printk("IP: ip_queue_xmit dev = NULL "); return; }

IS_SKB(skb);

/* * Do some book-keeping in the packet for later */

skb->dev = dev;//进一步完整sk_buff的相应字段 skb->when = jiffies;//用于TCP协议的超时重传

/* * Find the IP header and set the length. This is bad * but once we get the skb data handling code in the * hardware will push its header sensibly and we will * set skb->ip_hdr to avoid this mess and the fixed * header length problem */

ptr = skb->data;//指针指向sk_buff中的数据部分 ptr += dev->hard_header_len;//hard_header_len为硬件首部长度，在net_init.c的函数eth_setup()函数中设置的，dev->hard_header_len = ETH_HLEN; 以太网首部长度为14 iph = (struct iphdr *)ptr;//prt已经指向IP数据包的首部 skb->ip_hdr = iph; iph->tot_len = ntohs(skb->len-dev->hard_header_len);//计算IP数据报的总长度

#ifdef CONFIG_IP_FIREWALL if(ip_fw_chk(iph， dev， ip_fw_blk_chain， ip_fw_blk_policy， 0) != 1) /* just don't send this packet */ return; #endif

/* * No reassigning numbers to fragments... */

if(free!=2) iph->id = htons(ip_id_count++); else free=1;

/* All buffers without an owner socket get freed */ if (sk == NULL) free = 1;

skb->free = free;//设置skb的free值，free=1，发送后立即释放；free=2，不但释放缓存，而且不分配新的序列号

/* * Do we need to fragment. Again this is inefficient. * We need to somehow lock the original buffer and use * bits of it. */ //数据帧中的数据部分必须小于等于MTU if(skb->len > dev->mtu + dev->hard_header_len)//发送的数据长度大于数据帧的数据部分和帧首部之和，则需要分片 { ip_fragment(sk，skb，dev，0);//对数据报分片后继续调用ip _queue_xmit()函数发送数据 IS_SKB(skb); kfree_skb(skb，FREE_WRITE); return; }

/* * Add an IP checksum */

ip_send_check(iph);//IP数据报首部检查

/* * Print the frame when debugging */

/* * More debugging. You cannot queue a packet already on a list * Spot this and moan loudly. */ if (skb->next != NULL)//说明该数据包仍然存在于某个缓存队列 { printk("ip_queue_xmit: next != NULL "); skb_unlink(skb);//将其从缓存链表中删除，否则可能导致内核错误 }

/* * If a sender wishes the packet to remain unfreed * we add it to his send queue. This arguably belongs * in the TCP level since nobody else uses it. BUT * remember IPng might change all the rules. */

if (!free)//free=0 { unsigned long flags; /* The socket now has more outstanding blocks */

sk->packets_out++;

/* Protect the list for a moment */ save_flags(flags); cli();

if (skb->link3 != NULL)//link3指向数据报道呃重发队列 { printk("ip.c: link3 != NULL "); skb->link3 = NULL; } //sk中send_tail和send_head是用户缓存的单向链表表尾和表头 if (sk->send_head == NULL) { sk->send_tail = skb; sk->send_head = skb; } else { sk->send_tail->link3 = skb;//link3指针用于数据包的连接 sk->send_tail = skb; } /* skb->link3 is NULL */

/* Interrupt restore */ restore_flags(flags); } else /* Remember who owns the buffer */ skb->sk = sk;

/* * If the indicated interface is up and running， send the packet. */ ip_statistics.IpOutRequests++; #ifdef CONFIG_IP_ACCT ip_acct_cnt(iph，dev， ip_acct_chain); #endif #ifdef CONFIG_IP_MULTICAST //这部分是IP数据报的多播处理

/* * Multicasts are looped back for other local users */ ....................................... #endif if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))//广播数据包的处理 ip_loopback(dev，skb); if (dev->flags & IFF_UP)//设备状态正常 { /* * If we have an owner use its priority setting， * otherwise use NORMAL */ //调用设备接口层函数发送数据: dev_queue_xmit()函数 if (sk != NULL) { dev_queue_xmit(skb， dev， sk->priority); } else { dev_queue_xmit(skb， dev， SOPRI_NORMAL); } } else//设备状态不正常 { ip_statistics.IpOutDiscards++; if (free) kfree_skb(skb， FREE_WRITE); } }

　　这个函数中调用ip_fragment()函数对长度过长的数据包进行了分片，这里没有对该函数做详细分析。

void ip_fragment(struct sock *sk， struct sk_buff *skb， struct device *dev， int is_frag) { struct iphdr *iph; unsigned char *raw; unsigned char *ptr; struct sk_buff *skb2; int left， mtu， hlen， len; int offset; unsigned long flags;

/* * Point into the IP datagram header. */

raw = skb->data; iph = (struct iphdr *) (raw + dev->hard_header_len);

skb->ip_hdr = iph;

/* * Setup starting values. */

hlen = (iph->ihl * sizeof(unsigned long)); left = ntohs(iph->tot_len) - hlen; /* Space per frame */ hlen += dev->hard_header_len; /* Total header size */ mtu = (dev->mtu - hlen); /* Size of data space */ ptr = (raw + hlen); /* Where to start from */

/* * Check for any "DF" flag. [DF means do not fragment] */

if (ntohs(iph->frag_off) & IP_DF) { /* * Reply giving the MTU of the failed hop. */ ip_statistics.IpFragFails++; icmp_send(skb，ICMP_DEST_UNREACH， ICMP_FRAG_NEEDED， dev->mtu， dev); return; }

/* * The protocol doesn't seem to say what to do in the case that the * frame + options doesn't fit the mtu. As it used to fall down dead * in this case we were fortunate it didn't happen */

if(mtu<8) { /* It's wrong but it's better than nothing */ icmp_send(skb，ICMP_DEST_UNREACH，ICMP_FRAG_NEEDED，dev->mtu， dev); ip_statistics.IpFragFails++; return; }

/* * Fragment the datagram. */

/* * The initial offset is 0 for a complete frame. When * fragmenting fragments it's wherever this one starts. */

if (is_frag & 2) offset = (ntohs(iph->frag_off) & 0x1fff) << 3; else offset = 0;

/* * Keep copying data until we run out. */

while(left > 0) { len = left; /* IF: it doesn't fit， use 'mtu' - the data space left */ if (len > mtu) len = mtu; /* IF: we are not sending upto and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len/=8; len*=8; } /* * Allocate buffer. */

if ((skb2 = alloc_skb(len + hlen，GFP_ATOMIC)) == NULL) { printk("IP: frag: no memory for new fragment! "); ip_statistics.IpFragFails++; return; }

/* * Set up data on packet */

skb2->arp = skb->arp; if(skb->free==0) printk("IP fragmenter: BUG free!=1 in fragmenter "); skb2->free = 1; skb2->len = len + hlen; skb2->h.raw=(char *) skb2->data; /* * Charge the memory for the fragment to any owner * it might possess */

save_flags(flags); if (sk) { cli(); sk->wmem_alloc += skb2->mem_len; skb2->sk=sk; } restore_flags(flags); skb2->raddr = skb->raddr; /* For rebuild_header - must be here */

/* * Copy the packet header into the new buffer. */

memcpy(skb2->h.raw， raw， hlen);

/* * Copy a block of the IP datagram. */ memcpy(skb2->h.raw + hlen， ptr， len); left -= len;

skb2->h.raw+=dev->hard_header_len;

/* * Fill in the new header fields. */ iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/); iph->frag_off = htons((offset >> 3)); /* * Added AC : If we are fragmenting a fragment thats not the * last fragment then keep MF on each bit */ if (left > 0 || (is_frag & 1)) iph->frag_off |= htons(IP_MF); ptr += len; offset += len;

/* * Put this fragment into the sending queue. */

ip_statistics.IpFragCreates++;

ip_queue_xmit(sk， dev， skb2， 2);//还是调用ip_queue_xmit()函数来发送分片后的数据 } ip_statistics.IpFragOKs++; }

　　网络层的发送函数调用了设备接口层（相当于网络模型中的链路层）的发送函数dev_queue_xmit()。

　　该函数的调用关系如下：

/* * Send (or queue for sending) a packet. * * IMPORTANT: When this is called to resend frames. The caller MUST * already have locked the sk_buff. Apart from that we do the * rest of the magic. */

void dev_queue_xmit(struct sk_buff *skb， struct device *dev， int pri) { unsigned long flags; int nitcount; struct packet_type *ptype; int where = 0; /* used to say if the packet should go */ /* at the front or the back of the */ /* queue - front is a retransmit try */ /* where=0 表示是刚从上层传递的新数据包；where=1 表示从硬件队列中取出的数据包*/

if (dev == NULL) { printk("dev.c: dev_queue_xmit: dev = NULL "); return; } if(pri>=0 && !skb_device_locked(skb))//锁定该skb再进行操作，避免造成内核的不一致情况 skb_device_lock(skb); /* Shove a lock on the frame */ #ifdef CONFIG_SLAVE_BALANCING save_flags(flags); cli(); if(dev->slave!=NULL && dev->slave->pkt_queue < dev->pkt_queue && (dev->slave->flags & IFF_UP)) dev=dev->slave; restore_flags(flags); #endif #ifdef CONFIG_SKB_CHECK IS_SKB(skb); #endif skb->dev = dev;

/* * This just eliminates some race conditions， but not all... */

if (skb->next != NULL) //这种条件似乎永远不能成立，因为发送数据包前，数据包已经从缓存队列摘下 {//以防内核代码有BUG /* * Make sure we haven't missed an interrupt. */ printk("dev_queue_xmit: worked around a missed interrupt "); start_bh_atomic(); dev->hard_start_xmit(NULL， dev); end_bh_atomic(); return; }

/* * Negative priority is used to flag a frame that is being pulled from the * queue front as a retransmit attempt. It therefore goes back on the queue * start on a failure. */ if (pri < 0) //优先级小于0表示是从硬件队列中取出的数据包 { pri = -pri-1; where = 1; }

if (pri >= DEV_NUMBUFFS) { printk("bad priority in dev_queue_xmit. "); pri = 1; }

/* * If the address has not been resolved. Call the device header rebuilder. * This can cover all protocols and technically not just ARP either. */ if (!skb->arp && dev->rebuild_header(skb->data， dev， skb->raddr， skb)) {//用于ARP协议，并重建MAC帧首部 return; }

save_flags(flags); cli(); if (!where) {//表示是新数据包，需要将其加入设备队列中 #ifdef CONFIG_SLAVE_BALANCING skb->in_dev_queue=1;//该数据包在设备队列 #endif skb_queue_tail(dev->buffs + pri，skb);//将发送数据包加入硬件队列 skb_device_unlock(skb); /* Buffer is on the device queue and can be freed safely */ skb = skb_dequeue(dev->buffs + pri);//从硬件队列中取出一个数据包 skb_device_lock(skb); /* New buffer needs locking down */ #ifdef CONFIG_SLAVE_BALANCING skb->in_dev_queue=0; #endif } restore_flags(flags);

/* copy outgoing packets to any sniffer packet handlers */ if(!where)//对于新的数据包，则遍历网络层协议队列，内核支持混杂模式 { for (nitcount= dev_nit， ptype = ptype_base; nitcount > 0 && ptype != NULL; ptype = ptype->next) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (ptype->type == htons(ETH_P_ALL) && (ptype->dev == dev || !ptype->dev) && ((struct sock *)ptype->data != skb->sk)) { struct sk_buff *skb2; if ((skb2 = skb_clone(skb， GFP_ATOMIC)) == NULL) break; /* * The protocol knows this has (for other paths) been taken off * and adds it back. */ skb2->len-=skb->dev->hard_header_len; ptype->func(skb2， skb->dev， ptype);//IP层函数对应func为ip_rcv()，将发送的数据回送一份给对应的网络层协议 nitcount--;//用于及时退出循环 } } } start_bh_atomic();//开始原子操作 if (dev->hard_start_xmit(skb， dev) == 0) {//调用硬件的发送函数发送数据 end_bh_atomic();//结束原子操作 /* * Packet is now solely the responsibility of the driver */ return;//到这里说明数据包成功发送 } //数据包没有成功发送，进行处理，将数据包从新加入硬件队列 end_bh_atomic();

/* * Transmission failed， put skb back into a list. Once on the list it's safe and * no longer device locked (it can be freed safely from the device queue) */ cli(); #ifdef CONFIG_SLAVE_BALANCING skb->in_dev_queue=1; dev->pkt_queue++; #endif skb_device_unlock(skb);//对SKB解锁 skb_queue_head(dev->buffs + pri，skb);//这次采用头插法插入硬件发送队列 restore_flags(flags); }

　　具体的硬件发送函数dev->hard_start_xmit的实现将在下篇博文中分析。

　　本文转自：http://blog.csdn.net/yming0221/article/details/7492423

ip ip协议 Linux

1024 个赞